In [1]:
import pandas as pd
import numpy as np

from os.path import join as path_join
from os import cpu_count

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import \
    (train_test_split, GridSearchCV, StratifiedKFold, cross_val_score)
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

### Roc_auc_score on
 - Validation: 0.7245855337239318
 - Pseudo_test: 0.6623403883882553
 - leaderbord: 0.66429222

In [2]:
CSV_DIR = r'../../data/Modulbank'

train = pd.read_csv(path_join(CSV_DIR, 'new_train.csv'))
test = pd.read_csv(path_join(CSV_DIR, 'new_test.csv'))

In [3]:
test.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,336,337,338,339,340,341,342,343,344,345
0,0,,1,0,0,1,0,0,0.136364,0,...,0.192984,0,1,0,0,0.222222,1,1,1,1
1,1,,1,0,0,1,0,0,0.181818,0,...,0.19569,0,1,0,0,0.0,1,1,1,0
2,2,,1,0,0,0,0,0,0.090909,0,...,0.192984,0,1,0,0,0.222222,1,1,1,0
3,3,,1,0,0,1,0,0,0.090909,0,...,0.19569,0,1,0,0,0.0,1,1,1,0
4,4,,1,0,0,1,0,0,0.090909,0,...,0.289893,0,0,1,0,0.0,1,1,1,1


In [4]:
X_train, Y_train = train.drop(columns=['0']), train['0']
X_test, Y_test = test.drop(columns=['0', 'Unnamed: 0']), test[['Unnamed: 0', '0']]

In [5]:
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

In [6]:
n_jobs = max(cpu_count()-1, 1)

lr = LogisticRegression(class_weight='balanced')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

params = {
    'C': [0.0001, 0.005, 0.001, 0.01, 1, 5, 10, 20, 30, 50]
}

gs = GridSearchCV(
    estimator=lr,
    param_grid=params,
    cv=skf,
    scoring='roc_auc',
    n_jobs=n_jobs,
)

gs.fit(X=x_train, y=y_train)

best_score = gs.best_score_
best_model = gs.best_estimator_

print(best_score)

0.7245855337239318


In [7]:
lr = LogisticRegression(class_weight='balanced', C=25)

scores_lr = cross_val_score(
    estimator=lr,
    X=x_train,
    y=y_train,
    scoring='roc_auc',
    cv=skf,
    n_jobs=n_jobs
).mean()
print('LR scoring: {:.5f}'.format(scores_lr))

LR scoring: 0.72238


In [8]:
best_model

LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [9]:
prediction = best_model.predict_proba(x_val)

print(roc_auc_score(y_val, prediction))

ValueError: bad input shape (10065, 2)

In [None]:
Y_test['0'] = best_model.predict_proba(X_test)

In [None]:
Y_test.head()

In [None]:
Y_test['0'].value_counts()

In [None]:
Y_test.to_csv(path_join(CSV_DIR, 'submission_LogisticRegression.csv'), index=False)