In [13]:
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

# Import Data

In [14]:
data = pd.read_csv('./data/train.csv')
data.drop('id', axis = 1, inplace = True)
X_train = np.asarray(data[data.columns[range(1, data.shape[1])]], dtype = np.double)
y_train = np.asarray(data[['label']], dtype = np.double).ravel()
data.head()

Unnamed: 0,label,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,...,feat207,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216
0,1,7,0,3,0,2,3,0,6,0,...,3,4,2,2,0,13,0,11,1,3
1,1,0,11,0,0,10,1,0,0,4,...,0,2,0,0,2,8,1,13,0,4
2,0,9,0,3,0,1,3,0,4,0,...,48,11,2,0,0,4,0,2,0,0
3,0,0,9,3,2,25,0,4,0,0,...,1,14,1,0,0,0,3,0,17,1
4,0,0,0,0,0,2,5,0,0,0,...,3,12,0,3,0,4,0,24,4,0


In [15]:
data.groupby(['label']).count()

Unnamed: 0_level_0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat207,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,327,327,327,327,327,327,327,327,327,327,...,327,327,327,327,327,327,327,327,327,327
1,573,573,573,573,573,573,573,573,573,573,...,573,573,573,573,573,573,573,573,573,573


# Data preprocessing

In [16]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_train

array([[ 1.67857588, -0.6426044 ,  0.27134247, ..., -0.09405589,
        -0.57018666, -0.29529026],
       [-0.61884494,  1.01109551, -0.70941344, ...,  0.04586196,
        -0.76376191, -0.16160755],
       [ 2.33498182, -0.6426044 ,  0.27134247, ..., -0.7236862 ,
        -0.76376191, -0.6963384 ],
       ...,
       [-0.61884494, -0.6426044 , -0.70941344, ...,  0.32569765,
        -0.76376191,  1.44258501],
       [-0.61884494, -0.34193169, -0.05557617, ...,  0.11582088,
        -0.57018666,  0.10575788],
       [ 1.67857588, -0.6426044 , -0.3824948 , ...,  0.95532796,
        -0.76376191, -0.29529026]])

In [17]:
# Convert feature matrix into DataFrame
df = pd.DataFrame(X_train)

# View the data frame
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,1.678576,-0.642604,0.271342,-0.670364,-0.255059,-0.183106,-0.517856,0.278361,-0.575394,0.217512,...,-0.224131,-0.326387,0.252271,0.123142,-0.520068,0.400412,-0.426712,-0.094056,-0.570187,-0.295290
1,-0.618845,1.011096,-0.709413,-0.670364,1.180770,-0.566352,-0.517856,-0.842555,0.568404,-0.580859,...,-0.749438,-0.534277,-0.505806,-0.394745,0.673965,-0.079187,0.029393,0.045862,-0.763762,-0.161608
2,2.334982,-0.642604,0.271342,-0.670364,-0.434538,-0.183106,-0.517856,-0.095278,-0.575394,0.377186,...,7.655471,0.401227,0.252271,-0.394745,-0.520068,-0.462866,-0.426712,-0.723686,-0.763762,-0.696338
3,-0.618845,0.710423,0.271342,1.229881,3.872949,-0.757975,1.730976,-0.842555,-0.575394,-0.421185,...,-0.574335,0.713062,-0.126767,-0.394745,-0.520068,-0.846546,0.941603,-0.863604,2.527017,-0.562656
4,-0.618845,-0.642604,-0.709413,-0.670364,-0.255059,0.200140,-0.517856,-0.842555,-0.575394,1.335230,...,-0.224131,0.505172,-0.505806,0.382086,-0.520068,-0.462866,-0.426712,0.815410,0.010539,-0.696338
5,0.037561,1.161432,-0.709413,0.279758,1.180770,-0.374729,-0.517856,-0.842555,2.570051,-0.421185,...,-0.749438,-0.534277,-0.505806,-0.135802,0.673965,-0.654706,-0.426712,-0.653727,-0.570187,1.442585
6,-0.618845,-0.492268,-0.709413,0.279758,-0.434538,0.966632,0.044352,-0.655736,-0.575394,0.377186,...,0.126074,0.089393,-0.505806,0.382086,-0.520068,-0.462866,-0.426712,-0.233974,0.204114,-0.428973
7,-0.290642,-0.191595,-0.709413,-0.670364,-0.434538,-0.374729,-0.517856,0.278361,-0.003495,-0.580859,...,-0.224131,-0.742166,-0.505806,-0.135802,-0.520068,1.647369,-0.426712,2.774260,-0.763762,0.373123
8,3.647794,-0.642604,-0.709413,2.180004,-0.614016,-0.374729,0.606560,-0.655736,-0.575394,1.015882,...,1.176687,-0.430332,0.631310,3.230465,0.076949,-0.462866,-0.426712,-0.443851,-0.570187,-0.562656
9,0.365764,-0.041259,1.252098,-0.670364,0.283377,-0.183106,2.293184,-0.842555,-0.003495,-0.261511,...,-0.399233,-0.014552,-0.126767,-0.394745,0.076949,-0.462866,0.485498,-0.513809,0.978415,-0.562656


In [18]:
# Create correlation matrix
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]

In [19]:
# Drop features 
df.drop(df.columns[to_drop], axis=1, inplace=True)

In [20]:
X_train = df.as_matrix()
print(X_train.shape)

(900, 204)


# Testing different models

### SGDClassifier

In [None]:
best_params = {
    'loss': 'squared_hinge', 
    'penalty': 'elasticnet',
    'alpha': 1e-4,
}

In [None]:
clf = SGDClassifier(max_iter=1000, random_state=0, **best_params)

In [None]:
params = {
#     'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 
#              'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
#     'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2],
}

In [None]:
grid_search = GridSearchCV(clf, 
                           params, 
                           scoring='roc_auc', 
                           n_jobs=-2, 
                           cv=5, 
                           verbose=1, 
                           return_train_score=False)
grid_search.fit(X_train, y_train)

In [None]:
print(grid_search.best_score_)
print(grid_search.best_params_)
grid_search.best_estimator_

In [None]:
pd.DataFrame(grid_search.cv_results_)

*Лучший классификатор:* ~0.933, 0.90

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='perceptron', max_iter=5000,
       n_iter=None, n_jobs=1, penalty='elasticnet', power_t=0.5,
       random_state=0, shuffle=True, tol=None, verbose=0, warm_start=False)

### SVC

In [115]:
best_params = {
    'kernel': 'rbf', 
    'C': 0.0001, 
    'gamma': 1,
}

In [116]:
clf = SVC(random_state=0, **best_params)

In [117]:
params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': np.geomspace(1e-3, 1e3, 10),
    'C': np.geomspace(1e-3, 1e3, 10),
#     'gamma': np.linspace(0.001, 2, 30),
#     'class_weight': [None, 'balanced'],
}

In [118]:
grid_search = GridSearchCV(clf, 
                           params, 
                           scoring='roc_auc', 
                           n_jobs=-2, 
                           cv=5, 
                           verbose=1, 
                           return_train_score=False)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


[Parallel(n_jobs=-2)]: Done  58 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-2)]: Done 358 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-2)]: Done 858 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-2)]: Done 1558 tasks      | elapsed:   43.0s
[Parallel(n_jobs=-2)]: Done 2000 out of 2000 | elapsed:   55.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-2,
       param_grid={'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': array([1.00000e-03, 4.64159e-03, 2.15443e-02, 1.00000e-01, 4.64159e-01,
       2.15443e+00, 1.00000e+01, 4.64159e+01, 2.15443e+02, 1.00000e+03]), 'C': array([1.00000e-03, 4.64159e-03, 2.15443e-02, 1.00000e-01, 4.64159e-01,
       2.15443e+00, 1.00000e+01, 4.64159e+01, 2.15443e+02, 1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='roc_auc', verbose=1)

In [119]:
print(grid_search.best_score_)
print(grid_search.best_params_)
grid_search.best_estimator_

0.9814463896134378
{'C': 10.0, 'gamma': 0.021544346900318832, 'kernel': 'rbf'}


SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.021544346900318832,
  kernel='rbf', max_iter=-1, probability=False, random_state=0,
  shrinking=True, tol=0.001, verbose=False)

In [120]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_C,param_gamma,param_kernel,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
0,0.089298,0.017509,0.905703,0.001,0.001,linear,"{'C': 0.001, 'gamma': 0.001, 'kernel': 'linear'}",241,0.916733,0.949539,0.925351,0.873009,0.863158,0.003080,0.000535,0.032569
1,0.101155,0.023144,0.914828,0.001,0.001,poly,"{'C': 0.001, 'gamma': 0.001, 'kernel': 'poly'}",216,0.906324,0.929513,0.924281,0.886505,0.927395,0.003650,0.000589,0.016332
2,0.117436,0.026064,0.908518,0.001,0.001,rbf,"{'C': 0.001, 'gamma': 0.001, 'kernel': 'rbf'}",236,0.911462,0.942161,0.939666,0.867206,0.881511,0.003462,0.001112,0.030131
3,0.137039,0.034278,0.887522,0.001,0.001,sigmoid,"{'C': 0.001, 'gamma': 0.001, 'kernel': 'sigmoid'}",307,0.896838,0.935573,0.910368,0.853981,0.840081,0.019523,0.007220,0.035462
4,0.082640,0.020739,0.905703,0.001,0.00464159,linear,"{'C': 0.001, 'gamma': 0.004641588833612777, 'k...",241,0.916733,0.949539,0.925351,0.873009,0.863158,0.006465,0.004305,0.032569
5,0.164444,0.037537,0.930901,0.001,0.00464159,poly,"{'C': 0.001, 'gamma': 0.004641588833612777, 'k...",125,0.929117,0.948090,0.930569,0.899730,0.946829,0.022105,0.001208,0.017429
6,0.184971,0.039315,0.921717,0.001,0.00464159,rbf,"{'C': 0.001, 'gamma': 0.004641588833612777, 'k...",153,0.916469,0.945850,0.956254,0.882456,0.907152,0.008947,0.001670,0.026646
7,0.175840,0.039910,0.878900,0.001,0.00464159,sigmoid,"{'C': 0.001, 'gamma': 0.004641588833612777, 'k...",313,0.885507,0.930435,0.902876,0.847233,0.827665,0.000560,0.000407,0.037140
8,0.118858,0.026546,0.905703,0.001,0.0215443,linear,"{'C': 0.001, 'gamma': 0.021544346900318832, 'k...",241,0.916733,0.949539,0.925351,0.873009,0.863158,0.002703,0.000490,0.032569
9,0.171133,0.037244,0.942895,0.001,0.0215443,poly,"{'C': 0.001, 'gamma': 0.021544346900318832, 'k...",119,0.945982,0.957971,0.952776,0.910391,0.947099,0.011377,0.002557,0.016757


*Лучший классификатор*: ~0.972, 

SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='poly',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

### LinearSVC

In [None]:
best_params = { 
    'dual': False,
    'loss': 'squared_hinge',
    'penalty': 'l1',
    'C': 0.06,
}

In [None]:
clf = LinearSVC(**best_params, random_state=0)

In [None]:
params = {
#     'penalty': ['l1', 'l2'],
#     'loss': ['hinge', 'squared_hinge'],
#     'dual': [True, False],
    'C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
}

In [None]:
grid_search = GridSearchCV(clf, 
                           params, 
                           scoring='roc_auc', 
                           n_jobs=-2, 
                           cv=5, 
                           verbose=1, 
                           return_train_score=False)
grid_search.fit(X_train, y_train)

In [None]:
print(grid_search.best_score_)
print(grid_search.best_params_)
grid_search.best_estimator_

In [None]:
pd.DataFrame(grid_search.cv_results_)

Лучший классификатор: ~0.926,

LinearSVC(C=0.1, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=0, tol=0.0001,
     verbose=0)

### LogisticRegression

In [None]:
best_params = {
    'penalty': 'l2', 
    'C': 0.2,
    'max_iter': 500,
}

In [None]:
clf = LogisticRegression(**best_params, random_state=0)

In [None]:
params = {
#     'penalty': ['l1', 'l2'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#     'dual': [True, False],
    'C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
}

In [None]:
grid_search = GridSearchCV(clf, 
                           params, 
                           scoring='roc_auc', 
                           n_jobs=-2, 
                           cv=5, 
                           verbose=1, 
                           return_train_score=False)
grid_search.fit(X_train, y_train)

In [None]:
print(grid_search.best_score_)
print(grid_search.best_params_)
grid_search.best_estimator_

In [None]:
pd.DataFrame(grid_search.cv_results_)

Лучший классификатор: ~0.930,

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='saga', tol=0.0001,
          verbose=0, warm_start=False)

# Fit final classifier

In [None]:
estimator = SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='poly',
  max_iter=-1, probability=True, random_state=0, shrinking=True,
  tol=0.001, verbose=False)
estimator.fit(X_train, y_train)

# Test data estimation

In [None]:
test = pd.read_csv('./data/test.csv')
X_test = np.asarray(test[test.columns[range(1, test.shape[1])]])
X_test = scaler.transform(X_test)

In [None]:
X_test

In [None]:
# test_predict = estimator.predict(X_test)
test_predict = estimator.predict_proba(X_test)[:, 1]
print(test_predict)

In [None]:
test['label'] = test_predict
test[['id', 'label']].to_csv('submit2.csv', sep=',', index=False)