In [116]:
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, ShuffleSplit
from sklearn.preprocessing import StandardScaler

# Import Data

In [2]:
data = pd.read_csv('./data/train.csv')
data.drop('id', axis = 1, inplace = True)
X_train = np.asarray(data[data.columns[range(1, data.shape[1])]], dtype = np.double)
y_train = np.asarray(data[['label']], dtype = np.double).ravel()
data.head()

Unnamed: 0,label,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,...,feat207,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216
0,1,7,0,3,0,2,3,0,6,0,...,3,4,2,2,0,13,0,11,1,3
1,1,0,11,0,0,10,1,0,0,4,...,0,2,0,0,2,8,1,13,0,4
2,0,9,0,3,0,1,3,0,4,0,...,48,11,2,0,0,4,0,2,0,0
3,0,0,9,3,2,25,0,4,0,0,...,1,14,1,0,0,0,3,0,17,1
4,0,0,0,0,0,2,5,0,0,0,...,3,12,0,3,0,4,0,24,4,0


In [3]:
data.groupby(['label']).count()

Unnamed: 0_level_0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat207,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,327,327,327,327,327,327,327,327,327,327,...,327,327,327,327,327,327,327,327,327,327
1,573,573,573,573,573,573,573,573,573,573,...,573,573,573,573,573,573,573,573,573,573


# Data preprocessing

In [4]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_train

array([[ 1.67857588, -0.6426044 ,  0.27134247, ..., -0.09405589,
        -0.57018666, -0.29529026],
       [-0.61884494,  1.01109551, -0.70941344, ...,  0.04586196,
        -0.76376191, -0.16160755],
       [ 2.33498182, -0.6426044 ,  0.27134247, ..., -0.7236862 ,
        -0.76376191, -0.6963384 ],
       ...,
       [-0.61884494, -0.6426044 , -0.70941344, ...,  0.32569765,
        -0.76376191,  1.44258501],
       [-0.61884494, -0.34193169, -0.05557617, ...,  0.11582088,
        -0.57018666,  0.10575788],
       [ 1.67857588, -0.6426044 , -0.3824948 , ...,  0.95532796,
        -0.76376191, -0.29529026]])

Лучший классификатор: ~0.930,

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='saga', tol=0.0001,
          verbose=0, warm_start=False)

### Pipeline

In [157]:
best_params = {
    'kernel': 'poly', 
    'C': 0.1, 
    'gamma': 1,
}

In [163]:
clf = Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(C=1, penalty="l1", dual=False, random_state=0))),
  ('classification', SVC(random_state=0, **best_params))
])

In [164]:
params = {
    'classification__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'classification__C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
}

In [165]:
grid_search = GridSearchCV(clf, 
                           params, 
                           scoring='roc_auc', 
                           n_jobs=-2, 
                           cv=5, 
                           verbose=1, 
                           return_train_score=False)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-2)]: Done 140 out of 140 | elapsed:   19.2s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('feature_selection', SelectFromModel(estimator=LinearSVC(C=1, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=0, tol=0.0001,
     verbose=0),
        norm_order=1, prefit=Fals...ly',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-2,
       param_grid={'classification__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'classification__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='roc_auc', verbose=1)

In [166]:
print(grid_search.best_score_)
print(grid_search.best_params_)
grid_search.best_estimator_

0.966972249040899
{'classification__C': 0.0001, 'classification__kernel': 'poly'}


Pipeline(memory=None,
     steps=[('feature_selection', SelectFromModel(estimator=LinearSVC(C=1, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=0, tol=0.0001,
     verbose=0),
        norm_order=1, prefit=Fals...ly',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))])

In [167]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_classification__C,param_classification__kernel,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
0,0.708379,0.026415,0.887354,0.0001,linear,"{'classification__C': 0.0001, 'classification_...",20,0.895257,0.940184,0.909164,0.853306,0.838057,0.1671,0.002928,0.037188
1,0.915569,0.021038,0.966972,0.0001,poly,"{'classification__C': 0.0001, 'classification_...",1,0.963241,0.977997,0.967224,0.953036,0.973279,0.063773,0.001776,0.00859
2,0.941885,0.032538,0.87148,0.0001,rbf,"{'classification__C': 0.0001, 'classification_...",21,0.843478,0.889723,0.881739,0.853036,0.889474,0.154568,0.002591,0.019435
3,0.946769,0.033252,0.858135,0.0001,sigmoid,"{'classification__C': 0.0001, 'classification_...",23,0.860079,0.90527,0.884281,0.8278,0.812551,0.158448,0.002832,0.03436
4,0.904604,0.021794,0.903488,0.001,linear,"{'classification__C': 0.001, 'classification__...",13,0.915415,0.947036,0.919732,0.865452,0.869096,0.157014,0.001322,0.031397
5,1.027612,0.019846,0.966972,0.001,poly,"{'classification__C': 0.001, 'classification__...",1,0.963241,0.977997,0.967224,0.953036,0.973279,0.13522,0.001116,0.00859
6,1.029029,0.036724,0.896839,0.001,rbf,"{'classification__C': 0.001, 'classification__...",17,0.867391,0.909289,0.90087,0.899798,0.907018,0.030986,0.002813,0.015204
7,0.922882,0.03226,0.85925,0.001,sigmoid,"{'classification__C': 0.001, 'classification__...",22,0.863768,0.906324,0.883746,0.827935,0.813765,0.144018,0.000951,0.034297
8,0.80943,0.015294,0.920945,0.01,linear,"{'classification__C': 0.01, 'classification__k...",8,0.94058,0.945718,0.932977,0.890958,0.893927,0.176139,0.001186,0.023536
9,1.012052,0.020596,0.966972,0.01,poly,"{'classification__C': 0.01, 'classification__k...",1,0.963241,0.977997,0.967224,0.953036,0.973279,0.031802,0.001207,0.00859


# Fit final classifier

In [32]:
estimator = SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='poly',
  max_iter=-1, probability=True, random_state=0, shrinking=True,
  tol=0.001, verbose=False)
estimator.fit(X_train, y_train)

SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='poly',
  max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
  verbose=False)

# Test data estimation

In [33]:
test = pd.read_csv('./data/test.csv')
X_test = np.asarray(test[test.columns[range(1, test.shape[1])]])
X_test = scaler.transform(X_test)



In [34]:
X_test

array([[ 0.69396695, -0.34193169, -0.05557617, ..., -0.4438505 ,
        -0.3766114 , -0.29529026],
       [ 0.36576398, -0.19159533, -0.70941344, ...,  1.02528689,
        -0.3766114 , -0.6963384 ],
       [-0.61884494, -0.6426044 , -0.70941344, ..., -0.51380943,
        -0.57018666, -0.56265569],
       ...,
       [-0.29064197, -0.49226804, -0.70941344, ...,  1.6549172 ,
        -0.76376191, -0.42897297],
       [-0.29064197, -0.19159533, -0.05557617, ...,  1.58495827,
        -0.57018666, -0.6963384 ],
       [-0.61884494, -0.34193169,  0.27134247, ...,  0.18577981,
        -0.76376191, -0.6963384 ]])

In [35]:
# test_predict = estimator.predict(X_test)
test_predict = estimator.predict_proba(X_test)[:, 1]
print(test_predict)

[0.8696696  0.94450525 0.0468955  ... 0.90329626 0.9484737  0.97563392]


In [36]:
test['label'] = test_predict
test[['id', 'label']].to_csv('submit2.csv', sep=',', index=False)