In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, ShuffleSplit
from sklearn.preprocessing import StandardScaler

# Import Data

In [2]:
# Load the training table and discard the `id` column so it can never be used
# as a feature. `drop` returns a new frame, so nothing is mutated in place.
data = pd.read_csv('./data/train.csv').drop(columns=['id'])

# Select the features and the target by column NAME rather than by position:
# a reordered CSV can then no longer leak `label` into the feature matrix or
# silently drop a real feature.
X_train = np.asarray(data.drop(columns=['label']), dtype=np.double)
y_train = np.asarray(data['label'], dtype=np.double)  # already 1-D, no ravel needed
data.head()

Unnamed: 0,label,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,...,feat207,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216
0,1,7,0,3,0,2,3,0,6,0,...,3,4,2,2,0,13,0,11,1,3
1,1,0,11,0,0,10,1,0,0,4,...,0,2,0,0,2,8,1,13,0,4
2,0,9,0,3,0,1,3,0,4,0,...,48,11,2,0,0,4,0,2,0,0
3,0,0,9,3,2,25,0,4,0,0,...,1,14,1,0,0,0,3,0,17,1
4,0,0,0,0,0,2,5,0,0,0,...,3,12,0,3,0,4,0,24,4,0


In [3]:
# Per-column count of non-null values for each value of `label`
# (shows how many rows belong to each class).
data.groupby(by='label').count()

Unnamed: 0_level_0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat207,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,327,327,327,327,327,327,327,327,327,327,...,327,327,327,327,327,327,327,327,327,327
1,573,573,573,573,573,573,573,573,573,573,...,573,573,573,573,573,573,573,573,573,573


# Data preprocessing

In [4]:
# Fit the scaler on the RAW features taken from `data`, not on `X_train`
# itself. The previous form (`X_train = scaler.fit_transform(X_train)`) was not
# idempotent: running the cell a second time refit the scaler on already
# standardized values, so a later `scaler.transform(X_test)` would no longer
# apply the training-set mean/variance to the raw test features.
raw_train_features = np.asarray(data.drop(columns=['label']), dtype=np.double)

scaler = StandardScaler()
X_train = scaler.fit_transform(raw_train_features)
X_train

array([[ 1.67857588, -0.6426044 ,  0.27134247, ..., -0.09405589,
        -0.57018666, -0.29529026],
       [-0.61884494,  1.01109551, -0.70941344, ...,  0.04586196,
        -0.76376191, -0.16160755],
       [ 2.33498182, -0.6426044 ,  0.27134247, ..., -0.7236862 ,
        -0.76376191, -0.6963384 ],
       ...,
       [-0.61884494, -0.6426044 , -0.70941344, ...,  0.32569765,
        -0.76376191,  1.44258501],
       [-0.61884494, -0.34193169, -0.05557617, ...,  0.11582088,
        -0.57018666,  0.10575788],
       [ 1.67857588, -0.6426044 , -0.3824948 , ...,  0.95532796,
        -0.76376191, -0.29529026]])

# Fit final classifier

In [5]:
# Hyper-parameters of the final model, grouped by role so they are easy to
# scan. probability=True is needed for the predict_proba call used later;
# random_state is fixed for reproducibility.
svc_params = dict(
    # kernel definition
    kernel='poly', degree=3, gamma=1, coef0=0.0,
    # regularization / class handling
    C=0.0001, class_weight=None, decision_function_shape='ovr',
    # probability estimates
    probability=True, random_state=0,
    # solver settings
    shrinking=True, tol=0.001, max_iter=-1, cache_size=200, verbose=False,
)

estimator = SVC(**svc_params)
estimator.fit(X_train, y_train)

SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='poly',
  max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
  verbose=False)

# Test data estimation

In [6]:
# Load the test table; it keeps its `id` column because the submission needs it.
test = pd.read_csv('./data/test.csv')

# Build the feature matrix by dropping `id` by NAME instead of assuming it is
# the first column, and cast to float64 to match the training matrix dtype.
raw_test_features = np.asarray(test.drop(columns=['id']), dtype=np.double)

# Reuse the scaler fitted on the training data (transform only, never refit).
X_test = scaler.transform(raw_test_features)



In [7]:
# Display the test feature matrix returned by scaler.transform as a sanity check.
X_test

array([[ 0.69396695, -0.34193169, -0.05557617, ..., -0.4438505 ,
        -0.3766114 , -0.29529026],
       [ 0.36576398, -0.19159533, -0.70941344, ...,  1.02528689,
        -0.3766114 , -0.6963384 ],
       [-0.61884494, -0.6426044 , -0.70941344, ..., -0.51380943,
        -0.57018666, -0.56265569],
       ...,
       [-0.29064197, -0.49226804, -0.70941344, ...,  1.6549172 ,
        -0.76376191, -0.42897297],
       [-0.29064197, -0.19159533, -0.05557617, ...,  1.58495827,
        -0.57018666, -0.6963384 ],
       [-0.61884494, -0.34193169,  0.27134247, ...,  0.18577981,
        -0.76376191, -0.6963384 ]])

In [8]:
# Score the test set with class probabilities rather than the hard labels
# that estimator.predict would return.
class_probabilities = estimator.predict_proba(X_test)

# Keep column 1, i.e. the second entry of estimator.classes_
# (label 1 here, given that the training labels are 0 and 1).
test_predict = class_probabilities[:, 1]
print(test_predict)

[0.8696696  0.94450525 0.0468955  ... 0.90329626 0.9484737  0.97563392]


In [9]:
# Attach the predicted probabilities to the test frame, then write only the
# identifier and the prediction to the submission file (no index column).
test['label'] = test_predict
submission_columns = ['id', 'label']
test.loc[:, submission_columns].to_csv('submission.csv', sep=',', index=False)