In [2]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from mlxtend.plotting import plot_decision_regions

In [3]:
# Load Data
cs_training = pd.read_csv('../cs-training.csv', index_col = 0)
cs_test = pd.read_csv('../cs-test.csv', index_col = 0)

# Column 0 is the label, the remaining columns are features (same positional
# layout the split below relies on). The test file must expose the same
# feature columns so training statistics can be applied to it by name.
feature_cols = cs_training.columns[1:]
assert list(cs_test.columns[1:]) == list(feature_cols), "train/test feature columns differ"


def _winsorize_column(col, limits=(0.03, 0.03)):
    """Return `col` with its lower/upper tails clipped by `winsorize`, keeping the index."""
    return pd.Series(np.asarray(winsorize(col, limits=list(limits))), index=col.index, name=col.name)


# Winsorize and fill nan
# Only the features are imputed and winsorized; the label column is left
# untouched (the original per-column apply also winsorized the label).
train_means = cs_training[feature_cols].mean()
train_features = cs_training[feature_cols].fillna(train_means).apply(_winsorize_column, axis = 0)

# After winsorizing, each training column's min/max are exactly the clipping
# bounds. Re-use the training means and bounds on the test set so that both
# sets go through the same preprocessing instead of test-specific statistics.
lower_bounds = train_features.min()
upper_bounds = train_features.max()
test_features = (cs_test[feature_cols]
                 .fillna(train_means)
                 .clip(lower = lower_bounds, upper = upper_bounds, axis = 1))

cs_training = pd.concat([cs_training.iloc[:, [0]], train_features], axis = 1)
cs_test = pd.concat([cs_test.iloc[:, [0]], test_features], axis = 1)

# split dataset
# NOTE(review): imputation means and winsor bounds are still computed on all
# training rows before this split, so the validation rows influence them.
X_train, X_valid, y_train, y_valid = train_test_split(
    cs_training.iloc[:, 1:].values,
    cs_training.iloc[:, 0].values,
    test_size = 0.3,
    random_state = 0,
)
X_test = cs_test.iloc[:, 1:].values

In [4]:
# Classification fitting
# Set up the base estimator: a polynomial-kernel SVC. The C and gamma given
# here are placeholders; the grid search built from `param_grid` varies both.
clf = svm.SVC(
    kernel='poly',
    C=1,
    gamma='auto',
    decision_function_shape='ovr',
    max_iter=500,  # hard cap on solver iterations
)
 

In [10]:
# Hyperparameters
# Seven evenly spaced candidates for each of C and gamma (7 x 7 combinations).
c_candidates = np.linspace(1e-5, 1, 7).tolist()
gamma_candidates = np.linspace(1e-2, 100, 7)
param_grid = {'C': c_candidates, 'gamma': gamma_candidates}

# Pipelines
# Standardize -> project onto 3 principal components -> 5-fold grid search
# over the SVC. The scaler and PCA are fitted once on all of X_train before
# the search runs, so every CV fold sees the same fitted transforms.
svc_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=8)
pipeline_steps = [
    ('sc', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('grid', svc_search),
]
num_pipeline = Pipeline(pipeline_steps)
num_pipeline.fit(X_train, y_train)



Pipeline(steps=[('sc', StandardScaler()), ('pca', PCA(n_components=3)),
                ('grid',
                 GridSearchCV(cv=5,
                              estimator=SVC(C=1, gamma='auto', kernel='poly',
                                            max_iter=500),
                              n_jobs=8,
                              param_grid={'C': [1e-05, 0.16667500000000002,
                                                0.33334, 0.500005, 0.66667,
                                                0.833335, 1.0],
                                          'gamma': array([1.0000e-02, 1.6675e+01, 3.3340e+01, 5.0005e+01, 6.6670e+01,
       8.3335e+01, 1.0000e+02])}))])

In [11]:
# Get the optimal SVM and PCA
# Pull the fitted steps out of the pipeline by name: the refitted best SVC
# found by the grid search, and the fitted PCA transformer.
fitted_steps = num_pipeline.named_steps
optimal_SVM = fitted_steps['grid'].best_estimator_
pca = fitted_steps['pca']
optimal_SVM

SVC(C=0.66667, gamma=83.335, kernel='poly', max_iter=500)

In [12]:
## Prediction quality on the validation split, then predictions for the test set
# Predict through the whole fitted pipeline so X_valid / X_test go through
# StandardScaler -> PCA -> best SVC, exactly as the training data did.
# Calling pca.transform() on the raw features skipped the scaling step that
# the PCA was fitted after, feeding the SVC wrongly scaled components.
valid_result = num_pipeline.predict(X_valid)
# Fraction of validation rows whose predicted label matches the true label.
# This is accuracy, despite the `precision` name kept for compatibility.
precision = np.mean(valid_result == y_valid)
print('The precision in validation dataset:{:.2f}'.format(precision))
test_result = num_pipeline.predict(X_test)

The precision in validation dataset:0.93


In [8]:
# Show the labels predicted for the test set (long arrays are abbreviated with "..." when printed).
print(test_result)

[0 0 0 ... 0 0 0]
