In [52]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC

In [53]:
featuresNames = ['MeanIntegratedProfile',
                'StdIntegratedProfile',
                'ExcessKurtosisIntegratedProfile',
                'SkewnessIntegratedProfile',
                'MeanDMSNRCurve',
                'StdDMSNRCurve',
                'ExcessKurtosisDMSNRCurve',
                'SkewnessDMSNRCurve',
                'Class']

In [54]:
data = pd.read_csv('../HTRU_2.csv',
                   header = None, 
                   names = featuresNames)

In [55]:
x = data.drop('Class', axis = 1, inplace = False)
y = data['Class']

In [56]:
normalized_x = scale(x)

In [57]:
x_train, x_test, y_train, y_test = train_test_split(normalized_x,
                                                    y,
                                                    test_size = 0.1,
                                                    random_state = 0,
                                                    stratify = y)

In [58]:
grid_param = {  
    'C': [0.1, 1, 100],
    'gamma': [0.1, 0.01, 0.001]
}

grid_search = GridSearchCV(estimator = SVC(kernel = 'rbf'),
                           param_grid = grid_param,
                           scoring = 'f1',
                           cv = 5)

grid_search.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.1, 1, 100], 'gamma': [0.1, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [59]:
best_parameters = grid_search.best_params_  
print(best_parameters)

{'C': 100, 'gamma': 0.1}


In [60]:
best_result = grid_search.best_score_  
print(best_result)  

0.8836991688913368


In [62]:
print('# Tuning hyper-parameters for f1')
print()

print('Best parameters set found on development set:')
print()
print(grid_search.best_params_)
print()
print('Grid scores on development set:')
print()
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print('%0.4f (+/-%0.04f) for %r'
          % (mean, std * 2, params))

# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'C': 100, 'gamma': 0.1}

Grid scores on development set:

0.8590 (+/-0.0319) for {'C': 0.1, 'gamma': 0.1}
0.8279 (+/-0.0291) for {'C': 0.1, 'gamma': 0.01}
0.7135 (+/-0.0264) for {'C': 0.1, 'gamma': 0.001}
0.8698 (+/-0.0269) for {'C': 1, 'gamma': 0.1}
0.8575 (+/-0.0290) for {'C': 1, 'gamma': 0.01}
0.8282 (+/-0.0316) for {'C': 1, 'gamma': 0.001}
0.8837 (+/-0.0247) for {'C': 100, 'gamma': 0.1}
0.8758 (+/-0.0303) for {'C': 100, 'gamma': 0.01}
0.8734 (+/-0.0259) for {'C': 100, 'gamma': 0.001}


In [63]:
feature_importance_indexes = [2, 3, 5, 6, 0, 4, 7, 1]
best_model = None
best_features = None
best_score = 0

for n_features in range(len(feature_importance_indexes) + 1, 1, -1):
    x_train_features = x_train[:, feature_importance_indexes[0:n_features]]
    
    grid_search = GridSearchCV(estimator = SVC(kernel = 'rbf'),
                           param_grid = grid_param,
                           scoring = 'f1',
                           cv = 5)

    grid_search.fit(x_train_features, y_train)
    score_val = grid_search.best_score_  
    
    if score_val > best_score:
        best_score = score_val
        best_features = feature_importance_indexes[0:n_features]
        best_model = grid_search

In [64]:
print('# Tuning hyper-parameters for f1')
print()

print('Best parameters set found on development set:')
print()
print(best_features)
print()
print('Features used on development set:')
print()
print(best_model.best_params_)
print()
print('Grid scores on development set:')
print()
means = best_model.cv_results_['mean_test_score']
stds = best_model.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, best_model.cv_results_['params']):
    print('%0.4f (+/-%0.04f) for %r'
          % (mean, std * 2, params))

# Tuning hyper-parameters for f1

Best parameters set found on development set:

[2, 3, 5, 6, 0, 4, 7, 1]

Features used on development set:

{'C': 100, 'gamma': 0.1}

Grid scores on development set:

0.8590 (+/-0.0319) for {'C': 0.1, 'gamma': 0.1}
0.8279 (+/-0.0291) for {'C': 0.1, 'gamma': 0.01}
0.7135 (+/-0.0264) for {'C': 0.1, 'gamma': 0.001}
0.8698 (+/-0.0269) for {'C': 1, 'gamma': 0.1}
0.8575 (+/-0.0290) for {'C': 1, 'gamma': 0.01}
0.8282 (+/-0.0316) for {'C': 1, 'gamma': 0.001}
0.8837 (+/-0.0247) for {'C': 100, 'gamma': 0.1}
0.8758 (+/-0.0303) for {'C': 100, 'gamma': 0.01}
0.8734 (+/-0.0259) for {'C': 100, 'gamma': 0.001}
