In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [6]:
data = np.load('./data/data_pca_50_target.npz')
data.files

['arr_0', 'arr_1']

In [7]:
data.allow_pickle = True

In [8]:
X = data['arr_0']
y = data['arr_1']

In [10]:
X

array([[ 0.88065947,  0.25918267, -0.24465702, ...,  0.80859294,
         0.18304974,  1.05614412],
       [-0.61810211, -1.55446747, -0.18449749, ...,  1.328722  ,
        -1.03080867,  1.03053604],
       [-0.78581955, -0.30316201, -0.36871177, ..., -0.56145963,
        -0.93308008, -1.64208298],
       ...,
       [ 1.33932382,  0.57551743,  1.2625425 , ...,  1.84002435,
        -1.40857452, -1.1745881 ],
       [-1.24077215, -0.43844844,  0.41424204, ..., -0.32340776,
        -0.63698337,  0.61143816],
       [ 1.24910431,  0.92800403, -0.07788848, ...,  0.03238445,
         0.3056728 , -0.50657038]])

In [11]:
y

array(['female', 'female', 'female', ..., 'male', 'male', 'male'],
      dtype=object)

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
print(x_train.shape, x_test.shape)

(3456, 50) (865, 50)


## Training machine learning model

In [14]:
model_svc = SVC(probability=True)


param_grid = {'C':[0.5,1,10,20,30,50],
             'kernel': ['rbf','poly'],
             'gamma':[0.1,0.05,0.01,0.001,0.002,0.005],
             'coef0':[0,1]}

In [15]:
model_grid = GridSearchCV(model_svc,
                         param_grid = param_grid,
                         scoring = 'accuracy',
                         cv= 3,
                         verbose=2)

In [16]:
model_grid.fit(x_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   2.5s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   2.2s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   2.4s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.7s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   2.0s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   2.0s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   2.2s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   2.1s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   2.0s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   1.5s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   1.5s
[CV] END ............C=0.5, coef0=0, gamma=0.0

  _data = np.array(data, dtype=dtype, copy=copy,


In [17]:
model_grid.best_params_

{'C': 1, 'coef0': 0, 'gamma': 0.05, 'kernel': 'rbf'}

In [18]:
model_final = model_grid.best_estimator_

In [19]:
model_final.get_params()

{'C': 1,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 0.05,
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [20]:
y_pred = model_final.predict(x_test)

In [21]:
y_pred

array(['male', 'female', 'female', 'male', 'female', 'female', 'female',
       'male', 'male', 'female', 'female', 'female', 'female', 'male',
       'female', 'female', 'female', 'male', 'female', 'female', 'female',
       'male', 'male', 'male', 'female', 'female', 'female', 'female',
       'male', 'male', 'female', 'male', 'male', 'male', 'female',
       'female', 'female', 'female', 'female', 'male', 'female', 'male',
       'male', 'female', 'female', 'female', 'male', 'male', 'female',
       'male', 'female', 'female', 'male', 'male', 'female', 'female',
       'male', 'male', 'male', 'female', 'female', 'female', 'female',
       'male', 'female', 'male', 'female', 'male', 'male', 'male',
       'female', 'female', 'female', 'female', 'female', 'male', 'female',
       'male', 'female', 'female', 'female', 'female', 'female', 'male',
       'female', 'female', 'female', 'male', 'female', 'female', 'male',
       'male', 'female', 'female', 'female', 'male', 'female', 'femal

### Classification Report

In [22]:
cr = metrics.classification_report(y_test, y_pred, output_dict=True)

In [23]:
pd.DataFrame(cr).T

Unnamed: 0,precision,recall,f1-score,support
female,0.783366,0.84728,0.81407,478.0
male,0.79023,0.710594,0.748299,387.0
accuracy,0.786127,0.786127,0.786127,0.786127
macro avg,0.786798,0.778937,0.781185,865.0
weighted avg,0.786437,0.786127,0.784644,865.0


***Kappa Score***

In [24]:
 metrics.cohen_kappa_score(y_test, y_pred)

0.5632779601717142

**Area Under Curve (AUC)**

In [26]:
metrics.roc_auc_score(np.where(y_test=="male",1,0),
                     np.where(y_pred=="male",1,0))

0.7789373249867557

In [27]:
import pickle

In [31]:
with open('./model/model_svm.pickle', 'wb') as file:
    pickle.dump(model_final, file)