# SVC optimization

In [2]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.preprocessing import StandardScaler
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_parallel_coordinate
import plotly.express as px
import os
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
train_df = pd.read_feather('./data/train_processed.ftr')
val_df = pd.read_feather('./data/val_processed.ftr')
cols = list(train_df.columns)

## Prepare SVC model

In [6]:
train_x = train_df[cols[:-1]].to_numpy()
train_y = train_df[cols[-1]].to_numpy()
val_x = val_df[cols[:-1]].to_numpy()
val_y = val_df[cols[-1]].to_numpy()

In [7]:
class ModelTester:
    
    def __init__(
        self,
        params: dict,
        train_x: np.array,
        train_y: np.array,
        val_x: np.array,
        val_y: np.array
    ):
        self.params = params
        self.train_x = train_x
        self.train_y = train_y
        self.val_x = val_x
        self.val_y = val_y
        
        self.model = self.train_model()
        self.y_pred_bin = self.get_bin_predictions()
            
    def train_model(self):
        # 1 - Run model
        model = SVC(
            C=self.params['C'],
            kernel=self.params['kernel'],
            degree=self.params['degree'],
            gamma=self.params['gamma'],
            coef0=self.params['coef0'],
            shrinking=self.params['shrinking'],
            tol=self.params['tol']
        )
        model.fit(self.train_x, self.train_y)
        return model
    
    def get_bin_predictions(self):
        # 2 - Get predictions
        y_pred = self.model.predict(val_x)
        y_pred_bin = [1 if y else 0 for y in y_pred]
        
        return y_pred_bin
    
    def get_accuracy(self):
        return round(accuracy_score(self.val_y, self.y_pred_bin), 3)
    
    def get_class_report(self):
        return classification_report(self.val_y, self.y_pred_bin)    

## Optuna research

In [8]:
def objective(trial):
    # 0 - Prepare params
    C = trial.suggest_float('C', 0.001, 100.)

    
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'sigmoid'])
    
    degree = trial.suggest_int("degree", 2, 10)
    gamma= trial.suggest_categorical('gamma', ['scale', 'auto'])
    coef0 = trial.suggest_float('coef0', 0.001, 10.)
    shrinking = bool(trial.suggest_int('shrinking', 0, 2))
    tol = trial.suggest_float('tol', 0.001, 1)
    
    params = {
        'C': C,
        'kernel': kernel,
        'degree': degree,
        'gamma': gamma,
        'coef0': coef0,
        'shrinking': shrinking,
        'tol': tol,
    }
    
    # 1 - Model
    model_tester = ModelTester(params, train_x, train_y, val_x, val_y)
    acc = model_tester.get_accuracy()
    error = 1 - acc
    
    return error

In [12]:
study = optuna.create_study()
study.optimize(objective, n_trials=1000)

[32m[I 2022-07-02 09:50:13,799][0m A new study created in memory with name: no-name-84e2076b-d183-4961-a480-4a952adba4e6[0m
[32m[I 2022-07-02 09:50:21,536][0m Trial 0 finished with value: 0.21799999999999997 and parameters: {'C': 38.98226574904255, 'kernel': 'rbf', 'degree': 4, 'gamma': 'scale', 'coef0': 3.9127878415083077, 'shrinking': 0, 'tol': 0.22422958597357504}. Best is trial 0 with value: 0.21799999999999997.[0m
[32m[I 2022-07-02 09:50:30,790][0m Trial 1 finished with value: 0.52 and parameters: {'C': 26.328655904605405, 'kernel': 'sigmoid', 'degree': 7, 'gamma': 'scale', 'coef0': 8.834139889216967, 'shrinking': 2, 'tol': 0.5171553753042297}. Best is trial 0 with value: 0.21799999999999997.[0m
[32m[I 2022-07-02 09:50:35,290][0m Trial 2 finished with value: 0.22899999999999998 and parameters: {'C': 2.41697043504757, 'kernel': 'linear', 'degree': 10, 'gamma': 'scale', 'coef0': 6.555643199015921, 'shrinking': 1, 'tol': 0.9149308282896379}. Best is trial 0 with value: 0.2

KeyboardInterrupt: 

### Show results

In [13]:
plot_optimization_history(study)

In [14]:
plot_param_importances(study)

In [15]:
key_params = ['kernel', 'coef0', 'degree']
plot_slice(study, params = key_params).show()

In [16]:
study.best_params

{'C': 55.269088337763066,
 'kernel': 'rbf',
 'degree': 10,
 'gamma': 'auto',
 'coef0': 8.363572073854568,
 'shrinking': 1,
 'tol': 0.5608852633036548}

In [9]:
best_params = {
    'C': 55.269088337763066,
    'kernel': 'rbf',
    'degree': 10,
    'gamma': 'auto',
    'coef0': 8.363572073854568,
    'shrinking': 1,
    'tol': 0.5608852633036548
 }

## Create model with best params

In [10]:
train_x = train_df[cols[:-1]].to_numpy()
train_y = train_df[cols[-1]].to_numpy()
val_x = val_df[cols[:-1]].to_numpy()
val_y = val_df[cols[-1]].to_numpy()

In [11]:
params = best_params
model = SVC(
    C=params['C'],
    kernel=params['kernel'],
    degree=params['degree'],
    gamma=params['gamma'],
    coef0=params['coef0'],
    shrinking=params['shrinking'],
    tol=params['tol']
)

In [12]:
X = pd.concat([train_df[cols[:-1]], val_df[cols[:-1]]])
y = pd.concat([train_df[cols[-1]], val_df[cols[-1]]])

In [16]:
cv_scores = cross_val_score(model, X, y, cv=10)
print(f'Mean cv (k=10) accuracy score = {round(np.mean(cv_scores), 3)}')

Mean cv (k=10) accuracy score = 0.791


## Feature selection

In [17]:
is_cat_col = {
    'is_g734s': True,
    'CryoSleep': True,
    'VIP': True,
    'Europa': True,
    'Mars': True,
    'PSO J318.5-22': True,
    'TRAPPIST-1e': True,
    'Age': False,
    'RoomService': False,
    'FoodCourt': False,
    'ShoppingMall': False,
    'Spa': False,
    'VRDeck': False,
    'segment': True
 }

In [18]:
feature_mi = mutual_info_classif(X, y, discrete_features=list(is_cat_col.values()))
feature_f = f_classif(X, y)

In [19]:
feature_df = pd.DataFrame({
    'feature': cols[:-1],
    'mi': feature_mi,
    'f': feature_f[0]
})

scaler = StandardScaler()
feature_df[['mi_z', 'f_z']] = scaler.fit_transform(feature_df[['mi', 'f']])
feature_df['feature_quality'] = feature_df[['mi_z', 'f_z']].mean(axis=1)

feature_df.sort_values('feature_quality', ascending=False, inplace=True)
best_features = feature_df['feature'].to_list()
feature_df

Unnamed: 0,feature,mi,f,mi_z,f_z,feature_quality
1,CryoSleep,0.1072555,2228.334929,1.758079,2.196114,1.977097
13,segment,0.1066814,1800.139509,1.743288,1.61608,1.679684
11,Spa,0.07617253,1285.480254,0.957209,0.918922,0.938065
8,RoomService,0.07764053,1239.221137,0.995033,0.85626,0.925646
12,VRDeck,0.06039245,1101.496913,0.550625,0.669698,0.610162
10,ShoppingMall,0.0450129,267.552878,0.154362,-0.459963,-0.152801
9,FoodCourt,0.04011436,151.085174,0.028148,-0.61773,-0.294791
3,Europa,0.01587072,280.812746,-0.596504,-0.442001,-0.519253
7,Age,0.01161459,48.491713,-0.706166,-0.756703,-0.731435
6,TRAPPIST-1e,0.004653416,81.384065,-0.885525,-0.712147,-0.798836


In [20]:
fig = px.bar(feature_df, x='feature_quality', y='feature')
fig.show()

In [23]:
temp_cols = []
cv_list = []

model = SVC(
    C=params['C'],
    kernel=params['kernel'],
    degree=params['degree'],
    gamma=params['gamma'],
    coef0=params['coef0'],
    shrinking=params['shrinking'],
    tol=params['tol']
)

for feature in best_features:
    temp_cols.append(feature)
    X = pd.concat([train_df[temp_cols], val_df[temp_cols]]).to_numpy()
    cv_score = cross_val_score(model, X, y, cv=10)
    cv_list.append(cv_score)

In [24]:
means = []
for scores in cv_list:
    means.append(round(np.mean(scores), 3))

i_features = 0
for i in range(len(means)):
    if means[i] == max(means):
        i_features = i
        print(i_features)
        break

8


In [25]:
best_features[:i_features+1]

['CryoSleep',
 'segment',
 'Spa',
 'RoomService',
 'VRDeck',
 'ShoppingMall',
 'FoodCourt',
 'Europa',
 'Age']

## Make final model

In [26]:
train_x = train_df[best_features[:i_features+1]].to_numpy()
val_x = val_df[best_features[:i_features+1]].to_numpy()

In [27]:
params = best_params
model = SVC(
    C=params['C'],
    kernel=params['kernel'],
    degree=params['degree'],
    gamma=params['gamma'],
    coef0=params['coef0'],
    shrinking=params['shrinking'],
    tol=params['tol'],
    probability=True
)

In [28]:
model.fit(train_x, train_y)
pred_val = model.predict_proba(val_x)
pred_train = model.predict_proba(train_x)

In [29]:
train_proba_df = pd.DataFrame(pred_train)
train_proba_df.columns = ['0', 'svc']
train_proba_df['y'] = train_y.astype(int)
train_proba_df.drop(columns='0', inplace=True)
train_proba_df

Unnamed: 0,svc,y
0,0.789785,0
1,0.751823,0
2,0.121129,0
3,0.500000,1
4,0.154317,0
...,...,...
6996,0.355281,0
6997,0.186197,0
6998,0.839701,1
6999,0.238495,0


In [30]:
val_proba_df = pd.DataFrame(pred_val)
val_proba_df.columns = ['0', 'svc']
val_proba_df['y'] = val_y.astype(int)
val_proba_df.drop(columns='0', inplace=True)
val_proba_df

Unnamed: 0,svc,y
0,0.300525,1
1,0.742213,0
2,0.545838,1
3,0.141895,0
4,0.773805,1
...,...,...
1687,0.144930,0
1688,0.811434,1
1689,0.765861,1
1690,0.500000,1


In [31]:
train_proba_final = pd.read_csv('./data/ensemble_train_df.csv')
train_proba_final['svc'] = train_proba_df['svc']
val_proba_final = pd.read_csv('./data/ensemble_val_df.csv')
val_proba_final['svc'] = val_proba_df['svc']

In [32]:
train_proba_final.to_csv('./data/ensemble_train_df.csv')
val_proba_final.to_csv('./data/ensemble_val_df.csv')

In [None]:
filepath = os.path.join('models', 'svc.pickle')
pickle.dump(model, open(filepath, 'wb'))