# SVC optimization

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.preprocessing import StandardScaler
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_parallel_coordinate
import plotly.express as px
import os
import pickle

In [None]:
# Inject a CSS rule that stretches the notebook's `.container` element to 100% width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Read the pre-processed train and validation splits from their Feather files.
train_df, val_df = (
    pd.read_feather(path)
    for path in ('./data/train_processed.ftr', './data/val_processed.ftr')
)
# Column names of the training frame; later cells use all but the last as
# features and the last one as the target.
cols = train_df.columns.tolist()

## Prepare SVC model

In [None]:
# Feature matrices: every column except the last, selected by the training
# frame's column names for both splits.
train_x, val_x = (frame[cols[:-1]].to_numpy() for frame in (train_df, val_df))
# Target vectors: the last column of each split.
train_y, val_y = (frame[cols[-1]].to_numpy() for frame in (train_df, val_df))

In [None]:
class ModelTester:
    """Fit an SVC with the given hyper-parameters and score it on a validation set.

    Training and validation prediction both happen eagerly in ``__init__``;
    the ``get_*`` methods only report metrics on the cached predictions.

    Args:
        params: Mapping with the keys ``C``, ``kernel``, ``degree``, ``gamma``,
            ``coef0``, ``shrinking`` and ``tol`` that are forwarded to ``SVC``.
        train_x: Training feature matrix.
        train_y: Training targets.
        val_x: Validation feature matrix.
        val_y: Validation targets.
    """

    def __init__(
        self,
        params: dict,
        train_x: np.ndarray,
        train_y: np.ndarray,
        val_x: np.ndarray,
        val_y: np.ndarray
    ):
        self.params = params
        self.train_x = train_x
        self.train_y = train_y
        self.val_x = val_x
        self.val_y = val_y

        # Order matters: predictions need the fitted model.
        self.model = self.train_model()
        self.y_pred_bin = self.get_bin_predictions()

    def train_model(self):
        """Build an SVC from ``self.params``, fit it on the training data and return it."""
        model = SVC(
            C=self.params['C'],
            kernel=self.params['kernel'],
            degree=self.params['degree'],
            gamma=self.params['gamma'],
            coef0=self.params['coef0'],
            shrinking=self.params['shrinking'],
            tol=self.params['tol']
        )
        model.fit(self.train_x, self.train_y)
        return model

    def get_bin_predictions(self):
        """Predict the validation set and return the labels as a list of 0/1 ints."""
        # Use the validation data handed to this instance, not a module-level
        # `val_x` that may be missing or may have been rebound since.
        y_pred = self.model.predict(self.val_x)
        # Collapse truthy/falsy labels (e.g. booleans) to plain 1/0.
        return [1 if y else 0 for y in y_pred]

    def get_accuracy(self):
        """Return validation accuracy rounded to three decimals."""
        return round(accuracy_score(self.val_y, self.y_pred_bin), 3)

    def get_class_report(self):
        """Return scikit-learn's text classification report for the validation set."""
        return classification_report(self.val_y, self.y_pred_bin)

## Optuna hyperparameter search

In [None]:
def objective(trial):
    """Optuna objective: validation error of an SVC with trial-sampled parameters.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ``1 - accuracy`` on the validation split (lower is better).
    """
    # 0 - Sample the hyper-parameters
    C = trial.suggest_float('C', 0.001, 100.)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'sigmoid'])
    # NOTE(review): scikit-learn documents `degree` as used only by the 'poly'
    # kernel, which is not in the search space above; it is still sampled so the
    # study keeps the same parameter set.
    degree = trial.suggest_int("degree", 2, 10)
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
    coef0 = trial.suggest_float('coef0', 0.001, 10.)
    # Sample from {0, 1} so True and False are equally likely; the previous
    # range 0..2 mapped two of three values to True.
    shrinking = bool(trial.suggest_int('shrinking', 0, 1))
    tol = trial.suggest_float('tol', 0.001, 1)

    params = {
        'C': C,
        'kernel': kernel,
        'degree': degree,
        'gamma': gamma,
        'coef0': coef0,
        'shrinking': shrinking,
        'tol': tol,
    }

    # 1 - Train on the module-level train split and score on the validation split
    model_tester = ModelTester(params, train_x, train_y, val_x, val_y)
    acc = model_tester.get_accuracy()
    error = 1 - acc

    return error

In [None]:
# Fresh study; the objective returns an error rate, so it is minimised
# ('minimize' is also Optuna's default direction).
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=1000)

### Show results

In [None]:
# Plot the objective value of the study's trials over time.
plot_optimization_history(study)

In [None]:
# Plot the estimated importance of each sampled hyper-parameter.
plot_param_importances(study)

In [None]:
# Slice plots restricted to three of the sampled hyper-parameters.
key_params = ['kernel', 'coef0', 'degree']
plot_slice(study, params = key_params).show()

In [None]:
# Display the parameter set of the best trial.
study.best_params

In [None]:
# Hyper-parameters hard-coded for the final model (presumably copied from an
# earlier `study.best_params` output — TODO confirm they match the latest study).
best_params = {
    'C': 55.269088337763066,
    'kernel': 'rbf',
    'degree': 10,
    'gamma': 'auto',
    'coef0': 8.363572073854568,
    # SVC documents `shrinking` as a bool and recent scikit-learn versions
    # validate parameter types, so store True rather than the raw integer 1.
    'shrinking': True,
    'tol': 0.5608852633036548,
}

## Create model with best params

In [None]:
# Rebuild the train/validation arrays from the full column list:
# all but the last column are features, the last column is the target.
feature_cols, target_col = cols[:-1], cols[-1]
train_x, train_y = train_df[feature_cols].to_numpy(), train_df[target_col].to_numpy()
val_x, val_y = val_df[feature_cols].to_numpy(), val_df[target_col].to_numpy()

In [None]:
# Unfitted SVC configured from the hard-coded best parameters; only the
# seven tuned keys are forwarded.
params = best_params
model = SVC(**{
    name: params[name]
    for name in ('C', 'kernel', 'degree', 'gamma', 'coef0', 'shrinking', 'tol')
})

In [None]:
# Stack train and validation rows into one feature frame and one target
# series for cross-validation.
X = pd.concat([frame[cols[:-1]] for frame in (train_df, val_df)])
y = pd.concat([frame[cols[-1]] for frame in (train_df, val_df)])

In [None]:
# 10-fold cross-validation of the configured model on the combined data.
cv_scores = cross_val_score(estimator=model, X=X, y=y, cv=10)
print(f'Mean cv (k=10) accuracy score = {round(np.mean(cv_scores), 3)}')

## Feature selection

In [None]:
# Per-feature flag: True when the feature is treated as categorical/discrete,
# False when it is treated as continuous. Key order is kept as declared.
is_cat_col = {
    **dict.fromkeys(
        ['is_g734s', 'CryoSleep', 'VIP', 'Europa', 'Mars', 'PSO J318.5-22', 'TRAPPIST-1e'],
        True,
    ),
    **dict.fromkeys(
        ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
        False,
    ),
    'segment': True,
}

In [None]:
# Build the discrete-feature mask by column name rather than relying on the
# dict's insertion order happening to match the column order of X; a feature
# missing from `is_cat_col` now fails loudly with a KeyError instead of being
# silently mislabelled.
discrete_mask = [is_cat_col[name] for name in cols[:-1]]

# Mutual information between each feature and the target.
# NOTE(review): no random_state is passed, so the MI estimates may vary between runs.
feature_mi = mutual_info_classif(X, y, discrete_features=discrete_mask)
# ANOVA F-test; returns a (F-statistics, p-values) pair.
feature_f = f_classif(X, y)

In [None]:
# One row per feature with its mutual-information score and F statistic.
scaler = StandardScaler()
feature_df = pd.DataFrame({'feature': cols[:-1], 'mi': feature_mi, 'f': feature_f[0]})

# Standardise both scores so they can be averaged on a common scale.
z_scores = scaler.fit_transform(feature_df[['mi', 'f']])
feature_df[['mi_z', 'f_z']] = z_scores
feature_df['feature_quality'] = feature_df[['mi_z', 'f_z']].mean(axis=1)

# Rank features from best to worst combined score.
feature_df = feature_df.sort_values('feature_quality', ascending=False)
best_features = feature_df['feature'].tolist()
feature_df

In [None]:
# Bar chart of the combined quality score for each feature.
fig = px.bar(feature_df, x='feature_quality', y='feature')
fig.show()

In [None]:
# Forward selection over the ranked features: add one feature at a time (best
# first) and record the 10-fold CV scores of the tuned SVC on each growing subset.
temp_cols = []
cv_list = []

model = SVC(
    C=params['C'],
    kernel=params['kernel'],
    degree=params['degree'],
    gamma=params['gamma'],
    coef0=params['coef0'],
    shrinking=params['shrinking'],
    tol=params['tol']
)

for feature in best_features:
    temp_cols.append(feature)
    # Use a dedicated name for the subset matrix so the full feature frame `X`
    # is not overwritten; rebinding it would corrupt any re-run of the earlier
    # cells that cross-validate or score features on `X`.
    X_subset = pd.concat([train_df[temp_cols], val_df[temp_cols]]).to_numpy()
    cv_score = cross_val_score(model, X_subset, y, cv=10)
    cv_list.append(cv_score)

In [None]:
# Mean CV accuracy (rounded to 3 decimals) for each feature-subset size.
means = [round(np.mean(scores), 3) for scores in cv_list]

# Index of the first (i.e. smallest) subset that reaches the best mean score;
# stays 0 when no scores were collected.
i_features = 0
if means:
    best_mean = max(means)  # computed once instead of on every iteration
    for i, mean_score in enumerate(means):
        if mean_score == best_mean:
            i_features = i
            print(i_features)
            break

In [None]:
# Display the selected features: the ranked list up to and including index i_features.
best_features[:i_features+1]

## Make final model

In [None]:
# Restrict both splits to the selected top-ranked features.
train_x, val_x = (
    frame[best_features[:i_features+1]].to_numpy() for frame in (train_df, val_df)
)

In [None]:
# Final estimator: the tuned parameters plus probability=True so that
# predict_proba is available after fitting.
params = best_params
model = SVC(
    **{
        key: params[key]
        for key in ('C', 'kernel', 'degree', 'gamma', 'coef0', 'shrinking', 'tol')
    },
    probability=True,
)

In [None]:
# Fit on the training split, then get class probabilities for both splits.
# NOTE(review): pred_train is predicted on the same rows the model was fitted on.
model.fit(train_x, train_y)
pred_val, pred_train = (model.predict_proba(matrix) for matrix in (val_x, train_x))

In [None]:
# Persist the fitted model together with the exact feature list it was trained
# on. The model was fitted on best_features[:i_features+1], so store that
# subset rather than the full ranked list, which would mislead any consumer
# selecting columns from it.
model_dict = {
    'model': model,
    'features': best_features[:i_features+1]
}

# Make sure the output directory exists before opening the file for writing.
os.makedirs('models', exist_ok=True)
filepath = os.path.join('models', 'svc.pickle')
with open(filepath, 'wb') as file:
    pickle.dump(model_dict, file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Two-column frame of training probabilities: the second predict_proba column
# is kept as 'svc', the first ('0') is discarded, and the integer target is added as 'y'.
train_proba_df = (
    pd.DataFrame(pred_train, columns=['0', 'svc'])
    .assign(y=train_y.astype(int))
    .drop(columns='0')
)
train_proba_df

In [None]:
# Same layout for the validation split: name the two probability columns,
# attach the integer target, then keep only 'svc' and 'y'.
val_proba_df = pd.DataFrame(pred_val)
val_proba_df.columns = ['0', 'svc']
val_proba_df = val_proba_df.assign(y=val_y.astype(int))
val_proba_df = val_proba_df.drop(columns='0')
val_proba_df

In [None]:
# Load the shared ensemble tables and add (or replace) this model's 'svc'
# probability column; values are aligned on the row index.
train_proba_final = pd.read_csv('./data/ensemble_train_df.csv').assign(
    svc=train_proba_df['svc']
)
val_proba_final = pd.read_csv('./data/ensemble_val_df.csv').assign(
    svc=val_proba_df['svc']
)

In [None]:
# Write the updated ensemble tables back in place. The index is not written:
# these files are read with a plain pd.read_csv (no index_col), so a saved
# index would come back as an extra 'Unnamed: 0' column on every round trip.
train_proba_final.to_csv('./data/ensemble_train_df.csv', index=False)
val_proba_final.to_csv('./data/ensemble_val_df.csv', index=False)

In [None]:
# Pickle the fitted estimator. A context manager guarantees the file handle is
# flushed and closed, which the bare open() call did not.
# NOTE(review): this is the same path an earlier cell writes the
# {'model', 'features'} dict to, so after this cell the file holds only the
# bare estimator — confirm which format downstream code expects.
filepath = os.path.join('models', 'svc.pickle')
with open(filepath, 'wb') as file:
    pickle.dump(model, file)