### 2. Machine learning model
---

In [19]:
# libraries 
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.preprocessing import StandardScaler

# datasets
train_data = pd.read_csv('../data/processed/train_data.csv')
test_data = pd.read_csv('../data/processed/test_data.csv')


def _parse_feature_column(column):
    """Parse a column of Python-literal strings into one NumPy array (one row per entry)."""
    return np.array([np.array(literal_eval(raw)) for raw in column])


# targets (Y)
y_train = train_data['outcome']
y_test = test_data['outcome']

# features (X): every entry of the 'features' column is stored as text,
# so it is parsed with literal_eval before being stacked into an array
X_train = _parse_feature_column(train_data['features'])
X_test = _parse_feature_column(test_data['features'])

# Normalize the data: the scaler is fitted on the training split only and
# the already-fitted scaler is then applied to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



***2.1. Initialize the model***

In [20]:
from sklearn.svm import SVC

# Baseline classifier: an SVC with a linear kernel and a fixed random_state,
# fitted on the scaled training features.
baseline_svc_params = {"kernel": "linear", "random_state": 42}
model = SVC(**baseline_svc_params)
model.fit(X_train, y_train)

In [21]:
# Predict the test-set labels with the classifier fitted in the previous cell.
# The bare `y_pred` on the last line makes the notebook display the array.
y_pred = model.predict(X_test)
y_pred

array([1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,

In [22]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.935

***2.2. Optimize the model***

In [28]:
from sklearn.model_selection import GridSearchCV
# Hyperparameter search over a fresh, unfitted SVC.
# The search estimator gets its own name on purpose: `model` must keep
# pointing at the fitted linear SVC from section 2.1, because the save cell
# pickles `model`. Rebinding `model` here would make a top-to-bottom run
# (Restart & Run All) save an unfitted SVC instead of the trained classifier.
search_estimator = SVC()
param_grid = {
    'C': [0.1, 1, 2, 4, 6, 10, 20],
    'kernel': ['linear', 'rbf'],
    # NOTE: per the scikit-learn SVC docs, gamma is the kernel coefficient for
    # 'rbf', 'poly' and 'sigmoid'; the linear candidates do not use it.
    'gamma': [0.01, 0.1, 0.2, 0.5, 1]}
# 3-fold cross-validation, candidates ranked by accuracy.
grid_search = GridSearchCV(estimator=search_estimator, param_grid=param_grid, scoring='accuracy', cv=3)


In [29]:
# Run the cross-validated search on the scaled training data.
grid_search.fit(X_train, y=y_train)

In [30]:
# Keep the winning hyperparameter combination and the estimator built from it.
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_

In [31]:
# Score the grid-search winner on the held-out test set.
# Note that this replaces the baseline predictions stored in `y_pred`.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Best Model Accuracy:", accuracy)

Best Model Accuracy: 0.9316666666666666


***2.3. Save the model***

In [27]:
from pickle import dump

# Persist the classifier to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises, instead of leaving
# an anonymous open() handle for the garbage collector.
# NOTE(review): `model` is expected to still be the fitted linear SVC from
# section 2.1 when this cell runs (the file name says so) - confirm on a
# fresh Restart & Run All.
with open("../models/svm_classifier_linear_42.sav", "wb") as model_file:
    dump(model, model_file)

---

## Conclusions

The baseline model (linear-kernel SVM) achieves an accuracy of 93.50% on the test set.

The grid-search optimization slightly lowered the test accuracy, to 93.17%.

The original model is considered to have a very high accuracy, and it will be saved without optimization.