# Support Vector Machine


### Import statements 

In [11]:
# Import necessary libraries
import json
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, auc, precision_recall_curve, confusion_matrix
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import confusion_matrix as cm
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

### Load data & K-fold cross validation

In [15]:
SMOTE_merged_df = pd.read_csv('/Users/dionnespaltman/Desktop/V3/train:test:val/new_merged_data.csv', sep=',')
print(SMOTE_merged_df.shape)

(170, 122)


In [14]:
with open('/Users/dionnespaltman/Desktop/V3/columns_au_12.json', 'r') as f:
    columns_au_12 = json.load(f)

print(len(columns_au_12))
# print(columns_au_12)


121


In [16]:
from sklearn.model_selection import train_test_split, StratifiedKFold

X = SMOTE_merged_df.drop('VVR_group', axis=1)  # Features (independent variables)
y = SMOTE_merged_df['VVR_group']  # Target variable (dependent variable)

# Step 1: Split data into training and holdout test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Step 2: Apply 5-fold cross-validation on the training data with stratification
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_val, y_train_val)):
    print(f"Fold {fold+1}:")
    X_train, X_val = X_train_val.iloc[train_idx], X_train_val.iloc[val_idx]
    y_train, y_val = y_train_val.iloc[train_idx], y_train_val.iloc[val_idx]
    
    # Train and validate your model on X_train, y_train, X_val, y_val


Fold 1:
Fold 2:
Fold 3:
Fold 4:
Fold 5:


### Featurizer 

In [19]:
featurizer = ColumnTransformer(transformers=[("numeric", StandardScaler(), columns_au_12)], remainder='drop')

### Train 

In [20]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# # Define features and target
# X_train = train.drop('VVR_group', axis=1)
# y_train = train['VVR_group'].values

# print(len(X_train))

# Define and train RandomForest model
model = make_pipeline(featurizer, SVC(C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, max_iter=-1, random_state=None))
model.fit(X_train, y_train)

### Hyperparameter tuning using val set - haven't run this yet!

In [22]:
from sklearn.model_selection import GridSearchCV

# # Define features and target for validation set
# X_val = val.drop('VVR_group', axis=1)
# y_val = val['VVR_group'].values

# Define the parameter grid for RandomForest
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    # 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel type
    # 'gamma': ['scale', 'auto', 0.1, 0.01],  # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    # 'degree': [2, 3, 4],  # Degree of the polynomial kernel function ('poly')
    # 'class_weight': [None, 'balanced'],  # Class weight mode
    # 'shrinking': [True, False],  # Whether to use the shrinking heuristic
    # 'probability': [True, False]  # Whether to enable probability estimates
}

# Instantiate the random forest classifier
model_tuned = SVC(random_state=0)

# Create the grid search object
# 5-fold cross validation
grid_search = GridSearchCV(estimator=model_tuned, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best parameters found: ", grid_search.best_params_)

# Use the best estimator to make predictions
best_model_tuned = grid_search.best_estimator_

# Predict on the validation set using the best estimator
pred_val_tuned = best_model_tuned.predict(X_val)

# Calculate metrics
accuracy_val_tuned = accuracy_score(y_val, pred_val_tuned)
report_val_tuned = classification_report(y_val, pred_val_tuned)

# Print results
print(f"Accuracy on Validation Data (Tuned): {accuracy_val_tuned}")
print("Classification Report (Tuned):")
print(report_val_tuned)


Best parameters found:  {'C': 100}
Accuracy on Validation Data (Tuned): 0.7037037037037037
Classification Report (Tuned):
              precision    recall  f1-score   support

           0       0.73      0.62      0.67        13
           1       0.69      0.79      0.73        14

    accuracy                           0.70        27
   macro avg       0.71      0.70      0.70        27
weighted avg       0.71      0.70      0.70        27



### Evaluate on test set 

In [23]:
# Perform hyperparameter tuning on the validation set
# (GridSearchCV or other hyperparameter tuning technique)
best_params = {}  # Example best parameters

# Train the model using the entire training set with the best parameters
best_model = make_pipeline(featurizer, SVC(**best_params))
best_model.fit(X_train, y_train)

# Evaluate the model on the test set
pred_test = best_model.predict(X_test)
accuracy_test = accuracy_score(y_test, pred_test)
report_test = classification_report(y_test, pred_test)

# Print the results
print(f"Accuracy on Test Data: {accuracy_test}")
print("Classification Report on Test Data:")
print(report_test)

Accuracy on Test Data: 0.8235294117647058
Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.87      0.76      0.81        17
           1       0.79      0.88      0.83        17

    accuracy                           0.82        34
   macro avg       0.83      0.82      0.82        34
weighted avg       0.83      0.82      0.82        34

