## Imports

In [1]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.utils import compute_class_weight

from preprocessor import *
from data import *

In [2]:
# Load the dataset CSV and index the rows by their `id` column
data = pd.read_csv('data/wingman_data_proc_v5.csv').set_index('id')

## Train test split

In [3]:
# Target labels, and every remaining column as the feature matrix
y = data['subcategory_no']
X = data.drop(columns=['subcategory_no'])

In [4]:
# Identify the numerical columns
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns

# Create a StandardScaler object
scaler = RobustScaler()

# Standardize the numerical columns
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

## SMOTE

In [6]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X=X_train, y=y_train)

# NOTE(review): the model cells shown below still fit on X_train / y_train,
# not on X_smote / y_smote — confirm that is intended.

## Baseline model

In [7]:
# Baseline: an SVC with default hyperparameters, fitted on the training split
baseline_model = SVC(random_state=42)

baseline_model.fit(X=X_train, y=y_train)

In [8]:
# Score the baseline on the held-out test split
y_pred = baseline_model.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

0.5454029511918275

## SVClassifier

In [16]:
# Linear-kernel SVC to tune. `gamma` is omitted: it only affects the
# 'rbf', 'poly' and 'sigmoid' kernels, so it has no effect on a linear kernel.
svc = SVC(kernel='linear')

# Candidate values for the regularisation parameter C
param_grid = {'C': [0.00001, 0.0000001, 0.000001, 0.001]}

# Randomized search with 3-fold cross-validation. n_iter is tied to the number
# of candidates: asking for more iterations than the grid holds (previously
# 5 vs. 4) only makes scikit-learn warn and try each candidate once anyway.
random_search = RandomizedSearchCV(svc, param_distributions=param_grid,
                                   n_iter=len(param_grid['C']), cv=3, random_state=42)
random_search.fit(X_train, y_train)

# Best candidate and its mean cross-validated score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

# Evaluate the refitted best model on the test set
# NOTE(review): the recorded test accuracy equals the baseline SVC's, and every
# C tried here is very small — consider widening the grid.
accuracy = random_search.score(X_test, y_test)
print("Test Set Accuracy:", accuracy)



Best Parameters: {'C': 1e-05}
Best Score: 0.5305426554982681
Test Set Accuracy: 0.5454029511918275


In [10]:
# NOTE(review): disabled learning-curve plot, kept for reference. It refers to
# `random_forest`, which no earlier cell shown here defines — TODO confirm it
# exists before re-enabling this cell.

# from sklearn.model_selection import learning_curve

# # Compute the learning curve
# train_sizes, train_scores, test_scores = learning_curve(random_forest, X, y,
#                                                         train_sizes=np.linspace(0.1, 1.0, 10),
#                                                         cv=5, n_jobs=-1, verbose=1)

# # Calculate the mean and standard deviation of train and test scores
# train_mean = np.mean(train_scores, axis=1)
# train_std = np.std(train_scores, axis=1)
# test_mean = np.mean(test_scores, axis=1)
# test_std = np.std(test_scores, axis=1)

# plt.figure(figsize=(10, 6))
# plt.plot(train_sizes, train_mean, label='Training Score', color='blue')
# plt.plot(train_sizes, test_mean, label='Cross-validation Score', color='green')

# # Plot the standard deviation as shaded area
# plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2, color='blue')
# plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.2, color='green')

# # Add labels and title
# plt.title('RandomForest Learning Curve')
# plt.xlabel('Training Examples')
# plt.ylabel('Score')
# plt.legend(loc='best')

# # Show the plot
# plt.show()