## Imports

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.utils import compute_class_weight


from preprocessor import *
from data import *

In [3]:
## import data when available
data = pd.read_csv('data/wingman_data_proc_v5.csv')
data.set_index('id', inplace=True)

In [4]:
data.shape

(5871, 58)

In [5]:
X = data.drop('subcategory_no', axis=1)
y = data['subcategory_no']

## Train test split

In [6]:
# mapping y values
value_map = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6}

# Map new values onto the array using NumPy's 'vectorize' function
y_mapped = np.vectorize(value_map.get)(y)

print("Original Array:", y)
print("Mapped Array:", y_mapped)


Original Array: id
20080115X000511    6
20080116X000631    3
20080122X000871    5
20080220X002121    6
20080207X001531    4
                  ..
20190107X337411    2
20190121X128521    7
20190108X549451    4
20190112X112141    6
20190111X427051    6
Name: subcategory_no, Length: 5871, dtype: int64
Mapped Array: [5 2 4 ... 3 5 5]


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.3, random_state=1)

## SMOTE

In [11]:
from imblearn.over_sampling import SMOTE

# Assuming your dataset is stored in X and y variables
# X should contain the features, and y should contain the corresponding labels

# Instantiate the SMOTE object
smote = SMOTE(random_state=42)

# Apply SMOTE to the dataset
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [10]:
def calculate_class_weights(y):
    """
    Calculates the balanced class weights for a multiclass classification problem.
    Parameters:
    - y: array-like, shape (n_samples,)
        The target variable containing class labels.
    Returns:
    - class_weights: dict
        A dictionary containing the class weights for each class label.
    """
    class_weights = compute_class_weight(
                                        class_weight = "balanced",
                                        classes = np.unique(y),
                                        y = y
                                    )
    return dict(zip(np.unique(y), class_weights))

class_weights = calculate_class_weights(y)

## Baseline model

In [11]:
baseline_model = RandomForestClassifier(n_estimators=100, 
                                        random_state=1, n_jobs=-1, 
                                        class_weight=class_weights)

baseline_model.fit(X_train, y_train)

In [12]:
y_pred = baseline_model.predict(X_test)

accuracy_score(y_test, y_pred)

0.5391600454029511

## Model building

### Grid seach 1

In [14]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2500, num = 10)]
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 200, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20, 25, 30]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8, 10, 15, 20, 25, 30]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [15]:
model_random = RandomizedSearchCV(estimator = RandomForestClassifier(), 
                                  param_distributions = random_grid, 
                                  n_iter = 10, cv = 5, random_state=1, 
                                  n_jobs = -1, verbose=3)

model_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END bootstrap=True, max_depth=178, max_features=sqrt, min_samples_leaf=6, min_samples_split=30, n_estimators=633;, score=0.534 total time=   5.6s
[CV 2/5] END bootstrap=True, max_depth=178, max_features=sqrt, min_samples_leaf=6, min_samples_split=30, n_estimators=633;, score=0.535 total time=   5.7s
[CV 2/5] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=8, min_samples_split=25, n_estimators=900;, score=0.533 total time=   7.9s
[CV 4/5] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=8, min_samples_split=25, n_estimators=900;, score=0.529 total time=   7.9s
[CV 3/5] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=8, min_samples_split=25, n_estimators=900;, score=0.528 total time=   7.9s
[CV 1/5] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=8, min_samples_split=25, n_estimators=900;, score=0.532 total time=   7.9s
[

In [16]:
model_random.best_params_

{'n_estimators': 2500,
 'min_samples_split': 10,
 'min_samples_leaf': 10,
 'max_features': 'sqrt',
 'max_depth': 48,
 'bootstrap': False}

In [17]:
model_random.best_score_

0.5339479775124396

In [25]:
param_grid = {'n_estimators': [2000, 2500, 3000],
 'min_samples_split': [9, 10, 11],
 'min_samples_leaf': [9, 9.5, 10, 10.5, 11],
 'max_features': ['sqrt'],
 'max_depth':[47, 48, 49],
 'bootstrap': [False]}

grid_search = GridSearchCV(estimator = RandomForestClassifier(), 
                           param_grid = param_grid,
                           cv = 3, n_jobs=-1,
                           verbose = 3)

In [26]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 135 candidates, totalling 405 fits


[CV 2/3] END bootstrap=False, max_depth=47, max_features=sqrt, min_samples_leaf=9, min_samples_split=10, n_estimators=2000;, score=0.534 total time=  20.2s
[CV 1/3] END bootstrap=False, max_depth=47, max_features=sqrt, min_samples_leaf=9, min_samples_split=10, n_estimators=2000;, score=0.531 total time=  20.2s
[CV 1/3] END bootstrap=False, max_depth=47, max_features=sqrt, min_samples_leaf=9, min_samples_split=9, n_estimators=2000;, score=0.532 total time=  20.2s
[CV 2/3] END bootstrap=False, max_depth=47, max_features=sqrt, min_samples_leaf=9, min_samples_split=9, n_estimators=2000;, score=0.533 total time=  20.3s
[CV 3/3] END bootstrap=False, max_depth=47, max_features=sqrt, min_samples_leaf=9, min_samples_split=10, n_estimators=2000;, score=0.533 total time=  20.6s
[CV 3/3] END bootstrap=False, max_depth=47, max_features=sqrt, min_samples_leaf=9, min_samples_split=9, n_estimators=2000;, score=0.533 total time=  20.6s
[CV 1/3] END bootstrap=False, max_depth=47, max_features=sqrt, min_

KeyboardInterrupt: 

## XGBoost

In [9]:
import xgboost as xgb
from xgboost import XGBClassifier

In [13]:
xgb = XGBClassifier(random_state=42, n_jobs=-1, objective='multi:softmax').fit(X_smote, y_smote)

y_pred = xgb.predict(X_test)

accuracy_score(y_test, y_pred)

0.4982973893303065

In [14]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Assuming your dataset is stored in X_train and y_train variables
# X_train should contain the training features, and y_train should contain the corresponding labels

# Instantiate the XGBClassifier
xgb = XGBClassifier(random_state=42)

# Define the parameter grid for the grid search
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [3, 4, 5],           # Maximum depth of each tree
    'learning_rate': [0.1, 0.01, 0.001]  # Learning rate
}

# Instantiate the GridSearchCV object
grid_search = GridSearchCV(xgb, param_grid, cv=3, scoring='accuracy')

# Perform grid search on the training data
grid_search.fit(X_smote, y_smote)

# Print the best hyperparameters found
print("Best Hyperparameters: ", grid_search.best_params_)

# Print the best accuracy score found during the grid search
print("Best Accuracy Score: ", grid_search.best_score_)