In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [None]:
# Read the training split and the hold-out (test) split from their parquet files.
training_set, testing_set = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "ml_data_train_holdout/train_set.parquet",
        "ml_data_train_holdout/holdout_set.parquet",
    )
)

In [None]:
# Preview the first five rows of the training data as a sanity check on the load.
training_set.head()

### Pre-processing Steps

In [None]:
# A row can carry two or more labels; explode() emits one row per label,
# so afterwards every row has exactly one label.
training_set_exploded = training_set.explode(column='labels')
testing_set_exploded = testing_set.explode(column='labels')

In [None]:
# Show the (rows, columns) shape of the exploded training set, i.e. the size
# before duplicate rows are dropped.
training_set_exploded.shape

In [None]:
# Discard rows that exactly repeat an earlier row (all columns are compared);
# the first occurrence of each row is the one that is kept.
testing_set_exploded = testing_set_exploded.drop_duplicates(keep='first')
training_set_exploded = training_set_exploded.drop_duplicates(keep='first')

In [None]:
# Shape after drop_duplicates(): a smaller row count than the shape shown
# before the drop means duplicate rows were present and have been removed.
training_set_exploded.shape

In [None]:
# Keep only the rows whose label is one of the classes we want to analyse.
# (This filters rows by label value; no columns are removed.)
# The class list is defined once so that the training and hold-out filters
# cannot drift apart when the list is edited.
TARGET_LABELS = ["X1", "20-0", "1-2", "2-0", "23-2"]

mask = training_set_exploded["labels"].isin(TARGET_LABELS)
filtered_training_set = training_set_exploded[mask]

test_mask = testing_set_exploded["labels"].isin(TARGET_LABELS)
filtered_testing_set = testing_set_exploded[test_mask]

In [None]:
# Count the missing values in each feature column and in the label column.
null_x, null_y, null_z, null_labels = (
    filtered_training_set[column_name].isna().sum()
    for column_name in ("x", "y", "z", "labels")
)

null_x, null_y, null_z, null_labels

In [None]:
# x, y and z report the same number of nulls; look at the rows where x is
# missing to see whether the three columns are missing together (a pattern).
x_null_mask = filtered_training_set['x'].isna()
filtered_training_set.loc[x_null_mask]

In [None]:
# Drop the rows that are missing a feature (x, y, z) or a label.
# The check is restricted to the columns the model actually uses, so a null in
# any other column the frame may contain does not discard an otherwise usable
# row from the training or the hold-out set.
MODEL_COLUMNS = ['x', 'y', 'z', 'labels']
filtered_training_set = filtered_training_set.dropna(subset=MODEL_COLUMNS)
filtered_testing_set = filtered_testing_set.dropna(subset=MODEL_COLUMNS)

### Creating the Model

In [None]:
# Split each frame into the feature matrix (x, y, z) and the target ('labels').
y_train = filtered_training_set['labels']
X_train = filtered_training_set.loc[:, ['x', 'y', 'z']]

y_test = filtered_testing_set['labels']
X_test = filtered_testing_set.loc[:, ['x', 'y', 'z']]

In [None]:
# Learn the min/max scaling from the training features only, then apply the
# same fitted scaling to both the training and the hold-out features.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Tune a decision tree with an exhaustive grid search (5-fold cross-validation,
# weighted F1 as the selection metric), then score the winner on the hold-out set.

tree = DecisionTreeClassifier(random_state=42)

# Candidate hyperparameter values for the decision tree.
param_grid_dt = dict(
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
    criterion=['gini', 'entropy'],
)

grid_search_dt = GridSearchCV(
    estimator=tree,
    param_grid=param_grid_dt,
    cv=5,
    scoring='f1_weighted',
)
grid_search_dt.fit(X_train_scaled, y_train)

print("Best parameters found: ", grid_search_dt.best_params_)

# best_estimator_ is the tree refitted on the full training set with the
# winning parameter combination.
best_dt = grid_search_dt.best_estimator_

# Evaluate the tuned tree on the hold-out set.
y_test_pred_dt = best_dt.predict(X_test_scaled)
test_f1_score_dt = f1_score(y_true=y_test, y_pred=y_test_pred_dt, average='weighted')

print(f"Test F1 Score: {test_f1_score_dt}")

### Bagging using the best hyperparameters

Bagging (bootstrap aggregating) can be used to reduce the variance of a model. It is a form of ensemble learning in which many models, each trained on a different random subset of the training data, are combined to produce a more accurate and robust model.

In [None]:
# Build the base tree from the hyperparameters the grid search selected rather
# than from a hand-copied snapshot of them, which silently goes stale whenever
# the data or the search grid changes. (The values previously hard-coded here
# were criterion='gini', max_depth=15, min_samples_leaf=2, min_samples_split=20.)
# Requires the grid-search cell (which defines grid_search_dt) to have been run.
best_params_dt = {**grid_search_dt.best_params_, 'random_state': 42}

optimal_dt = DecisionTreeClassifier(**best_params_dt)

# Bag 10 copies of the tuned tree, each fitted on a random subset of the
# training data; the seed keeps the resampling reproducible.
bagging_dt = BaggingClassifier(optimal_dt, n_estimators=10, random_state=42)
bagging_dt.fit(X_train_scaled, y_train)

# Score the ensemble on the hold-out set with the same metric used for the
# single tuned tree, so the two results are directly comparable.
y_test_pred = bagging_dt.predict(X_test_scaled)

weighted_f1 = f1_score(y_test, y_test_pred, average='weighted')

print(f"Weighted F1 Score: {weighted_f1}")