In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In this notebook we will use grid search to find the best hyperparameters for KNN, Decision Trees and Nueral Networks

In [2]:
#load training and test set
training_set = pd.read_parquet("ml_data_train_holdout/train_set.parquet")
testing_set = pd.read_parquet("ml_data_train_holdout/holdout_set.parquet")

## Pre-Processing

In [3]:
#some rows have 2 or more labels, use explode so each row has only 1 label
training_set_exploded = training_set.explode('labels')
testing_set_exploded = testing_set.explode('labels')

In [4]:
training_set_exploded

Unnamed: 0,timestamp,x,y,z,labels,filename
13497564,0.010,0.219971,-2.150879,-1.247314,SM,aadi_ga_20150123_1.parquet
26846227,0.020,0.124756,-1.658203,-0.735352,SM,aadi_ga_20150123_1.parquet
6305228,0.040,0.148926,-1.443359,-0.931641,SM,aadi_ga_20150123_1.parquet
30245674,0.060,0.139893,-1.896484,-1.113281,SM,aadi_ga_20150123_1.parquet
11300293,0.080,0.358154,-2.125977,-1.261963,SM,aadi_ga_20150123_1.parquet
...,...,...,...,...,...,...
159516,622.109,-0.359375,0.531250,-0.750000,SM,zwicky_ga_20150629_1.parquet
27452467,622.129,-0.843750,-0.234375,-1.218750,SM,zwicky_ga_20150629_1.parquet
644896,622.139,-1.156250,-1.546875,-1.406250,SM,zwicky_ga_20150629_1.parquet
11101991,622.159,0.546875,-6.406250,-0.921875,SM,zwicky_ga_20150629_1.parquet


In [5]:
training_set_exploded.shape

(79732876, 6)

In [6]:
training_set_exploded = training_set_exploded.drop_duplicates()

testing_set_exploded = testing_set_exploded.drop_duplicates()

In [7]:
# Trim whitespace from 'labels' column in both datasets
training_set_exploded['labels'] = training_set_exploded['labels'].str.strip()
testing_set_exploded['labels'] = testing_set_exploded['labels'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_set_exploded['labels'] = training_set_exploded['labels'].str.strip()


In [8]:
training_set_exploded.shape

(79729451, 6)

In [9]:
null_x = training_set_exploded["x"].isnull().sum()
null_y = training_set_exploded["y"].isnull().sum()
null_z = training_set_exploded["z"].isnull().sum()
null_labels = training_set_exploded["labels"].isnull().sum()

null_x, null_y, null_z, null_labels

(np.int64(234032), np.int64(234032), np.int64(234032), np.int64(0))

In [10]:
#remove null values (currently drop, can be imputed with mean etc)
training_set_exploded = training_set_exploded.dropna()
testing_set_exploded = testing_set_exploded.dropna()

In [11]:
mask = training_set_exploded["labels"].isin(["X1", "20-0", "1-2", "2-0", "23-2"])
filtered_training_set = training_set_exploded[mask]

test_mask = testing_set_exploded["labels"].isin(["X1", "20-0", "1-2", "2-0", "23-2"])
filtered_testing_set = testing_set_exploded[test_mask]

## Creating the models

In [12]:
sampled_training_set = filtered_training_set.sample(n=200000, random_state=42)
X_train = sampled_training_set[['x', 'y', 'z']]
y_train = sampled_training_set['labels']

#X_train = filtered_training_set[['x', 'y', 'z']]
#y_train = filtered_training_set['labels']

# For testing data
X_test = filtered_testing_set[['x', 'y', 'z']]
y_test = filtered_testing_set['labels']

In [13]:

scaler = MinMaxScaler()

#scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

In [14]:
X_train_scaled.shape

(200000, 3)

### Decision Trees

In [15]:
tree = DecisionTreeClassifier(random_state=42)

#grid search for dt
param_grid_dt = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy']
}

grid_search_dt = GridSearchCV(tree, param_grid_dt, cv=5, scoring='f1_weighted')

grid_search_dt.fit(X_train_scaled, y_train)

print("Best parameters found: ", grid_search_dt.best_params_)

best_dt = grid_search_dt.best_estimator_

y_test_pred_dt = best_dt.predict(X_test_scaled)

test_f1_score_dt = f1_score(y_test, y_test_pred_dt, average='weighted')
print(f"Test F1 Score: {test_f1_score_dt}")

Best parameters found:  {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 20}
Test F1 Score: 0.655888421613866


### K-Nearest Neighbours

In [16]:
knn = KNeighborsClassifier()

#grid search for knn
param_grid = {
    'n_neighbors': [2, 3, 5, 7, 10, 15, 18, 21],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

grid_search_knn = GridSearchCV(knn, param_grid, cv=5, scoring='f1_weighted')

grid_search_knn.fit(X_train_scaled, y_train)

print("Best parameters found: ", grid_search_knn.best_params_)

best_knn = grid_search_knn.best_estimator_

y_test_pred = best_knn.predict(X_test_scaled)

test_f1_score = f1_score(y_test, y_test_pred, average='weighted')

print(f"Test F1 Score: {test_f1_score}")

Best parameters found:  {'metric': 'euclidean', 'n_neighbors': 21, 'weights': 'uniform'}
Test F1 Score: 0.6690005356498379


#### Nueral Network

In [19]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, solver='adam', random_state=42)

mlp.fit(X_train_scaled, y_train)

y_pred = mlp.predict(X_test_scaled)

weighted_f1 = f1_score(y_test, y_test_pred, average='weighted')
print("F1-Score of the MLP classifier:", weighted_f1)

F1-Score of the MLP classifier: 0.6733125059285231


## Performing bagging on the best hyperparameters

#### KNN

In [17]:
best_params_knn = {'metric': 'manhattan', 'n_neighbors': 21, 'weights': 'uniform'}

optimal_knn = KNeighborsClassifier(**best_params_knn)

bagging_knn= BaggingClassifier(optimal_knn, n_estimators=10, random_state=42)
bagging_knn.fit(X_train_scaled, y_train)


y_test_pred = bagging_knn.predict(X_test_scaled)

weighted_f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Weighted F1 Score: {weighted_f1}")

Weighted F1 Score: 0.6709222206506011


#### Decision Trees

This is the best so far

In [18]:
best_params_dt = {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 20, 'random_state':42}

optimal_dt = DecisionTreeClassifier(**best_params_dt)

bagging_dt = BaggingClassifier(optimal_dt, n_estimators=10, random_state=42)
bagging_dt.fit(X_train_scaled, y_train)


y_test_pred = bagging_dt.predict(X_test_scaled)

weighted_f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Weighted F1 Score: {weighted_f1}")

Weighted F1 Score: 0.6733125059285231
