In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.svm import SVC

In [2]:
#load training and test set
training_set = pd.read_parquet("train_set (1).parquet")
testing_set = pd.read_parquet("holdout_set (1).parquet")

## Pre-Processing

In [3]:
#some rows have 2 or more labels, use explode so each row has only 1 label
training_set_exploded = training_set.explode('labels')
testing_set_exploded = testing_set.explode('labels')

In [4]:
training_set_exploded

Unnamed: 0,timestamp,x,y,z,labels,filename
13497564,0.010,0.219971,-2.150879,-1.247314,SM,aadi_ga_20150123_1.parquet
26846227,0.020,0.124756,-1.658203,-0.735352,SM,aadi_ga_20150123_1.parquet
6305228,0.040,0.148926,-1.443359,-0.931641,SM,aadi_ga_20150123_1.parquet
30245674,0.060,0.139893,-1.896484,-1.113281,SM,aadi_ga_20150123_1.parquet
11300293,0.080,0.358154,-2.125977,-1.261963,SM,aadi_ga_20150123_1.parquet
...,...,...,...,...,...,...
159516,622.109,-0.359375,0.531250,-0.750000,SM,zwicky_ga_20150629_1.parquet
27452467,622.129,-0.843750,-0.234375,-1.218750,SM,zwicky_ga_20150629_1.parquet
644896,622.139,-1.156250,-1.546875,-1.406250,SM,zwicky_ga_20150629_1.parquet
11101991,622.159,0.546875,-6.406250,-0.921875,SM,zwicky_ga_20150629_1.parquet


In [5]:
training_set_exploded.shape

(79732876, 6)

In [6]:
training_set_exploded = training_set_exploded.drop_duplicates()

testing_set_exploded = testing_set_exploded.drop_duplicates()

In [7]:
training_set_exploded.shape

(79729451, 6)

In [8]:
null_x = training_set_exploded["x"].isnull().sum()
null_y = training_set_exploded["y"].isnull().sum()
null_z = training_set_exploded["z"].isnull().sum()
null_labels = training_set_exploded["labels"].isnull().sum()

null_x, null_y, null_z, null_labels

(np.int64(234032), np.int64(234032), np.int64(234032), np.int64(0))

In [10]:
# Initialize the SimpleImputer
imputer = SimpleImputer(strategy='median')

# Apply imputer to 'x', 'y', 'z' columns on training data
training_set_exploded[['x', 'y', 'z']] = imputer.fit_transform(training_set_exploded[['x', 'y', 'z']])

# Apply the same imputation strategy to the test set using the same imputer
testing_set_exploded[['x', 'y', 'z']] = imputer.transform(testing_set_exploded[['x', 'y', 'z']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_set_exploded[['x', 'y', 'z']] = imputer.fit_transform(training_set_exploded[['x', 'y', 'z']])


In [11]:
mask = training_set_exploded["labels"].isin(["X1", "20-0", "1-2", "2-0", "23-2"])
filtered_training_set = training_set_exploded[mask]

test_mask = testing_set_exploded["labels"].isin(["X1", "20-0", "1-2", "2-0", "23-2"])
filtered_testing_set = testing_set_exploded[test_mask]

## Creating the models

In [21]:
sampled_training_set = filtered_training_set.sample(n=200000, random_state=42)
X_train = sampled_training_set[['x', 'y', 'z']]
y_train = sampled_training_set['labels']

#X_train = filtered_training_set[['x', 'y', 'z']]
#y_train = filtered_training_set['labels']

# For testing data
X_test = filtered_testing_set[['x', 'y', 'z']]
y_test = filtered_testing_set['labels']

In [22]:

scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

In [23]:
X_train_scaled.shape

(200000, 3)

### Random Forest

In [24]:
random_forest = RandomForestClassifier()

random_forest.fit(X_train_scaled, y_train)

In [25]:
y_pred = random_forest.predict(X_test_scaled)

f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

F1 Score: 0.6544226868755761


### Decision Trees

In [29]:
tree = DecisionTreeClassifier(random_state=42)

#grid search for dt
param_grid_dt = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy']
}

grid_search_dt = GridSearchCV(tree, param_grid_dt, cv=5, scoring='f1_weighted')

grid_search_dt.fit(X_train_scaled, y_train)

print("Best parameters found: ", grid_search_dt.best_params_)

print("Best grid search F1 weighted score: ", grid_search_dt.best_score_)

best_dt = grid_search_dt.best_estimator_

y_test_pred_dt = best_dt.predict(X_test_scaled)

test_f1_score_dt = f1_score(y_test, y_test_pred_dt, average='weighted')
print(f"Test F1 Score: {test_f1_score_dt}")

### K-Nearest Neighbours

In [28]:
knn = KNeighborsClassifier()

#grid search for knn
param_grid = {
    'n_neighbors': [2, 3, 5, 7, 10, 15, 18, 21],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

grid_search_knn = GridSearchCV(knn, param_grid, cv=5, scoring='f1_weighted')

grid_search_knn.fit(X_train_scaled, y_train)

print("Best parameters found: ", grid_search_knn.best_params_)

best_knn = grid_search_knn.best_estimator_

y_test_pred = best_knn.predict(X_test_scaled)

test_f1_score = f1_score(y_test, y_test_pred, average='weighted')

print(f"Test F1 Score: {test_f1_score}")

Best parameters found:  {'metric': 'manhattan', 'n_neighbors': 21, 'weights': 'uniform'}
Test F1 Score: 0.6682146952018578
