### Importing

In [2]:
pip install matplotlib

Collecting matplotlib
  Using cached matplotlib-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.3 MB)
Collecting kiwisolver>=1.3.1
  Using cached kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)
Collecting cycler>=0.10
  Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Collecting contourpy>=1.0.1
  Using cached contourpy-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (324 kB)
Collecting pillow>=8
  Using cached pillow-11.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.4 MB)
Collecting fonttools>=4.22.0
  Using cached fonttools-4.54.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
Collecting pyparsing>=2.3.1
  Using cached pyparsing-3.2.0-py3-none-any.whl (106 kB)
Installing collected packages: pyparsing, pillow, kiwisolver, fonttools, cycler, contourpy, matplotlib
Successfully installed contourpy-1.3.1 cycler-0.12.1 fonttools-4.54.1 kiwisolver-1.4.7 matplotlib-3.9.2 pillow-11.0.0 pyparsing-3

In [3]:
pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
Collecting scipy>=1.6.0
  Using cached scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.2 MB)
Collecting joblib>=1.2.0
  Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Collecting threadpoolctl>=3.1.0
  Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.5.2 scipy-1.14.1 threadpoolctl-3.5.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install seaborn

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.svm import SVC

### Pre-processing

In [3]:
#load training and test set
training_set = pd.read_parquet("ml_data_train_holdout/train_set.parquet")
testing_set = pd.read_parquet("ml_data_train_holdout/holdout_set.parquet")

In [4]:
#get sample of data as currently doesn't load 
sampled_training_set = training_set.sample(n=100000)

In [5]:
#remove null values (currently drop, can be imputed with mean etc)
training_set = sampled_training_set.dropna()
testing_set = sampled_training_set.dropna()

In [6]:
#some rows have 2 or more labels, use explode so each row has only 1 label
training_set_exploded = training_set.explode('labels')
testing_set_exploded = testing_set.explode('labels')

### Building Models

In [7]:
# For training data
X_train = training_set_exploded[['x', 'y', 'z']]
y_train = training_set_exploded['labels']

# For testing data
X_test = testing_set_exploded[['x', 'y', 'z']]
y_test = testing_set_exploded['labels']

In [8]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

#### KNN

In [9]:
knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train_scaled, y_train)

In [10]:
y_pred = knn.predict(X_test_scaled)

f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

F1 Score: 0.6538497242236053


In [11]:
neighbor_params = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

for n in neighbor_params:
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train_scaled, y_train)
    y_test_pred = knn.predict(X_test_scaled)
    test_f1_score = f1_score(y_test, y_test_pred, average='weighted')
    print(f"n_neighbors: {n}, Testing F1 Score: {test_f1_score:}")

n_neighbors: 1, Testing F1 Score: 0.6538497242236053
n_neighbors: 2, Testing F1 Score: 0.43124800338343255
n_neighbors: 3, Testing F1 Score: 0.3644005377840143
n_neighbors: 4, Testing F1 Score: 0.3267176456381955
n_neighbors: 5, Testing F1 Score: 0.31174406140701144
n_neighbors: 6, Testing F1 Score: 0.2996696419654422
n_neighbors: 7, Testing F1 Score: 0.29038723290316615
n_neighbors: 8, Testing F1 Score: 0.2805606231597595
n_neighbors: 9, Testing F1 Score: 0.2721853101749903
n_neighbors: 10, Testing F1 Score: 0.2642978722098082
n_neighbors: 11, Testing F1 Score: 0.2583676356178707


#### RANDOM FOREST

In [12]:
random_forest = RandomForestClassifier()

random_forest.fit(X_train_scaled, y_train)

In [13]:
y_pred = random_forest.predict(X_test_scaled)

f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

F1 Score: 0.6549898317826762


#### Decision Tree

In [14]:
decision_tree = DecisionTreeClassifier(random_state=42)

decision_tree.fit(X_train_scaled, y_train)

In [15]:
y_pred = decision_tree.predict(X_test_scaled)

f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

F1 Score: 0.5875613869931946


#### SVM Linear

In [16]:
svm_linear = SVC(kernel='linear', random_state=42)

svm_linear.fit(X_train_scaled, y_train)

In [None]:
y_pred = svm_linear.predict(X_test_scaled)

f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

#### SVM RBF

In [None]:
svm_rbf = SVC(kernel='rbf', random_state=42)

svm_rbf.fit(X_train_scaled, y_train)

In [None]:
y_pred = svm_rbf.predict(X_test_scaled)

f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

#### SVM POLY

In [None]:
svm_poly = SVC(kernel='poly', degree=3, random_state=42)

svm_poly.fit(X_train_scaled, y_train)

In [None]:
y_pred = svm_poly.predict(X_test_scaled)

f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

#### SVM SIGMOID

In [None]:
svm_sigmoid = SVC(kernel='sigmoid', random_state=42)

svm_sigmoid.fit(X_train_scaled, y_train)

In [None]:
y_pred = svm_sigmoid.predict(X_test_scaled)

f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

#### Neural Networks

#### Ideas

Impute missing values rather than dropping values can use KNNImputer, remove duplicates, check for outliers (check skew and kurtosis, histogram) impute outliers using IQR approach (code below) , change scaler to MinMax, tune the parameters for all models, try models with different features ,eg. x and y, potentially try neural networks, use cross-validation to better evaluate the models.

In [None]:
#IQR approach for imputing outliers, do for all 3 x,y,z
'''
q1 = sampled_training_set["x"].quartile(0.25)
q3 = sampled_training_set["x"].quartile(0.75)

IQR = q3-q1
upper = q3 + 1.5 * IQR
lower = q1 - 1.5 * IQR

sampled_training_set["x"] = np.where(sampled_training_set["x"] > upper, upper,
                                     np.where(sampled_training_set["x"] < lower, lower,
                                              sampled_training_set["x"]))
'''