In [1]:
pip install seaborn scikit-learn matplotlib

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
Collecting matplotlib
  Using cached matplotlib-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.3 MB)
Collecting threadpoolctl>=3.1.0
  Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting scipy>=1.6.0
  Using cached scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.2 MB)
Collecting joblib>=1.2.0
  Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Collecting kiwisolver>=1.3.1
  Using cached kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)
Collecting contourpy>=1.0.1
  Using cached contourpy-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (324 kB)
Collecting pyparsing>=2.3.1
  Using cached pyparsing-3.2.0-py3-none-any.whl (106 kB)
Collecting fonttools>=4.22.0
  Using cached

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.svm import SVC

### Pre-processing

In [2]:
# Loading the data
# Read the training and holdout splits from parquet files; the paths are relative
# to the notebook's working directory.
training_set = pd.read_parquet("ml_data_train_holdout/train_set.parquet")
testing_set = pd.read_parquet("ml_data_train_holdout/holdout_set.parquet")

In [3]:
# Sample data
# Draw the same fixed-size random subset from each split; sharing one set of
# keyword arguments keeps the sample size and seed identical for both frames.
sample_kwargs = {"n": 100000, "random_state": 42}
sampled_training_set = training_set.sample(**sample_kwargs)
sampled_testing_set = testing_set.sample(**sample_kwargs)

In [4]:
# Exploding the labels
# explode() emits one row per element of a list-like 'labels' entry, repeating the
# other columns and the original index value for each emitted row.
# NOTE(review): presumably 'labels' holds a list of labels per sample - confirm.
training_set_exploded = sampled_training_set.explode('labels')
testing_set_exploded = sampled_testing_set.explode('labels')

In [5]:
# Fill missing x/y/z values with the per-column mean. The imputer is fitted on the
# training rows only and then reused for the testing rows, so both splits are
# filled with the training means.
imputer = SimpleImputer(strategy='mean')
training_set_imputed = imputer.fit_transform(training_set_exploded[['x', 'y', 'z']])
testing_set_imputed = imputer.transform(testing_set_exploded[['x', 'y', 'z']])

In [6]:
# Write the imputed values back into the x/y/z columns of the exploded frames
# (this overwrites those columns in place).
training_set_exploded[['x', 'y', 'z']] = training_set_imputed
testing_set_exploded[['x', 'y', 'z']] = testing_set_imputed

# Drop duplicates
# Rows are compared across all columns; the results are bound to new names so the
# exploded frames stay available.
training_set_clean = training_set_exploded.drop_duplicates()
testing_set_clean = testing_set_exploded.drop_duplicates()

In [7]:
# Define IQR filter function
def remove_outliers(df, columns, factor=1.5):
    """Drop rows that fall outside the IQR fences in any of the given columns.

    For every column in ``columns`` the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``, with the quartiles computed from ``df`` itself. A row
    is removed when at least one of its values lies strictly outside its
    column's fences. NaN values compare as False and are therefore never
    treated as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    columns : list of str
        Numeric columns to test.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, so later column assignments
        on the result do not raise SettingWithCopyWarning.
    """
    values = df[columns]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr
    is_outlier = ((values < lower_fence) | (values > upper_fence)).any(axis=1)
    # .copy() detaches the result from `df`; callers assign into it afterwards.
    return df[~is_outlier].copy()

# Remove outliers
# Each split is filtered independently on the x/y/z columns, i.e. with quartiles
# computed from that split's own rows.
# NOTE(review): this also drops rows from the holdout set - confirm that is intended.
axis_columns = ['x', 'y', 'z']
training_set_filtered = remove_outliers(df=training_set_clean, columns=axis_columns)
testing_set_filtered = remove_outliers(df=testing_set_clean, columns=axis_columns)

### Checking Labels

In [16]:
# Count numbers of each label
label_counts = training_set_filtered['labels'].value_counts()
print("\nTraining set:", label_counts)

label_counts_2 = testing_set_filtered['labels'].value_counts()
print("\nTesting set:", label_counts_2)

# Count and print labels
unique_labels = training_set_filtered['labels'].unique()
print("Unique labels in training set:", unique_labels)

unique_labels_2 = testing_set_filtered['labels'].unique()
# Report the testing-set labels (unique_labels_2), not the training-set ones.
print("Unique labels in testing set:", unique_labels_2)


Training set: 1-2      23418
NULL     18066
2-0      13558
H         9152
23-2      8058
         ...  
1-4          2
28           2
34-1         1
21-2D        1
5-55         1
Name: labels, Length: 115, dtype: int64

Testing set: NULL    30486
1-2     15612
2-0     10242
2-4      4372
1-1      3576
        ...  
5-4         1
00          1
2-3         1
2-2         1
41-0        1
Name: labels, Length: 99, dtype: int64
Unique labels in training set: ['H' 'NULL' '2-4' '27-0' '20-0' '21-2' '1-2' '44-0' '33-0' '2-0' '40-6'
 '5-1' '1-1' '1-C' '35-0' '23-2' '46-0' '21-1A' '22-2' '3-2' '3-1' '21-5'
 '19-1' 'P' '5-5' '3-4' '30-0' '1-C2' '30-1' '1-C1' '48-0' '28-0' '35-1'
 '1-A2' '12-C2' '29-3' '23-3' '36-0' '1-B2' '1-3' '21-1D' '23-1' '1-A1'
 '43-0' '32-0' '1-U' '1-B1' '45-0' '12-B1' 'SM' '12-B2' '21-4' '40-2' 'X1'
 '21-1C' '4-1' '5-2' '5-3' '19-2' '50-0' '40-5' '31-0' '23-4' '21-1' 'S'
 '2-7' '2-6' '22-1' '29-1' '12-A1' '2-5' '29-4' '3-0' '29-0' '40-4' '29-2'
 '41-0' '26-0' '3-3' '12-A2'

### Removing Whitespace in Labels

In [13]:
# Strip leading/trailing whitespace from the training labels. Rebinding through
# assign() instead of writing via .loc into a frame produced by boolean filtering
# avoids pandas' SettingWithCopyWarning.
training_set_filtered = training_set_filtered.assign(labels=training_set_filtered['labels'].str.strip())

In [14]:
# Strip leading/trailing whitespace from the testing labels. Rebinding through
# assign() instead of writing via .loc into a frame produced by boolean filtering
# avoids pandas' SettingWithCopyWarning.
testing_set_filtered = testing_set_filtered.assign(labels=testing_set_filtered['labels'].str.strip())

In [17]:
# Re-check the label distribution after stripping whitespace
label_counts = training_set_filtered['labels'].value_counts()
print("\nLabel counts in training set:",label_counts)

label_counts_2 = testing_set_filtered['labels'].value_counts()
print("\nLabel counts in testing set:",label_counts_2)

# Count and print labels
unique_labels = training_set_filtered['labels'].unique()
print("Unique labels in training set:", unique_labels)

unique_labels_2 = testing_set_filtered['labels'].unique()
# Report the testing-set labels (unique_labels_2), not the training-set ones.
print("Unique labels in testing set:", unique_labels_2)


Label counts in training set: 1-2      23418
NULL     18066
2-0      13558
H         9152
23-2      8058
         ...  
1-4          2
28           2
34-1         1
21-2D        1
5-55         1
Name: labels, Length: 115, dtype: int64

Label counts in testing set: NULL    30486
1-2     15612
2-0     10242
2-4      4372
1-1      3576
        ...  
5-4         1
00          1
2-3         1
2-2         1
41-0        1
Name: labels, Length: 99, dtype: int64
Unique labels in training set: ['H' 'NULL' '2-4' '27-0' '20-0' '21-2' '1-2' '44-0' '33-0' '2-0' '40-6'
 '5-1' '1-1' '1-C' '35-0' '23-2' '46-0' '21-1A' '22-2' '3-2' '3-1' '21-5'
 '19-1' 'P' '5-5' '3-4' '30-0' '1-C2' '30-1' '1-C1' '48-0' '28-0' '35-1'
 '1-A2' '12-C2' '29-3' '23-3' '36-0' '1-B2' '1-3' '21-1D' '23-1' '1-A1'
 '43-0' '32-0' '1-U' '1-B1' '45-0' '12-B1' 'SM' '12-B2' '21-4' '40-2' 'X1'
 '21-1C' '4-1' '5-2' '5-3' '19-2' '50-0' '40-5' '31-0' '23-4' '21-1' 'S'
 '2-7' '2-6' '22-1' '29-1' '12-A1' '2-5' '29-4' '3-0' '29-0' '40-4' '29

### Picking a Subset of Labels

In [18]:
# Keep only the rows carrying one of the five labels used for modelling; the same
# label list is applied to both splits.
selected_labels = ['X1','20-0','1-2','2-0','23-2']
train_label_mask = training_set_filtered['labels'].isin(selected_labels)
test_label_mask = testing_set_filtered['labels'].isin(selected_labels)
filtered_behaviors = training_set_filtered[train_label_mask]
filtered_behaviors_test = testing_set_filtered[test_label_mask]

In [19]:
# Preview the first rows of the training subset.
filtered_behaviors.head()

Unnamed: 0,timestamp,x,y,z,labels,filename
19163301,61.349,0.632813,-0.233643,0.663818,20-0,kiss_drinking_20220504_1.parquet
44941490,852.061,-0.766846,0.375244,0.566162,1-2,oscar_ga_20150128_1.parquet
33141156,538.12,0.21875,-0.296875,-0.875,1-2,bella_ga_20141023_1.parquet
33372593,74.807,0.017578,-0.364258,0.873047,2-0,betty_barking_20230601_1.parquet
26120001,375.94,-0.900635,-0.4375,0.177246,1-2,bella_ga_20150602_3.parquet


In [20]:
# Preview the first rows of the testing subset.
filtered_behaviors_test.head()

Unnamed: 0,timestamp,x,y,z,labels,filename
560531,561.14,0.59375,0.0625,0.703125,1-2,abe_ga_20150505_1.parquet
38360539,2429.551,-0.94043,-0.395508,0.435791,1-2,oscar_ga_20150128_1.parquet
48186709,689.879,-0.081787,0.286377,0.925781,1-2,sandy_ga_20140625_1.parquet
12111689,114.963,-0.5625,-0.015625,0.296875,2-0,captain_ga_20140723_1.parquet
47772195,527.971,-0.758789,0.782471,0.166016,1-2,sammy_ga_20150624_1.parquet


In [21]:
# Row counts of the training subset, then the testing subset.
print(len(filtered_behaviors))
print(len(filtered_behaviors_test))

47434
28315


In [22]:
# Sanity check: distinct labels remaining in the training subset.
filtered_behaviors.labels.unique()

array(['20-0', '1-2', '2-0', '23-2', 'X1'], dtype=object)

In [23]:
# Sanity check: distinct labels remaining in the testing subset.
filtered_behaviors_test.labels.unique()

array(['1-2', '2-0', '20-0', '23-2', 'X1'], dtype=object)

In [24]:
# Per-label row counts in the training subset.
print(filtered_behaviors['labels'].value_counts())

1-2     23418
2-0     13558
23-2     8058
20-0     2209
X1        191
Name: labels, dtype: int64


In [25]:
# Per-label row counts in the testing subset.
# NOTE(review): the recorded output shows only 19 rows for '23-2' and 2 for 'X1'
# here, so scores for those classes rest on very few samples.
print(filtered_behaviors_test['labels'].value_counts())

1-2     15612
2-0     10242
20-0     2440
23-2       19
X1          2
Name: labels, dtype: int64


## Building Models

In [26]:
# Split each subset into its feature matrix (x, y, z columns) and label vector.
feature_columns = ['x', 'y', 'z']
target_column = 'labels'

# For training data
X_train, y_train = filtered_behaviors[feature_columns], filtered_behaviors[target_column]

# For testing data
X_test, y_test = filtered_behaviors_test[feature_columns], filtered_behaviors_test[target_column]

In [27]:
# Standardise the features. The scaler's statistics are learned from the training
# features only and then applied unchanged to the testing features.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

### Random Forest

In [28]:
# Baseline random forest with default hyperparameters. random_state is fixed
# (42, as for the other models in this notebook) so the reported score is
# reproducible across runs.
random_forest = RandomForestClassifier(random_state=42)

random_forest.fit(X_train_scaled, y_train)

In [29]:
# Predict labels for the scaled testing features and score them with the weighted
# F1 (per-class F1 averaged by each class's support in y_test).
y_pred = random_forest.predict(X_test_scaled)

f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

F1 Score: 0.6552196351485224


#### Grid Search

In [None]:
# Define the model
random_forest_gs = RandomForestClassifier(random_state=42)

# Define hyperparameter grid
# 'auto' is no longer accepted for max_features (removed in scikit-learn 1.3; for
# classifiers it was an alias of 'sqrt'), so candidates using it fail to fit.
# None (= consider all features) provides a genuinely different second option
# and keeps the grid at 216 candidates.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', None]
}

# Setup GridSearchCV
# Select on weighted F1 so that model selection uses the same metric that is
# reported on the test data below (the default would be accuracy).
grid_search = GridSearchCV(estimator=random_forest_gs,
                           param_grid=param_grid,
                           scoring='f1_weighted',
                           cv=5,
                           n_jobs=-1,
                           verbose=2)

# Fit the model
grid_search.fit(X_train_scaled, y_train)

# Get the best model (refitted on the full training data by GridSearchCV)
best_model = grid_search.best_estimator_

# Predict on test data
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

# Print best parameters
print(f"Best parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


### K Nearest Neighbour

In [26]:
# Baseline k-NN classifier that uses a single nearest neighbour.
knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train_scaled, y_train)

In [27]:
# Score the 1-NN model on the scaled testing features with the weighted F1.
# Note that y_pred and f1 are rebound here, replacing the previous model's values.
y_pred = knn.predict(X_test_scaled)

f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

F1 Score: 0.5805110700559294


In [28]:
# Sweep k = 1..11: refit k-NN for each value and print the weighted F1 obtained on
# the testing features. After the loop `knn` holds the last (k = 11) model.
# NOTE(review): k is compared on the holdout data itself; picking the best k from
# this table would make the holdout score optimistic - consider cross-validation.
neighbor_params = list(range(1, 12))

for n in neighbor_params:
    knn = KNeighborsClassifier(n_neighbors=n).fit(X_train_scaled, y_train)
    y_test_pred = knn.predict(X_test_scaled)
    test_f1_score = f1_score(y_test, y_test_pred, average='weighted')
    print(f"n_neighbors: {n}, Testing F1 Score: {test_f1_score:}")

n_neighbors: 1, Testing F1 Score: 0.5805110700559294
n_neighbors: 2, Testing F1 Score: 0.5922277683225955
n_neighbors: 3, Testing F1 Score: 0.6019840802022207
n_neighbors: 4, Testing F1 Score: 0.6184377848855566
n_neighbors: 5, Testing F1 Score: 0.6283369800332369
n_neighbors: 6, Testing F1 Score: 0.6302803247046979
n_neighbors: 7, Testing F1 Score: 0.637426618597988
n_neighbors: 8, Testing F1 Score: 0.642116956258748
n_neighbors: 9, Testing F1 Score: 0.6418896682087929
n_neighbors: 10, Testing F1 Score: 0.644249813345106
n_neighbors: 11, Testing F1 Score: 0.646018349603187


### Decision Tree

In [29]:
# Single decision tree with default hyperparameters; random_state is fixed so the
# fit is reproducible.
decision_tree = DecisionTreeClassifier(random_state=42)

decision_tree.fit(X_train_scaled, y_train)

In [30]:
# Score the decision tree on the scaled testing features with the weighted F1.
# Note that y_pred and f1 are rebound here, replacing the previous model's values.
y_pred = decision_tree.predict(X_test_scaled)

f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

F1 Score: 0.5924408876622295


## Non-filtered (all labels)

In [31]:
# Same feature/label split as before, but taken from the frames that still contain
# every label (no label subset applied).
all_label_features = ['x', 'y', 'z']
all_label_target = 'labels'

# For training data
X_train_2, y_train_2 = training_set_filtered[all_label_features], training_set_filtered[all_label_target]

# For testing data
X_test_2, y_test_2 = testing_set_filtered[all_label_features], testing_set_filtered[all_label_target]

In [32]:
# Separate scaler for the all-label data: statistics are learned from the training
# features only and then applied unchanged to the testing features.
scaler_2 = StandardScaler()

X_train_scaled_2 = scaler_2.fit_transform(X_train_2)

X_test_scaled_2 = scaler_2.transform(X_test_2)

In [33]:
## Random Forest
# Default-hyperparameter forest trained on all labels. random_state is fixed
# (42, as for the other models in this notebook) so results are reproducible.
random_forest_2 = RandomForestClassifier(random_state=42)

random_forest_2.fit(X_train_scaled_2, y_train_2)