In [2]:
pip install seaborn scikit-learn matplotlib

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
Collecting matplotlib
  Using cached matplotlib-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.3 MB)
Collecting scipy>=1.6.0
  Using cached scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.2 MB)
Collecting joblib>=1.2.0
  Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Collecting threadpoolctl>=3.1.0
  Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting kiwisolver>=1.3.1
  Using cached kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)
Collecting contourpy>=1.0.1
  Using cached contourpy-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (324 kB)
Collecting pyparsing>=2.3.1
  Using cached pyparsing-3.2.0-py3-none-any.whl (106 kB)
Collecting pillow>=8
  Using cached pillow-

In [21]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [22]:
# Get Training Data and Holdout data
path_to_train_holdout = 'ml_data_train_holdout/'
files = list(Path(path_to_train_holdout).glob('*.parquet'))
print(files)
training_file = files[0]
training_data = pd.read_parquet(training_file)
training_data.head()
#holdout_file = files[1]
#holdout_data = pd.read_parquet(holdout_file)
#holdout_data.head()
print(f'Amount of Training Data: {len(training_data)}')

[PosixPath('ml_data_train_holdout/train_set.parquet'), PosixPath('ml_data_train_holdout/holdout_set.parquet')]
Amount of Training Data: 52894345


# Only Using 1 label

In [9]:
# Split into training and testing

sampled_df = training_data.sample(n=600000, random_state=42)
# Because the label is a list value, I'll only take first label for now
def extract_first_label(labels):
    return labels[0]

def is_length_one(label_list):
    return len(label_list) == 1

# Only look at rows with 1 label
sampled_df = sampled_df[sampled_df['labels'].apply(is_length_one)]

sampled_df['labels'] = sampled_df['labels'].apply(extract_first_label)
print(sampled_df.head(10))

# Remove any Nan from dataframe
sampled_df = sampled_df.dropna()
sampled_df = sampled_df[sampled_df.labels != 'NULL']

# Take feature and target label
features = sampled_df[['x', 'y', 'z']]
target = sampled_df['labels']

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

          timestamp         x         y         z labels  \
16822003    757.530 -0.092773 -0.808838 -0.648926      H   
51747       412.624  0.656250 -0.312500 -0.937500   NULL   
19163301     61.349  0.632813 -0.233643  0.663818   20-0   
14507399    139.959  0.669922 -0.549316  0.293945   21-2   
8940657      15.831  0.247559  1.083740  0.464844   NULL   
24760431    587.120 -1.066162  0.638428  0.100098   33-0   
26120001    375.940 -0.900635 -0.437500  0.177246    1-2   
11692425    320.833 -0.758789  1.049072  0.639160    5-1   
15941543   2245.080 -0.774170 -0.628418 -0.128906    1-1   
12370845    324.754 -0.697510 -0.565186 -0.417725    1-C   

                                       filename  
16822003              mac_ga_20150206_4.parquet  
51747               swade_ga_20150528_1.parquet  
19163301       kiss_drinking_20220504_1.parquet  
14507399    leda_lickingpaws_20221107_1.parquet  
8940657     hardee_urination_20220429_1.parquet  
24760431             aadi_ga_20150123_3

## Random Forest

In [6]:
# First need to use minmax scaler on data
# Initializing the MinMaxScaler
scaler = MinMaxScaler()

# Transforming the data to be in the range [0, 1]
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

In [7]:
accuracy = accuracy_score(y_test, preds)
print(f'Accuracy: {accuracy:.4f}')
f1 = f1_score(y_test, preds, average='weighted')
print(f'F1-score: {f1:.4f}')

Accuracy: 0.3956
F1-score: 0.3702


## Naive Bayes

In [10]:
# First need to use minmax scaler on data
# Initializing the MinMaxScaler
scaler = MinMaxScaler()

# Transforming the data to be in the range [0, 1]
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)

In [11]:
accuracy = accuracy_score(y_test, nb_preds)
print(f'Accuracy: {accuracy:.4f}')
f1 = f1_score(y_test, nb_preds, average='weighted')
print(f'F1-score: {f1:.4f}')

Accuracy: 0.2291
F1-score: 0.0854


# Grouping labels together into string
In the previous models, I only looked at rows with single label, this time I will group the labels into a string if there are more than 1

In [10]:
sampled_df_2 = training_data.sample(n=40000, random_state=42)

# Ensure labels are properly formatted
def clean_labels(labels):
    if isinstance(labels, (list, np.ndarray)):
        return '_'.join(labels)  # Convert array to string
    return label

sampled_df_2['labels'] = sampled_df_2['labels'].apply(clean_labels)
# We will still remove nulls
# Remove any Nan from dataframe
sampled_df_2 = sampled_df_2.dropna()
sampled_df_2 = sampled_df_2[sampled_df_2.labels != 'NULL']
print(sampled_df_2.head())

# Now split into testing and training

# Take feature and target label
features_2 = sampled_df_2[['x', 'y', 'z']]
target_2 = sampled_df_2['labels']

# Split into training and testing
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(features_2, target_2, test_size=0.2, random_state=42)

          timestamp         x         y         z    labels  \
16822003    757.530 -0.092773 -0.808838 -0.648926         H   
32760826    460.330 -0.593750 -1.140625 -0.531250  2-4_27-0   
19163301     61.349  0.632813 -0.233643  0.663818      20-0   
14507399    139.959  0.669922 -0.549316  0.293945      21-2   
44941490    852.061 -0.766846  0.375244  0.566162     1-2_H   

                                     filename  
16822003            mac_ga_20150206_4.parquet  
32760826         bailee_ga_20140311_1.parquet  
19163301     kiss_drinking_20220504_1.parquet  
14507399  leda_lickingpaws_20221107_1.parquet  
44941490          oscar_ga_20150128_1.parquet  


## Random Forest

In [11]:
# First need to use minmax scaler on data
# Initializing the MinMaxScaler
#scaler = MinMaxScaler()

# Transforming the data to be in the range [0, 1]
#X_train_2 = scaler.fit_transform(X_train_2)
#X_test_2 = scaler.transform(X_test_2)

# Train Model
rf_model_2 = RandomForestClassifier(verbose=5)
rf_model_2.fit(X_train_2, y_train_2)
rf_preds_2 = rf_model_2.predict(X_test_2)

building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100


[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:   11.1s


building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59

[Parallel(n_jobs=1)]: Done  71 tasks      | elapsed:   46.5s


building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  71 tasks      | elapsed:    1.4s


In [12]:
accuracy = accuracy_score(y_test_2, rf_preds_2)
print(f'Accuracy: {accuracy:.4f}')
f1 = f1_score(y_test_2, rf_preds_2, average='weighted')
print(f'F1-score: {f1:.4f}')

Accuracy: 0.1578
F1-score: 0.1372


## Naive Bayes

In [7]:
# First need to use minmax scaler on data
# Initializing the MinMaxScaler
scaler = MinMaxScaler()

# Transforming the data to be in the range [0, 1]
X_train_2 = scaler.fit_transform(X_train_2)
X_test_2 = scaler.transform(X_test_2)

nb_model_2 = MultinomialNB()
nb_model_2.fit(X_train_2, y_train_2)
nb_preds_2 = nb_model_2.predict(X_test_2)

In [9]:
accuracy = accuracy_score(y_test_2, nb_preds_2)
print(f'Accuracy: {accuracy:.4f}')
f1 = f1_score(y_test_2, nb_preds_2, average='weighted')
print(f'F1-score: {f1:.4f}')

Accuracy: 0.1041
F1-score: 0.0196


# Only Classifying Emotions

In [23]:
# Explode it first
exploded = training_data.explode('labels')
exploded.head()

Unnamed: 0,timestamp,x,y,z,labels,filename
13497564,0.01,0.219971,-2.150879,-1.247314,SM,aadi_ga_20150123_1.parquet
26846227,0.02,0.124756,-1.658203,-0.735352,SM,aadi_ga_20150123_1.parquet
6305228,0.04,0.148926,-1.443359,-0.931641,SM,aadi_ga_20150123_1.parquet
30245674,0.06,0.139893,-1.896484,-1.113281,SM,aadi_ga_20150123_1.parquet
11300293,0.08,0.358154,-2.125977,-1.261963,SM,aadi_ga_20150123_1.parquet


In [24]:
filtered_movements = exploded[exploded['labels'].isin(['26-0','5-0','12-0','4-0','18-0','X1','19-0','20-0'])]

In [25]:
print(len(filtered_movements))

1282262


In [32]:

filtered_movements.labels.unique()
x = exploded['labels'].value_counts()
#for a, b in exploded['labels'].value_counts():
 #   print(a, b)
    
print(x)

1-2      12246614
NULL      9841366
2-0       7367810
H         4688794
23-2      4485260
           ...   
5-0            54
4-1            38
21-B           18
1-0             5
1-C2            2
Name: labels, Length: 166, dtype: int64


In [20]:
exploded.labels.unique()

array(['SM', 'NULL', '2-0', '1-2', '2-4', '2-7', '1-1', '21-1D', '21-1A',
       '1-C2', '1-C', '21-1B', '5-1', '1-C1', '21-1C', 'H', '1-A2', '4-2',
       '5-5', '4-1', '50-0', '33-0', '2-3B', '1-3', '29-4', '29-3', '3-1',
       '43-0', '1-A1', '21-5', '41-0', '35-0', '19-2', '3-2', '4-0',
       '35-1', '36-0', '44-0', '40-6', 'P', '23-2', '40-5', '22-2',
       '23-1', '22-1', '46-0', '20-0', '30-1', '23-3', '40-2', '29-0',
       '23-4', '27-0', '3-0', '29-2', '29-1', '30-0', '32-0', '40-7',
       '21-2', '40-4', '28-0', '19-1', '21-4', '2-6A', '5-3', '1-5',
       '2-6', '1-B2', '1-B1', '2-5', '5-4', '45-0', '29-6', '3-3', '5-2',
       'X1', '3-6', '3-5', '40-0', '40-3', 'S', '48-0', '21-3', '1-U',
       '40-1', '21-1', '2', '28', '46', '26-0', '2-8', '31-0', '26-2',
       '35-2', '42-0', '4', '00', '26-1', '39', '29-5', '21-0', '1-2 ',
       'H  ', '2-0  ', '2-4 ', '2-0 ', '2-4  ', '1-2  ', '5-5 ', '1-C2  ',
       '5-5  ', ' 2-0', ' 1-C1 ', '1-2   ', 'H ', ' 5-5', '2-3A', 