In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [3]:
# Get Training Data and Holdout data
path_to_train_holdout = 'ml_data_train_holdout/'
files = list(Path(path_to_train_holdout).glob('*.parquet'))
print(files)
training_file = files[0]
training_data = pd.read_parquet(training_file)
training_data.head()
#holdout_file = files[1]
#holdout_data = pd.read_parquet(holdout_file)
#holdout_data.head()

[PosixPath('ml_data_train_holdout/train_set.parquet'), PosixPath('ml_data_train_holdout/holdout_set.parquet')]


Unnamed: 0,timestamp,x,y,z,labels,filename
13497564,0.01,0.219971,-2.150879,-1.247314,[SM],aadi_ga_20150123_1.parquet
26846227,0.02,0.124756,-1.658203,-0.735352,[SM],aadi_ga_20150123_1.parquet
6305228,0.04,0.148926,-1.443359,-0.931641,[SM],aadi_ga_20150123_1.parquet
30245674,0.06,0.139893,-1.896484,-1.113281,[SM],aadi_ga_20150123_1.parquet
11300293,0.08,0.358154,-2.125977,-1.261963,[SM],aadi_ga_20150123_1.parquet


In [4]:
print(f'Amount of Training Data: {len(training_data)}')
#print(f'Amount of Holdout Data: {len(holdout_data)}')

Amount of Training Data: 52894345


In [5]:
# Load Ethogram
ethogram = pd.read_csv('ethogram_3.4.csv')
behaviors = pd.DataFrame(set(behavior for sublist in training_data['labels'] for behavior in sublist), columns=['ID']).merge(ethogram, on='ID', how='left')

In [31]:
behaviors.tail()

Unnamed: 0,ID,BEHAVIOUR,BEHAVIOUR DESCRIPTION
161,23-3,Tail wagging fast,"Tail moving side to side quickly, as in excite..."
162,1-A2,(Resting) Stretched out on right side,"Clearly awake, laying on right side, all legs ..."
163,5-5,,
164,21-12,,
165,20-0,Drinking - Unspecified,"Drinking from a water source, any position"


# Basic Random Foreest
Here I will train a very basic Naive Bayes Classifier, I will not use the full training set and I will take a test set from this sample.

In [5]:
sampled_df = training_data.sample(n=600000, random_state=42)
# Because the label is a list value, I'll only take first label for now
def extract_first_label(labels):
    return labels[0]

def is_length_one(label_list):
    return len(label_list) == 1

# Only look at rows with 1 label
sampled_df = sampled_df[sampled_df['labels'].apply(is_length_one)]

sampled_df['labels'] = sampled_df['labels'].apply(extract_first_label)
print(sampled_df.head(10))

# Remove any Nan from dataframe
sampled_df = sampled_df.dropna()
sampled_df = sampled_df[sampled_df.labels != 'NULL']

# Take feature and target label
features = sampled_df[['x', 'y', 'z']]
target = sampled_df['labels']

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train Model
model = RandomForestClassifier()
model.fit(X_train, y_train)
preds = model.predict(X_test)

          timestamp         x         y         z labels  \
16822003    757.530 -0.092773 -0.808838 -0.648926      H   
51747       412.624  0.656250 -0.312500 -0.937500   NULL   
19163301     61.349  0.632813 -0.233643  0.663818   20-0   
14507399    139.959  0.669922 -0.549316  0.293945   21-2   
8940657      15.831  0.247559  1.083740  0.464844   NULL   
24760431    587.120 -1.066162  0.638428  0.100098   33-0   
26120001    375.940 -0.900635 -0.437500  0.177246    1-2   
11692425    320.833 -0.758789  1.049072  0.639160    5-1   
15941543   2245.080 -0.774170 -0.628418 -0.128906    1-1   
12370845    324.754 -0.697510 -0.565186 -0.417725    1-C   

                                       filename  
16822003              mac_ga_20150206_4.parquet  
51747               swade_ga_20150528_1.parquet  
19163301       kiss_drinking_20220504_1.parquet  
14507399    leda_lickingpaws_20221107_1.parquet  
8940657     hardee_urination_20220429_1.parquet  
24760431             aadi_ga_20150123_3

In [6]:
accuracy = accuracy_score(y_test, preds)
print(f'Accuracy: {accuracy:.4f}')
f1 = f1_score(y_test, preds, average='weighted')
print(f'F1-score: {f1:.4f}')

Accuracy: 0.3971
F1-score: 0.3715


In [7]:
f1 = f1_score(y_test, preds, average='weighted')
print(f'F1-score: {f1:.4f}')

F1-score: 0.3715
