### Importing

In [16]:
pip install seaborn scikit-learn matplotlib

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.svm import SVC

SyntaxError: invalid syntax (1037930010.py, line 1)

### Pre-processing

In [13]:
# Load data
training_set = pd.read_parquet("ML_Hackathon/ml_data_train_holdout/holdout_set.parquet")
testing_set = pd.read_parquet("ML_Hackathon/ml_data_train_holdout/train_set.parquet")

# Sample data
sampled_training_set = training_set.sample(n=100000, random_state=42)
sampled_testing_set = testing_set.sample(n=100000, random_state=42)

# Explode labels before any other preprocessing
training_set_exploded = sampled_training_set.explode('labels')
testing_set_exploded = sampled_testing_set.explode('labels')

# Option 1: Drop missing values
# training_set_clean = training_set_exploded.dropna().copy()
# testing_set_clean = testing_set_exploded.dropna().copy()

# Option 2: Impute missing values (using mean strategy as an example)
imputer = SimpleImputer(strategy='mean')
training_set_imputed = imputer.fit_transform(training_set_exploded[['x', 'y', 'z']])
testing_set_imputed = imputer.transform(testing_set_exploded[['x', 'y', 'z']])

# Add the imputed data back to the DataFrame
training_set_exploded[['x', 'y', 'z']] = training_set_imputed
testing_set_exploded[['x', 'y', 'z']] = testing_set_imputed

# Drop duplicates
training_set_clean = training_set_exploded.drop_duplicates()
testing_set_clean = testing_set_exploded.drop_duplicates()

# Define IQR filter function
def remove_outliers(df, columns):
    Q1 = df[columns].quantile(0.25)
    Q3 = df[columns].quantile(0.75)
    IQR = Q3 - Q1
    return df[~((df[columns] < (Q1 - 1.5 * IQR)) | (df[columns] > (Q3 + 1.5 * IQR))).any(axis=1)]

# Remove outliers
training_set_filtered = remove_outliers(training_set_clean, ['x', 'y', 'z'])
testing_set_filtered = remove_outliers(testing_set_clean, ['x', 'y', 'z'])

# Scale the values using the training data scaler
# scaler = StandardScaler()
# training_set_filtered[['x', 'y', 'z']] = scaler.fit_transform(training_set_filtered[['x', 'y', 'z']])
# testing_set_filtered[['x', 'y', 'z']] = scaler.transform(testing_set_filtered[['x', 'y', 'z']])

# Print a sample of the processed datasets
print("Training Set Sample:")
print(training_set_filtered.head())

print("\nTesting Set Sample:")
print(testing_set_filtered.head())

Training Set Sample:
          timestamp         x         y         z labels  \
29359888    120.580 -0.312500  0.468750 -0.765625   1-C2   
560531      561.140  0.593750  0.062500  0.703125    1-2   
1350056     769.570 -0.588867 -0.836914  0.325195    1-1   
28289292    774.946  0.062500 -0.843750 -0.968750    2-4   
38360539   2429.551 -0.940430 -0.395508  0.435791    1-2   

                             filename  
29359888  korra_ga_20150807_1.parquet  
560531      abe_ga_20150505_1.parquet  
1350056   annie_ga_20150227_3.parquet  
28289292   jose_ga_20150528_1.parquet  
38360539  oscar_ga_20150128_1.parquet  

Testing Set Sample:
          timestamp         x         y         z labels  \
16822003    757.530 -0.092773 -0.808838 -0.648926      H   
51747       412.624  0.656250 -0.312500 -0.937500   NULL   
32760826    460.330 -0.593750 -1.140625 -0.531250    2-4   
32760826    460.330 -0.593750 -1.140625 -0.531250   27-0   
19163301     61.349  0.632813 -0.233643  0.663818   20-0 

### Data Exploration

In [14]:
# Count and print unique labels
unique_labels = training_set_filtered['labels'].unique()
print("Unique labels in training set:", unique_labels)

# Count occurrence of each label
label_counts = training_set_filtered['labels'].value_counts()
print("\nLabel counts in training set:")
print(label_counts)

# print each label and count to decide which actions we want
for label, count in label_counts.items():
    print(label, count)

Unique labels in training set: ['1-C2' '1-2' '1-1' '2-4' '1-A2' 'NULL' '40-2' '2-0' '5-5' '29-3' '19-1'
 'P' '1-A1' 'H' '3-2' '36-0' '1-C1' '20-0' '5-1' '3-3' '1-B2' '1-C' '43-0'
 'H ' '21-1D' '1-B1' '5-3' 'SM' '1-B1  ' '3-1' '21-2' '21-1A' '35-1'
 '22-1' '12-B1' '22-2' '35-0' '27-0' '12-C2' '1-C2  ' '29-4' '21-5' '1-U'
 '2-7' '40-6' '19-2' '2-6A' ' 1-C1 ' '33-0' '21-4' '29-1' '1-3' '3-0'
 '42-0' '21-1C' '4-2' '21-1' '12-B2' '1-2 ' '4-0' '29-0' '23-3' '29-2'
 '21-0' '44-0' '23-2' '26-2' '30-1' '46-0' '12-A1' 'Unknown label' '45-0'
 '1-1   ' '2-6' '38-0' '35-2' '21-1B' ' 1-2 ' '1-2  ' '2-5' '2-3B' '42-0 '
 'S' '3-4' '26-1' '23-1' '2-3A' '4-1' '5-2' '20-0 ' 'X1' '26-0' '2-0 '
 '43-1' ' 4-2' '48-0' '2-2' '40-5' '2-4 ' '1-5' '28-0' '1-1 ' '3-5' '2-1'
 '40-1' '2-3' ' 2-0' '00' '5-4' '29-5' '32-0' '30-0' '25-1' '41-0']

Label counts in training set:
NULL    30486
1-2     15578
2-0     10239
2-4      4369
1-1      3565
        ...  
29-5        1
32-0        1
2-3A        1
25-1        1
41-0