In [26]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , balanced_accuracy_score, f1_score, roc_auc_score
import time

In [27]:
df = pd.read_csv('data/dataFile.csv')
df

Unnamed: 0,seconds_elapsed,z_accelerometer,y_accelerometer,x_accelerometer,z_gravity,y_gravity,x_gravity,z_gyro,y_gyro,x_gyro,...,quaternionX_wristMotion,quaternionY_wristMotion,quaternionZ_wristMotion,z_magnetometer,y_magnetometer,x_magnetometer,user,relativeAltitude_barometer,pressure_barometer,magneticBearing_compass
0,-0.570446,,,,,,,,,,...,,,,,,,2,0.0,1014.732361,
1,-0.541341,,,,,,,,,,...,,,,,,,1,0.0,1016.061707,
2,-0.523916,,,,,,,,,,...,,,,,,,1,0.0,1015.169830,
3,-0.522268,,,,,,,,,,...,,,,,,,2,0.0,1014.458237,
4,-0.521896,,,,,,,,,,...,,,,,,,1,0.0,1015.913239,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170837,2906.188554,,,,,,,,,,...,0.186314,0.367319,0.485740,,,,1,,,
1170838,2906.198618,,,,,,,,,,...,0.186659,0.373277,0.500053,,,,1,,,
1170839,2906.208682,,,,,,,,,,...,0.186206,0.379189,0.513737,,,,1,,,
1170840,2906.218746,,,,,,,,,,...,0.185107,0.385247,0.526691,,,,1,,,


#### Defining the target and the features the model will use 

In [28]:
target = 'user'
features = df.columns.to_list()
features.remove('user')


In [29]:
# Convert time from datetime to unix, so the model can use it
df['seconds_elapsed'] = pd.to_datetime(df['seconds_elapsed']).astype('int64') // 10**9

X = df[features]
y= df[target]


# Split the data into train and test subsets
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=42, stratify=y, test_size=0.3)

#### Now, we build our pipeline

In [30]:
# Separate categorical and numerical columns
from sklearn.impute import SimpleImputer


categorical_cols = X.select_dtypes(exclude=['number']).columns.to_list()
numerical_cols = X.select_dtypes(include=['number']).columns.to_list()


# Pipeline for the categorical data
categorical_pipeline = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')) 
])

# Pipeline for the numeric data
numeric_pipeline = Pipeline(steps=[
    ('imp', SimpleImputer()),
    ('scaler', MinMaxScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ('categorical', categorical_pipeline, categorical_cols),
    ('numerical', numeric_pipeline ,numerical_cols)
    ], remainder='passthrough')



In [32]:
# Assemble the preprocessor and the model together using a Pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(random_state=42))
    ])
    
# Train the model 
model.fit(X_train, y_train)

# Get the predictions
y_pred = model.predict(X_test)

# Get the performance scores
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
bcc = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

print("Accuracy: ", acc)
print("Balanced Accuracy Score ", bcc)
print("F1 Score ", f1)

Accuracy:  0.9776571303305595
Balanced Accuracy Score  0.977936659641225
F1 Score  0.9776571303305595
