### Imports

In [9]:
import pandas as pd
import numpy as np
from sklearn import metrics
import seaborn as sns
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import precision_score, recall_score, f1_score,matthews_corrcoef, confusion_matrix, accuracy_score

### Functions

In [10]:
def scale01(x):
    return (x - x.min())/(x.max() - x.min())

def unscale01(x, lower, upper):
    return (x * upper) - ((x - 1.0) * lower)


def ClassResults(truth, preds, average='weighted'):
    print()
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, 
                                                      average=average, 
                                                      zero_division=1))
    print("The Recall is: %7.4f" % recall_score(truth, preds, average=average))
    print("The F1 score is: %7.4f" % f1_score(truth, preds, average=average))
    print("The Matthews correlation coefficient is: %7.4f" % 
          matthews_corrcoef(truth,preds))
   



### Data processing

In [11]:
df = pd.read_csv("../data/train.csv", sep=";")
df["activity_bool"] = df.activity.astype('boolean')

data = np.array(df.values[: , 2:], dtype = float)   # Pandas dtype = object, logo tudo é permitido
(N, d) = data.shape

print (N,"x",d)

df

120618 x 8


Unnamed: 0,date,time,activity,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,activity_bool
0,01/07/23,13:51:15:847724020,0,0.2650,-0.7814,-0.0076,-0.0590,0.0325,-2.9296,False
1,01/07/23,13:51:16:246945023,0,0.6722,-1.1233,-0.2344,-0.1757,0.0208,0.1269,False
2,01/07/23,13:51:16:446233987,0,0.4399,-1.4817,0.0722,-0.9105,0.1063,-2.4367,False
3,01/07/23,13:51:16:646117985,0,0.3031,-0.8125,0.0888,0.1199,-0.4099,-2.9336,False
4,01/07/23,13:51:16:846738994,0,0.4814,-0.9312,0.0359,0.0527,0.4379,2.4922,False
...,...,...,...,...,...,...,...,...,...,...
120613,10/08/23,15:9:9:183413982,0,-0.4990,-0.8191,-0.2748,0.8913,2.5946,-0.3912,False
120614,10/08/23,15:9:9:383414983,0,-0.3644,-1.2275,-0.3136,-1.0882,-0.8800,0.3812,False
120615,10/08/23,15:9:9:583639025,0,-0.4661,-1.3076,-0.1409,-1.4251,0.7642,1.3138,False
120616,10/08/23,15:9:9:783478021,0,-0.2795,-0.6653,-0.0531,-0.8411,-0.3201,1.1435,False


In [12]:
df.columns.to_list()

['date',
 'time',
 'activity',
 'acceleration_x',
 'acceleration_y',
 'acceleration_z',
 'gyro_x',
 'gyro_y',
 'gyro_z',
 'activity_bool']

In [13]:
#split the data into training and testing sets

X= df[['acceleration_x',
 'acceleration_y',
 'acceleration_z',
 'gyro_x',
 'gyro_y',
 'gyro_z',]]
y= df['activity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, 
                                                    random_state=42)


# # Scale the training data
# X_train_scaled = scale01(X_train)

# # Scale the testing data
# X_test_scaled = scale01(X_test)

# Data preprocessing
scaler = StandardScaler()

# Scale the data
X_train_scaled = scaler(X_train)
X_test_scaled = scale01(X_test)

#scale the data
# scaler = StandardScaler()
# scaler.fit(X_train.iloc[:, 0:23])
# Xs_train = scaler.transform(X_train.iloc[:, 0:23])
# Xs_test = scaler.transform(X_test.iloc[:, 0:23])


#scale the data
# scaler = StandardScaler()
# scaledX = scaler.fit_transform(X)
# print(X)
# print(df)

#scale the data

#test to see if scaler works
# minv = data.min(0)
# maxv = data.max(0)

# print(data.min(0))
# print(data.max(0))

#Revert the scale 
# data2 = data.copy()
# for var in range(1, 7):
#    data2[:, var] = unscale01(data[:, var], minv[var], maxv[var])

# print(data2.min(0))
# print(data2.max(0))





### Random Florest

In [14]:
print("_______ Random Florest _______")

# RF= RandomForestClassifier(n_estimators = 10, criterion = 'gini') # criterion = 'entropy'
# RF.fit(X_train_scaled, y_train)

# ClassResults(y_test, preds)

from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest Classifier
random_forest = RandomForestClassifier(n_estimators=10, criterion = 'gini')

# Train the model using the scaled training data
random_forest.fit(X_train_scaled, y_train)

# Predict on the scaled test data
predictions_rf = random_forest.predict(X_test_scaled)

# Evaluate the Random Forest model using the ClassResults function or other evaluation metrics
ClassResults(y_test, predictions_rf)





_______ Random Florest _______

The Accuracy is:  0.6255
The Precision is:  0.8056
The Recall is:  0.6255
The F1 score is:  0.6010
The Matthews correlation coefficient is:  0.4400


### KNN

In [15]:
print("_______ KNN _______")
# Create a KNN Classifier (for example, with k=5)
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model using the scaled training data
knn.fit(X_train_scaled, y_train)

# Predict on the scaled test data
predictions_knn = knn.predict(X_test_scaled)

# Evaluate the KNN model using the ClassResults function or other evaluation metrics
ClassResults(y_test, predictions_knn)



_______ KNN _______

The Accuracy is:  0.8298
The Precision is:  0.8797
The Recall is:  0.8298
The F1 score is:  0.8308
The Matthews correlation coefficient is:  0.7077


### SVM

In [16]:
print("_______ SVM _______")
svm_classifier = SVC(kernel='rbf', random_state=42)

# Train the model using the scaled training data
svm_classifier.fit(X_train_scaled, y_train)

# Predict on the scaled test data
predictions_svm = svm_classifier.predict(X_test_scaled)

# Evaluate the SVM model using the ClassResults function or other evaluation metrics
ClassResults(y_test, predictions_svm)

_______ SVM _______

The Accuracy is:  0.7449
The Precision is:  0.8441
The Recall is:  0.7449
The F1 score is:  0.7415
The Matthews correlation coefficient is:  0.5921
