In [2]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from statsmodels.api import OLS, add_constant



# Load the raw data set from a CSV file located relative to the notebook's working directory.
# Later cells treat the literal string '?' in this frame as the missing-value marker.
Postures = pd.read_csv("Postures.csv")

# 1) Processing the Data Set

In [3]:
# Eliminate first instance of Postures (all 0's).
# The explicit .copy() makes `df` an independent frame: without it `df` is a slice of
# `Postures`, and the later in-place replacement of '?' by NaN would write through a
# slice and raise pandas' SettingWithCopyWarning.
df = Postures.iloc[1:].copy()

# Percentage of '?' (missing) entries per column, computed for the whole frame at once
missing_pct = df.eq('?').mean() * 100

# Report only the columns that actually contain missing values
for col, proportion in missing_pct[missing_pct > 0].items():
    print(f'Proportion of missing values in column {col}: {round(proportion,2)}%')

Proportion of missing values in column X3: 0.88%
Proportion of missing values in column Y3: 0.88%
Proportion of missing values in column Z3: 0.88%
Proportion of missing values in column X4: 4.0%
Proportion of missing values in column Y4: 4.0%
Proportion of missing values in column Z4: 4.0%
Proportion of missing values in column X5: 16.68%
Proportion of missing values in column Y5: 16.68%
Proportion of missing values in column Z5: 16.68%
Proportion of missing values in column X6: 33.1%
Proportion of missing values in column Y6: 33.1%
Proportion of missing values in column Z6: 33.1%
Proportion of missing values in column X7: 50.13%
Proportion of missing values in column Y7: 50.13%
Proportion of missing values in column Z7: 50.13%
Proportion of missing values in column X8: 60.86%
Proportion of missing values in column Y8: 60.86%
Proportion of missing values in column Z8: 60.86%
Proportion of missing values in column X9: 69.31%
Proportion of missing values in column Y9: 69.31%
Proportion o

In [4]:
# Discard every variable in which more than 80% of the entries are the '?' missing marker.
# The columns to remove are collected first and then dropped in a single call.
sparse_cols = [name for name in df.columns if (df[name] == '?').mean() * 100 > 80]
df = df.drop(columns=sparse_cols)

In [5]:
# WARNING: For testing purposes, only work with a small sub-set of the original Data Set
#          Should be replaced for the whole Data Set in the act of Delivery
N_ROWS = 10_000

# Positional range of the columns used as model inputs (everything from position 1 on;
# the target is read separately from the 'Class' column below).
FEATURE_COLS = slice(1, 38)

# Turn the '?' markers into NaN and make every feature column numeric, so that the
# values are valid for Imputation. pd.to_numeric(..., errors='coerce') maps '?' (and any
# other non-numeric token) to NaN and converts numeric strings to real numbers.
# `df` is re-bound to a new frame rather than written through `.loc`, which avoids a
# SettingWithCopyWarning when `df` is still a slice of `Postures`.
feature_names = df.columns[FEATURE_COLS]
df = df.assign(**{name: pd.to_numeric(df[name], errors='coerce') for name in feature_names})

# Consider the User variable as categorical (even though it is numeric)
# NOTE(review): this cast is applied to `Postures`, not to `df`, so it does not reach the
# features extracted below ('User' is still fed to the imputer as a plain number).
# Kept as-is for backward compatibility; confirm the intended frame.
Postures['User'] = Postures['User'].astype('category')

# Instantiate a KNN Imputer
imputer = KNNImputer(n_neighbors=2, weights="uniform")

# Extract from the Data Set the X and Y
X = df.iloc[:N_ROWS, FEATURE_COLS].to_numpy(dtype=float)
y = df['Class'].to_numpy()[:N_ROWS]

# Acquire a new DataFrame with Imputed Values
# NOTE(review): the imputer is fitted on all selected rows before the train/test split
# performed in later cells, so test rows influence the imputed training values.
Xt = pd.DataFrame(imputer.fit_transform(X))

# 2) Tree Based Models

In [30]:
# Single seed shared by the split and by every tree, so that re-running this cell
# reproduces the same hyperparameters and scores.
SEED = 25

# Record the start time (perf_counter is the clock intended for measuring elapsed intervals)
start_time = time.perf_counter()


# Feature scaling (not strictly necessary for decision trees, but good practice for other algorithms)
# NOTE(review): the scaler is fitted on all rows before the split below; later cells reuse
# X_train / X_test produced here.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(Xt)

# Divide the whole Set into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=SEED)

# Create a Decision Tree Model with the data
tree_mdl = DecisionTreeClassifier(random_state=SEED)

# Hyperparameter tuning using GridSearchCV
parameters = {
    'criterion': ['entropy'],
    'max_depth': [5, 7, 9],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1]
}

grid_search = GridSearchCV(tree_mdl, parameters, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best hyperparameters to create the final model
final_tree_mdl = DecisionTreeClassifier(random_state=SEED, **best_params)
final_tree_mdl.fit(X_train, y_train)
preds = final_tree_mdl.predict(X_test)

# Evaluate the results
# Note: with average='micro' on a single-label multiclass problem, precision, recall and
# F1 all coincide with plain accuracy, which is why the three printed values are equal.
print("The Precision is: %7.4f" % precision_score(y_test, preds, average='micro'))
print("The Recall is: %7.4f" % recall_score(y_test, preds, average='micro'))
print("The F1 score is: %7.4f" % f1_score(y_test, preds, average='micro'))
print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(y_test, preds))
print()
print("This is the Confusion Matrix")
print(pd.DataFrame(confusion_matrix(y_test, preds)))

# Cross-validated accuracy (on the training split only)
cv_accuracy = cross_val_score(final_tree_mdl, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validated Accuracy:", np.mean(cv_accuracy))


# Record the end time
end_time = time.perf_counter()

# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time} seconds")

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 2}
The Precision is:  0.9488
The Recall is:  0.9488
The F1 score is:  0.9488
The Matthews correlation coefficient is:  0.9350

This is the Confusion Matrix
     0    1    2    3    4
0  718    2    3    2    1
1    3  334    0    0   15
2   13    7  409    4   11
3    1    1    4  438   15
4    5   20   10   11  473
Cross-validated Accuracy: 0.9363999999999999
Execution time: 7.5226218700408936 seconds


# 3) Linear Models

In [None]:
# Fit a least-squares Linear Regression on the training split
reg = LinearRegression()
reg.fit(X_train, y_train)

# Report the intercept, then one numbered coefficient per input feature
print("The bias is: ",  reg.intercept_)
print("The other parameters are: ")
for position, coefficient in enumerate(reg.coef_, start=1):
    print("\t B%d -> %9.3f"% (position, coefficient))

In [None]:
# Scatter the regression predictions against the true test targets
preds = reg.predict(X_test)

fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(preds, y_test)
ax.set_xlabel('Predictions')
ax.set_ylabel('Testing Set')
ax.grid()
plt.show()

In [None]:
# Fit the same linear model with statsmodels to obtain the full regression summary.
# add_constant prepends the intercept column, so the design matrix already contains a
# constant; `hasconst` is a boolean flag and is therefore passed as True (the previous
# value 12 was merely truthy and obscured the intent).
X_tr = add_constant(X_train)
mdl = OLS(y_train, X_tr, hasconst=True).fit()
mdl.summary()

In [None]:
# TODO: Do the same for Lasso, Ridge and Logistic Regression

# 4) Naive Bayes

In [None]:
# Scale the data so it can be used in Naive Bayes Models
scaler = MinMaxScaler() #Maybe use different Scalers (See info in TP06)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Class labels actually present in the data; they fix the row/column order of the
# confusion matrices so that the display names below always line up with the matrix.
labels = np.unique(np.concatenate([y_train, y_test]))

#Column Names (a.k.a Possible Classes)
classes = np.array([f'Class {label}' for label in labels])


def show_results(y_true, y_pred):
    """Print the accuracy and display the labelled confusion matrix of a set of predictions.

    Parameters
    ----------
    y_true : array-like
        True class labels of the evaluated samples.
    y_pred : array-like
        Class labels predicted for the same samples.
    """
    print("The Accuracy score is: ", accuracy_score(y_true, y_pred))
    print()
    print('Confusion Matrix:')
    display(pd.DataFrame(confusion_matrix(y_true, y_pred, labels=labels), columns=classes, index=classes))


# Create a Gaussian Naive Bayes Model with the scaled data
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Present the Results
preds = gnb.predict(X_test)
show_results(y_test, preds)


# Create a Categorical Naive Bayes Model.
# CategoricalNB expects every feature to be an ordinal-encoded category (0, 1, ..., n-1),
# so the continuous scaled features are first discretized into N_BINS bins. The bin edges
# are learned on the training split only; test values outside the training range fall
# into the first/last bin.
N_BINS = 10
discretizer = KBinsDiscretizer(n_bins=N_BINS, encode='ordinal', strategy='uniform')
X_train_binned = discretizer.fit_transform(X_train).astype(int)
X_test_binned = discretizer.transform(X_test).astype(int)

# min_categories guarantees that every bin is a known category for every feature, even
# when a bin never occurs in the training split.
cnb = CategoricalNB(min_categories=N_BINS)
cnb.fit(X_train_binned, y_train)

# Present the Results
preds = cnb.predict(X_test_binned)
show_results(y_test, preds)

# 5) K-Nearest Neighbours

In [None]:
# Re-split the imputed (unscaled) features so this section can apply its own Scaler
X_train, X_test, y_train, y_test = train_test_split(
    Xt, y, test_size=0.25, random_state=25
)

# Standardise the features for K-Nearest Neighbours: the Scaler learns its statistics
# from the training split and is then applied to both splits
scaler = StandardScaler().fit(X_train) #Maybe use different Scalers (See info in TP06)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train a 3-nearest-neighbours classifier and predict the test split
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
preds = knn.predict(X_test)

# Present the Results
accuracy = accuracy_score(y_test, preds)
print("Accuracy:", accuracy)

# TODO: Should try to make plots