In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, StandardScaler, Normalizer, PowerTransformer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from statsmodels.api import OLS, add_constant



# Load the raw postures table from the CSV that sits next to the notebook
Postures = pd.read_csv(filepath_or_buffer="Postures.csv")

def printClassResults(truth, preds, average='micro'):
    """Print classification metrics and display the confusion matrix.

    Parameters
    ----------
    truth : array-like
        Ground-truth class labels.
    preds : array-like
        Predicted class labels, in the same order as ``truth``.
    average : str, default 'micro'
        Averaging mode forwarded to precision, recall and F1.

    Notes
    -----
    Every metric is computed from ``truth``; nothing is read from notebook
    globals, so the function can score any target, not only ``y_test``.
    With micro averaging on a single-label multiclass problem, precision,
    recall and F1 all coincide with accuracy.
    """
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, average=average))
    print("The Recall is: %7.4f"    % recall_score(truth, preds, average=average))
    print("The F1 score is: %7.4f"  % f1_score(truth, preds, average=average))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(truth, preds))
    print()
    print("This is the Confusion Matrix")
    # Rows are true classes, columns are predicted classes
    display(pd.DataFrame(confusion_matrix(truth, preds)))

# 1) Processing the Data Set

In [2]:
# Skip the first row of Postures (it holds only zeros)
df = Postures.iloc[1:]

# Percentage of '?' placeholders per column; only columns that actually
# contain placeholders are reported
missing_pct = {name: (df[name] == '?').mean()*100 for name in df.columns}
for name, pct in missing_pct.items():
    if pct > 0:
        print(f'Proportion of missing values in column {name}: {round(pct,2)}%')

Proportion of missing values in column X3: 0.88%
Proportion of missing values in column Y3: 0.88%
Proportion of missing values in column Z3: 0.88%
Proportion of missing values in column X4: 4.0%
Proportion of missing values in column Y4: 4.0%
Proportion of missing values in column Z4: 4.0%
Proportion of missing values in column X5: 16.68%
Proportion of missing values in column Y5: 16.68%
Proportion of missing values in column Z5: 16.68%
Proportion of missing values in column X6: 33.1%
Proportion of missing values in column Y6: 33.1%
Proportion of missing values in column Z6: 33.1%
Proportion of missing values in column X7: 50.13%
Proportion of missing values in column Y7: 50.13%
Proportion of missing values in column Z7: 50.13%
Proportion of missing values in column X8: 60.86%
Proportion of missing values in column Y8: 60.86%
Proportion of missing values in column Z8: 60.86%
Proportion of missing values in column X9: 69.31%
Proportion of missing values in column Y9: 69.31%
Proportion o

In [3]:
# Collect the columns whose share of '?' placeholders exceeds 80% and drop
# them in a single call
sparse_cols = [name for name in df.columns if (df[name] == '?').mean()*100 > 80]
df = df.drop(columns=sparse_cols)

In [4]:
# Drop the columns in which more than 80% of the values are the '?'
# placeholder. This is a no-op when those columns are already gone, so the
# cell is safe to run on its own or to re-run.
mostly_missing = [col for col in df.columns if (df[col] == '?').mean()*100 > 80]
df = df.drop(columns=mostly_missing)

# Replace every '?' with NaN in one vectorised pass so the values are valid
# for imputation
df = df.replace('?', np.nan)

# Features: everything except the target. Columns that contained '?' were
# read as strings, so convert them to real numbers here instead of relying
# on downstream estimators to coerce object data.
X = df.drop(columns=['Class']).apply(pd.to_numeric)
y = df['Class']

In [5]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=25,
)

In [7]:
# Mean imputation (must come before scaling). The column means are learned
# from the training split only, so no test-set information leaks in.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_train)

# Fill the gaps in both splits with the training means
# (with default settings the results are NumPy arrays, not DataFrames)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)


In [9]:
# Scale with PowerTransformer: parameters are fitted on the training split
# and then applied unchanged to both splits
scaler = PowerTransformer().fit(X_train_imputed)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train_imputed, X_test_imputed)
)

# 2) Tree Based Models

In [15]:
# Record the start time (perf_counter is a monotonic clock meant for
# measuring elapsed intervals)
start_time = time.perf_counter()

# Decision tree with a fixed seed so that ties between equally good splits
# are broken the same way on every run
tree_mdl = DecisionTreeClassifier(random_state=25)

# Hyperparameter tuning using GridSearchCV
parameters = {
    'criterion': ['entropy'],
    'max_depth': [5, 8],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1]
}

grid_search = GridSearchCV(tree_mdl, parameters, scoring='accuracy', cv=5)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# GridSearchCV refits the best configuration on the full training set by
# default, so that fitted estimator is reused instead of training again
final_tree_mdl = grid_search.best_estimator_
preds = final_tree_mdl.predict(X_test_scaled)

# Evaluate the results on the held-out test split
printClassResults(y_test, preds)

# Mean 5-fold accuracy of the winning configuration; the search already
# computed it, so the cross-validation is not run a second time
cv_accuracy = grid_search.best_score_
print("Cross-validated Accuracy:", cv_accuracy)

# Record the end time
end_time = time.perf_counter()

# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time} seconds")


Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 3}
The Accuracy is:  0.8472
The Precision is:  0.8472
The Recall is:  0.8472
The F1 score is:  0.8472
The Matthews correlation coefficient is:  0.8101

This is the Confusion Matrix


Unnamed: 0,0,1,2,3,4
0,4039,8,9,15,1
1,9,3350,35,109,209
2,515,20,3091,452,37
3,72,51,517,2986,78
4,123,352,129,242,3075


Cross-validated Accuracy: 0.8410646500158748
Execution time: 43.633217334747314 seconds


# 3) Linear Models

In [None]:
# Create a Logistic Regression Model with the imputed and scaled training
# data (X_train_scaled is the matrix built in the preprocessing section)
LR = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)

# Present the Bias and the Betas.
# Only index 0 of intercept_ / coef_ is shown; for a multiclass target these
# hold one entry per class, so this is the first class in LR.classes_.
print("The bias is: ",  LR.intercept_[0])
print("The other parameters are: ")
for i, beta in enumerate(LR.coef_[0]):
    print("\t B%02d -> %9.3f"% (i+1, beta))

In [None]:
# Ordinary least squares on the scaled training features.
# add_constant adds the intercept column, so the model is told explicitly
# that a constant is present (hasconst is a None/bool flag).
# NOTE(review): this regresses the class label as if it were a numeric
# quantity - confirm that is the intended analysis.
X_tr = add_constant(X_train_scaled)
mdl = OLS(y_train, X_tr, hasconst=True).fit()
mdl.summary()

In [None]:
# Rank the coefficients in row 0 of LR.coef_ by absolute size, largest
# first, and list the top five. The printed value is the magnitude, so the
# sign is not shown.
coefs = sorted(
    ((abs(weight), position) for position, weight in enumerate(LR.coef_[0])),
    reverse=True,
)
for magnitude, position in coefs[:5]:
    print("\t B%02d -> %9.3f"% (position+1, magnitude))

In [None]:
# Show the Results: score the logistic regression on the scaled test split
preds = LR.predict(X_test_scaled)
printClassResults(y_test, preds)

In [None]:
# Create a Ridge Classifier Model with the imputed and scaled training data
# (X_train_scaled is the matrix built in the preprocessing section)
RC = RidgeClassifier(random_state=0).fit(X_train_scaled, y_train)

# Present the Bias and the Betas.
# Only index 0 of intercept_ / coef_ is shown; for a multiclass target these
# hold one entry per class, so this is the first class in RC.classes_.
print("The bias is: ",  RC.intercept_[0])
print("The other parameters are: ")
for i, beta in enumerate(RC.coef_[0]):
    print("\t B%02d -> %9.3f"% (i+1, beta))

In [None]:
# Rank the coefficients in row 0 of RC.coef_ by absolute size, largest
# first, and list the top five. The printed value is the magnitude, so the
# sign is not shown.
coefs = sorted(
    ((abs(weight), position) for position, weight in enumerate(RC.coef_[0])),
    reverse=True,
)
for magnitude, position in coefs[:5]:
    print("\t B%02d -> %9.3f"% (position+1, magnitude))

In [None]:
# Show the Results: score the ridge classifier on the scaled test split
preds = RC.predict(X_test_scaled)
printClassResults(y_test, preds)

In [None]:
# TODO: The same for Lasso, Ridge and Logistic Regression

# 4) Naive Bayes

In [None]:
# Row/column labels for the confusion matrices (the possible classes)
classes = np.array(['Class 1','Class 2','Class 3','Class 4','Class 5'])

# Create a Gaussian Naive Bayes Model with the scaled data
gnb = GaussianNB()
gnb.fit(X_train_scaled, y_train)

# Present the Results
preds = gnb.predict(X_test_scaled)
print("The Accuracy score is: ", accuracy_score(y_test, preds))
print()
print('Confusion Matrix:')
display(pd.DataFrame(confusion_matrix(y_test, preds), columns=classes, index=classes))


# CategoricalNB expects each feature to be a non-negative integer category
# code, but the power-transformed features are continuous and can be
# negative. Bin every feature into N_BINS ordinal codes first; the bin edges
# are learned on the training split only.
N_BINS = 10
binner = KBinsDiscretizer(n_bins=N_BINS, encode='ordinal', strategy='uniform')
X_train_binned = binner.fit_transform(X_train_scaled).astype(int)
X_test_binned = binner.transform(X_test_scaled).astype(int)

# Create a Categorical Naive Bayes Model with the binned data.
# min_categories reserves a slot for every possible bin, so a bin that is
# empty in the training split but present in the test split cannot break
# prediction.
cnb = CategoricalNB(min_categories=N_BINS)
cnb.fit(X_train_binned, y_train)

# Present the Results
preds = cnb.predict(X_test_binned)
print("The Accuracy score is: ", accuracy_score(y_test, preds))
print()
print('Confusion Matrix:')
display(pd.DataFrame(confusion_matrix(y_test, preds), columns=classes, index=classes))

# 5) K-Nearest Neighbours

In [None]:
# 3-nearest-neighbours classifier on the imputed and scaled features
# (X_train_scaled / X_test_scaled come from the preprocessing section)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)

preds = knn.predict(X_test_scaled)

# Present the Results
accuracy = accuracy_score(y_test, preds)
print("Accuracy:", accuracy)

# TODO: should try to make plots