In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.api import OLS, add_constant
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef



Postures = pd.read_csv("Postures.csv")

def printClassResults(truth, preds):
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(y_test, preds,average='micro'))
    print("The Recall is: %7.4f"    % recall_score(y_test, preds,average='micro'))
    print("The F1 score is: %7.4f"  % f1_score(y_test, preds,average='micro'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(y_test, preds))
    print()
    print("This is the Confusion Matrix")
    display(pd.DataFrame(confusion_matrix(truth, preds)))

# 1) Processing the Data Set

In [2]:
# Eliminate first instance of Postures (all 0's) 
df = Postures.iloc[1:]

for col in df.columns:
    proportion = (df[col] == '?').mean()*100
    if proportion > 0:
        print(f'Proportion of missing values in column {col}: {round(proportion,2)}%')

Proportion of missing values in column X3: 0.88%
Proportion of missing values in column Y3: 0.88%
Proportion of missing values in column Z3: 0.88%
Proportion of missing values in column X4: 4.0%
Proportion of missing values in column Y4: 4.0%
Proportion of missing values in column Z4: 4.0%
Proportion of missing values in column X5: 16.68%
Proportion of missing values in column Y5: 16.68%
Proportion of missing values in column Z5: 16.68%
Proportion of missing values in column X6: 33.1%
Proportion of missing values in column Y6: 33.1%
Proportion of missing values in column Z6: 33.1%
Proportion of missing values in column X7: 50.13%
Proportion of missing values in column Y7: 50.13%
Proportion of missing values in column Z7: 50.13%
Proportion of missing values in column X8: 60.86%
Proportion of missing values in column Y8: 60.86%
Proportion of missing values in column Z8: 60.86%
Proportion of missing values in column X9: 69.31%
Proportion of missing values in column Y9: 69.31%
Proportion o

In [3]:
#removing the variables with a proportion of missing values more than 80% 
for col in df.columns:
    proportion = (df[col] == '?').mean()*100
    if proportion > 80:
        df=df.drop(col, axis=1)

In [4]:
#removing the variables with a proportion of missing values more than 80% 
for col in df.columns:
    proportion = (df[col] == '?').mean()*100
    if proportion > 80:
        df=df.drop(col, axis=1)
        
# Replace all '?' to NaN, so that the values are valid for Imputation
for col in df.columns:
    df.loc[df[col] == '?', col] = np.nan
        
X= df.drop(columns =[ 'Class' ]) 
y=df['Class']  

In [5]:
# Divide the whole Set into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=25)

In [6]:
# Scaling the data
scaler   = PowerTransformer()
scaler.fit(X_train)
Xt_train = scaler.transform(X_train)
Xt_test  = scaler.transform(X_test)

In [7]:
# Instatiate a Simple Imputater
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    
# Acquire a new DataFrame with Imputated Values
imputer.fit(Xt_train)
Xt_train = imputer.transform(Xt_train)
Xt_test  = imputer.transform(Xt_test)

# 2) Tree Based Models

In [8]:
import time

# Record the start time
start_time = time.time()

# Create a Decision Tree Model with the data
tree_mdl = DecisionTreeClassifier()

# Hyperparameter tuning using GridSearchCV (taking ~53 seconds)
parameters = {
    'criterion': ['entropy'],
    'max_depth': [5, 7, 9],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1]
}

grid_search = GridSearchCV(tree_mdl, parameters, scoring='accuracy', cv=5)
grid_search.fit(Xt_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best hyperparameters to create the final model
final_tree_mdl = DecisionTreeClassifier(**best_params)
final_tree_mdl.fit(Xt_train, y_train)
preds = final_tree_mdl.predict(Xt_test)

# Evaluate the results
print("The Precision is: %7.4f" % precision_score(y_test, preds, average='micro'))
print("The Recall is: %7.4f" % recall_score(y_test, preds, average='micro'))
print("The F1 score is: %7.4f" % f1_score(y_test, preds, average='micro'))
print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(y_test, preds))
print()
print("This is the Confusion Matrix")
print(pd.DataFrame(confusion_matrix(y_test, preds)))

# Cross-validated accuracy
cv_accuracy = cross_val_score(final_tree_mdl, Xt_train, y_train, cv=5, scoring='accuracy')
print("Cross-validated Accuracy:", np.mean(cv_accuracy))


# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time} seconds")

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 3}
The Precision is:  0.8804
The Recall is:  0.8804
The F1 score is:  0.8804
The Matthews correlation coefficient is:  0.8510

This is the Confusion Matrix
      0     1     2     3     4
0  4041     5     9    16     1
1     5  3404    23    60   220
2   207     6  3435   436    31
3    34    19   405  3181    65
4   116   293    85   300  3127
Cross-validated Accuracy: 0.8786087817408952
Execution time: 69.6269166469574 seconds


# 3) Linear Models

In [9]:
# Create a Logistic Regression Model with the data
LR = LogisticRegression(random_state=0).fit(Xt_train, y_train)

# Present the Bias and the Betas
print("The bias is: ",  LR.intercept_[0])
print("The other parameters are: ")
for i, beta in enumerate(LR.coef_[0]):
    print("\t B%02d -> %9.3f"% (i+1, beta))

The bias is:  -0.46798649582440655
The other parameters are: 
	 B01 ->    -0.032
	 B02 ->    -0.991
	 B03 ->    -1.123
	 B04 ->    -0.517
	 B05 ->    -0.833
	 B06 ->    -1.188
	 B07 ->    -0.448
	 B08 ->    -0.754
	 B09 ->    -1.262
	 B10 ->    -0.231
	 B11 ->    -0.733
	 B12 ->    -1.419
	 B13 ->     0.053
	 B14 ->    -0.704
	 B15 ->    -1.421
	 B16 ->     0.092
	 B17 ->    -0.641
	 B18 ->    -1.595
	 B19 ->     0.400
	 B20 ->    -0.409
	 B21 ->    -1.493
	 B22 ->     0.738
	 B23 ->    -0.127
	 B24 ->    -0.547
	 B25 ->     0.278
	 B26 ->     0.340
	 B27 ->    -0.711
	 B28 ->     0.111
	 B29 ->     0.612
	 B30 ->    -0.562
	 B31 ->     0.119


In [10]:
X_tr = add_constant(Xt_train)
mdl=OLS(y_train,X_tr, hasconst=12).fit()
mdl.summary()

0,1,2,3
Dep. Variable:,Class,R-squared:,0.274
Model:,OLS,Adj. R-squared:,0.273
Method:,Least Squares,F-statistic:,711.5
Date:,"Sun, 19 Nov 2023",Prob (F-statistic):,0.0
Time:,17:17:38,Log-Likelihood:,-94342.0
No. Observations:,58571,AIC:,188700.0
Df Residuals:,58539,BIC:,189000.0
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9837,0.005,595.911,0.000,2.974,2.993
x1,-0.0062,0.005,-1.163,0.245,-0.017,0.004
x2,0.1826,0.006,32.846,0.000,0.172,0.193
x3,0.1527,0.006,24.684,0.000,0.141,0.165
x4,0.0811,0.007,10.943,0.000,0.067,0.096
x5,0.1020,0.006,18.138,0.000,0.091,0.113
x6,0.1864,0.006,29.574,0.000,0.174,0.199
x7,0.0670,0.008,8.543,0.000,0.052,0.082
x8,0.0594,0.006,10.437,0.000,0.048,0.071

0,1,2,3
Omnibus:,455.843,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,367.666
Skew:,0.122,Prob(JB):,1.45e-80
Kurtosis:,2.698,Cond. No.,5.67


In [11]:
coefs=[(abs(beta),i) for i, beta in enumerate(LR.coef_[0])]
coefs.sort()
coefs.reverse()
for beta, i in coefs[:5]:
    print("\t B%02d -> %9.3f"% (i+1, beta))

	 B18 ->     1.595
	 B21 ->     1.493
	 B15 ->     1.421
	 B12 ->     1.419
	 B09 ->     1.262


In [12]:
# Show the Results
preds = LR.predict(Xt_test)
printClassResults(y_test, preds)

The Accuracy is:  0.7707
The Precision is:  0.7707
The Recall is:  0.7707
The F1 score is:  0.7707
The Matthews correlation coefficient is:  0.7136

This is the Confusion Matrix


Unnamed: 0,0,1,2,3,4
0,3508,9,252,164,139
1,40,3270,58,76,268
2,390,7,3133,515,70
3,266,188,486,2359,405
4,123,495,125,400,2778


In [13]:
# Create a Ridge Classifier Model with the data
RC = RidgeClassifier(random_state=0).fit(Xt_train, y_train)

# Present the Bias and the Betas
print("The bias is: ",  RC.intercept_[0])
print("The other parameters are: ")
for i, beta in enumerate(RC.coef_[0]):
    print("\t B%02d -> %9.3f"% (i+1, beta))

The bias is:  -0.5836506120776492
The other parameters are: 
	 B01 ->     0.003
	 B02 ->    -0.191
	 B03 ->    -0.141
	 B04 ->    -0.116
	 B05 ->    -0.140
	 B06 ->    -0.145
	 B07 ->    -0.086
	 B08 ->    -0.109
	 B09 ->    -0.133
	 B10 ->    -0.056
	 B11 ->    -0.099
	 B12 ->    -0.144
	 B13 ->    -0.014
	 B14 ->    -0.082
	 B15 ->    -0.133
	 B16 ->    -0.001
	 B17 ->    -0.042
	 B18 ->    -0.147
	 B19 ->     0.046
	 B20 ->     0.006
	 B21 ->    -0.152
	 B22 ->     0.121
	 B23 ->     0.025
	 B24 ->    -0.077
	 B25 ->     0.080
	 B26 ->     0.019
	 B27 ->    -0.140
	 B28 ->     0.034
	 B29 ->     0.005
	 B30 ->    -0.131
	 B31 ->     0.003


In [14]:
coefs=[(abs(beta),i) for i, beta in enumerate(RC.coef_[0])]
coefs.sort()
coefs.reverse()
for beta, i in coefs[:5]:
    print("\t B%02d -> %9.3f"% (i+1, beta))

	 B02 ->     0.191
	 B21 ->     0.152
	 B18 ->     0.147
	 B06 ->     0.145
	 B12 ->     0.144


In [15]:
# Show the Results
preds = RC.predict(Xt_test)
printClassResults(y_test, preds)

The Accuracy is:  0.7214
The Precision is:  0.7214
The Recall is:  0.7214
The F1 score is:  0.7214
The Matthews correlation coefficient is:  0.6554

This is the Confusion Matrix


Unnamed: 0,0,1,2,3,4
0,3711,53,150,1,157
1,31,3085,44,104,448
2,868,134,2742,314,57
3,407,442,627,1763,465
4,201,658,104,174,2784


In [16]:
# To Do: The same for Lasso, Ridge aand Logistic Regression

# 4) Naive Bayes

In [19]:
#Column Names (a.k.a Possible Classes)
classes = np.array(['Class 1','Class 2','Class 3','Class 4','Class 5'])

# Create a Gaussian Naive Bayes Model with the scaled data
gnb=GaussianNB()
gnb.fit(Xt_train, y_train)

# Present the Results
preds=gnb.predict(Xt_test)
print("The Accuracy score is: ", accuracy_score(y_test, preds))
print()
print('Confusion Matrix:')
display(pd.DataFrame(confusion_matrix(y_test, preds), columns=classes, index=classes))


# Create a Categorical Naive Bayes Model with the scaled data
cnb=CategoricalNB()
cnb.fit(Xt_train,y_train)

# Present the Results
preds=cnb.predict(Xt_test)
print("The Accuracy score is: ", accuracy_score(y_test, preds))
print()
print('Confusion Matrix:')
display(pd.DataFrame(confusion_matrix(y_test, preds), columns=classes, index=classes))

The Accuracy score is:  0.5098340503995082

Confusion Matrix:


Unnamed: 0,Class 1,Class 2,Class 3,Class 4,Class 5
Class 1,0,9,4063,0,0
Class 2,0,3148,97,76,391
Class 3,0,4,4092,19,0
Class 4,0,102,3159,391,52
Class 5,0,568,550,480,2323


ValueError: Negative values in data passed to CategoricalNB (input X)

# 5) K-Nearest Neighbours

In [18]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(Xt_train, y_train)

preds = knn.predict(Xt_test)

# Present the Results
accuracy = accuracy_score(y_test, preds)
print("Accuracy:", accuracy)

#Sholud try to make plots 

Accuracy: 0.9383835279655808
