In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from statsmodels.api import OLS, add_constant

from sklearn.impute import KNNImputer, SimpleImputer

from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, PowerTransformer

from sklearn.linear_model import RidgeClassifier, LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB, CategoricalNB

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef


Postures = pd.read_csv("Postures.csv")

def printClassResults(truth, preds):
    
    #Column Names (a.k.a Possible Classes)
    classes = np.array(['Class 1','Class 2','Class 3','Class 4','Class 5'])
    
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(y_test, preds,average='micro'))
    print("The Recall is: %7.4f"    % recall_score(y_test, preds,average='micro'))
    print("The F1 score is: %7.4f"  % f1_score(y_test, preds,average='micro'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(y_test, preds))
    print()
    print("This is the Confusion Matrix")
    n_cols = len(confusion_matrix(truth, preds))
    cm = pd.DataFrame(confusion_matrix(truth, preds), columns=classes[0:n_cols], index=classes[0:n_cols])
    display(cm)

# 1) Processing the Data Set

In [2]:
# Eliminate first instance of Postures (all 0's) 
df = Postures.iloc[1:]

#Removing the variables with a proportion of missing values more than 80% 
for col in df.columns:
    proportion = (df[col] == '?').mean()*100
    if proportion > 80:
        df=df.drop(col, axis=1)
        
        
# Replace all '?' to NaN, so that the values are valid for Imputation
for col in df.columns:
    df.loc[df[col] == '?', col] = np.nan
    
    
# Extract from the Data Set the X and Y
# WARNING: For testing purposes, only work with a small sub-set of the original Data Set
#         Should be replaced for the whole Data Set in the act of Delivery

X= df.values[0:10000,1:len(df.columns)]    
y= df['Class'].values[0:10000]

# Instatiate a KNN Imputater
imputer = KNNImputer(n_neighbors=2, weights="uniform")

# Acquire a new DataFrame with Imputated Values 
Xt=imputer.fit_transform(X)
    


# Divide the whole Set into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.25, random_state=25)

# 4) Naive Bayes

In [4]:
#Negative values cannot be passed to CategoricalNB, sowe must use a scaler that doesn't produce negative values
# Scale the data so it can be used in Naive Bayes Models
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create a Gaussian Naive Bayes Model with the scaled data
gnb=GaussianNB()
gnb.fit(X_train, y_train)

# Present the Results
preds=gnb.predict(X_test)
print()
print('################################ Gaussian Naive Bayes #########################################')
print()
printClassResults(y_test, preds)
print()

#########################################################################
print()
print('################################ Categorical Naive Bayes #########################################')
print()
alphas = [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100, 1000, 10000]
best_alpha = -1
best_alpha_value = -1

for alpha in alphas:
    cnb=CategoricalNB(alpha=alpha)
    cnb=cnb.fit(X_train,y_train)
    preds=cnb.predict(X_test)
    a = accuracy_score(y_test, preds)
    if a > best_alpha_value:
        best_alpha_value = a
        best_alpha = alpha
    print("Alpha:", alpha, "| accuracy_score:", a)

print()        
print('Best Alpha:', best_alpha, "Best Alpha Accuracy:", best_alpha_value)
print()
#################################################################

# Create a Categorical Naive Bayes Model with the scaled data
cnb=CategoricalNB(alpha=best_alpha)
cnb.fit(X_train,y_train)

# Present the Results
preds=cnb.predict(X_test)
printClassResults(y_test, preds)


################################ Gaussian Naive Bayes #########################################

The Accuracy is:  0.7252
The Precision is:  0.7252
The Recall is:  0.7252
The F1 score is:  0.7252
The Matthews correlation coefficient is:  0.6872

This is the Confusion Matrix


Unnamed: 0,Class 1,Class 2,Class 3,Class 4,Class 5
Class 1,235,4,348,72,67
Class 2,0,344,4,0,4
Class 3,1,19,400,12,12
Class 4,0,7,2,438,12
Class 5,0,51,52,20,396




################################ Categorical Naive Bayes #########################################

Alpha: 0.0001 | accuracy_score: 0.3016
Alpha: 0.001 | accuracy_score: 0.3016
Alpha: 0.01 | accuracy_score: 0.3016
Alpha: 0.1 | accuracy_score: 0.3016
Alpha: 1.0 | accuracy_score: 0.3016
Alpha: 10 | accuracy_score: 0.3012
Alpha: 100 | accuracy_score: 0.2904
Alpha: 1000 | accuracy_score: 0.2904
Alpha: 10000 | accuracy_score: 0.2904

Best Alpha: 0.0001 Best Alpha Accuracy: 0.3016

The Accuracy is:  0.3016
The Precision is:  0.3016
The Recall is:  0.3016
The F1 score is:  0.3016
The Matthews correlation coefficient is:  0.2333

This is the Confusion Matrix


Unnamed: 0,Class 1,Class 2,Class 3,Class 4,Class 5
Class 1,235,1,0,0,490
Class 2,0,2,0,0,350
Class 3,0,0,0,0,444
Class 4,0,0,0,0,459
Class 5,0,2,0,0,517
