In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from statsmodels.api import OLS, add_constant

from sklearn.impute import KNNImputer

from sklearn.preprocessing import MinMaxScaler, StandardScaler

#from sklearn.linear_model import LinearRegression

from sklearn.linear_model import RidgeClassifier, LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB, CategoricalNB

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef


Postures = pd.read_csv("Postures.csv")

def printClassResults(truth, preds):
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(y_test, preds,average='micro'))
    print("The Recall is: %7.4f"    % recall_score(y_test, preds,average='micro'))
    print("The F1 score is: %7.4f"  % f1_score(y_test, preds,average='micro'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(y_test, preds))
    print()
    print("This is the Confusion Matrix")
    display(pd.DataFrame(confusion_matrix(truth, preds)))

# 1) Processing the Data Set

In [2]:
# Eliminate first instance of Postures (all 0's) 
df = Postures.iloc[1:]

for col in df.columns:
    proportion = (df[col] == '?').mean()*100
    if proportion > 0:
        print(f'Proportion of missing values in column {col}: {round(proportion,2)}%')

Proportion of missing values in column X3: 0.88%
Proportion of missing values in column Y3: 0.88%
Proportion of missing values in column Z3: 0.88%
Proportion of missing values in column X4: 4.0%
Proportion of missing values in column Y4: 4.0%
Proportion of missing values in column Z4: 4.0%
Proportion of missing values in column X5: 16.68%
Proportion of missing values in column Y5: 16.68%
Proportion of missing values in column Z5: 16.68%
Proportion of missing values in column X6: 33.1%
Proportion of missing values in column Y6: 33.1%
Proportion of missing values in column Z6: 33.1%
Proportion of missing values in column X7: 50.13%
Proportion of missing values in column Y7: 50.13%
Proportion of missing values in column Z7: 50.13%
Proportion of missing values in column X8: 60.86%
Proportion of missing values in column Y8: 60.86%
Proportion of missing values in column Z8: 60.86%
Proportion of missing values in column X9: 69.31%
Proportion of missing values in column Y9: 69.31%
Proportion o

In [3]:
#removing the variables with a proportion of missing values more than 80% 
for col in df.columns:
    proportion = (df[col] == '?').mean()*100
    if proportion > 80:
        df=df.drop(col, axis=1)

In [4]:
# Replace all '?' to NaN, so that the values are valid for Imputation
for col in df.columns:
    df.loc[df[col] == '?', col] = np.nan

# Instatiate a KNN Imputater
imputer = KNNImputer(n_neighbors=2, weights="uniform")

# Extract from the Data Set the X and Y
# WARNING: For testing purposes, only work with a small sub-set of the original Data Set
#         Should be replaced for the whole Data Set in the act of Delivery 
X= df.values[0:10000,1:38]    
y=df['Class'].values[0:10000]        
        

    
# Acquire a new DataFrame with Imputated Values 
Xt=pd.DataFrame(imputer.fit_transform(X))

# 2) Tree Based Models

In [5]:
# Divide the whole Set into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.25, random_state=25)

# Create a Decision Tree Model with the data
tree_mdl = DecisionTreeClassifier()
tree_mdl.fit(X_train, y_train)
preds = tree_mdl.predict(X_test)

# Present the results
print("The Precision is: %7.4f" % precision_score(y_test, preds,average='micro'))
print("The Recall is: %7.4f"    % recall_score(y_test, preds,average='micro'))
print("The F1 score is: %7.4f"  % f1_score(y_test, preds,average='micro'))
print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(y_test, preds))
print()
print("This is the Confusion Matrix")
pd.DataFrame(confusion_matrix(y_test, preds))

# Maybe should try with different Hyperparameters (split, leaf, criterion, depth, etc.)
# Look into 'TP03' for better data evaluation and ideas

The Precision is:  0.9504
The Recall is:  0.9504
The F1 score is:  0.9504
The Matthews correlation coefficient is:  0.9371

This is the Confusion Matrix


Unnamed: 0,0,1,2,3,4
0,705,2,11,3,5
1,2,328,7,2,13
2,7,6,420,6,5
3,2,0,7,442,8
4,6,17,7,8,481


# 3) Linear Models

In [6]:
# Divide the whole Set into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.25, random_state=25)

# Create a Linear Regression Model with the data
#reg = LinearRegression().fit(X_train, y_train)

# Present the Biases
#print("The bias is: ",  reg.intercept_)
#print("The other parameters are: ")
#for i, beta in enumerate(reg.coef_):
#    print("\t B%d -> %9.3f"% (i+1, beta))
    
#This Linear Regression cell doesn't make sense. This is a Classification problem, not a Regression one!  

In [7]:
# Plot the Results
#preds=reg.predict(X_test)
#plt.figure(figsize=(10,5))
#plt.scatter(preds, y_test)
#plt.xlabel('Predictions')
#plt.ylabel('Testing Set')
#plt.grid()
#plt.show()

#This Linear Regression cell doesn't make sense. This is a Classification problem, not a Regression one!

In [8]:
#X_tr = add_constant(X_train)
#mdl=OLS(y_train,X_tr, hasconst=12).fit()
#mdl.summary()

In [9]:
# To Do: The same for Ridge(Classifier) aand Logistic Regression

In [10]:

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

LR = LogisticRegression(random_state=0).fit(X_train, y_train)

print("The bias is: ",  LR.intercept_[0])
print("The other parameters are: ")
for i, beta in enumerate(LR.coef_[0]):
    print("\t B%02d -> %9.3f"% (i+1, beta))
    


The bias is:  1.1394663283998505
The other parameters are: 
	 B01 ->     1.646
	 B02 ->     0.164
	 B03 ->    -1.015
	 B04 ->     0.336
	 B05 ->     0.002
	 B06 ->    -1.016
	 B07 ->    -0.029
	 B08 ->     0.009
	 B09 ->    -0.969
	 B10 ->    -0.061
	 B11 ->    -0.254
	 B12 ->    -1.039
	 B13 ->    -0.377
	 B14 ->    -0.222
	 B15 ->    -0.878
	 B16 ->    -0.382
	 B17 ->     0.117
	 B18 ->    -1.187
	 B19 ->    -0.099
	 B20 ->     0.003
	 B21 ->    -0.729
	 B22 ->    -0.178
	 B23 ->     0.005
	 B24 ->    -0.799
	 B25 ->    -0.000
	 B26 ->     0.041
	 B27 ->     0.450
	 B28 ->    -0.007
	 B29 ->     0.095
	 B30 ->     0.203
	 B31 ->    -0.145


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
coefs=[(abs(beta),i) for i, beta in enumerate(LR.coef_[0])]
coefs.sort()
coefs.reverse()
for beta, i in coefs[:5]:
    print("\t B%02d -> %9.3f"% (i+1, beta))

	 B01 ->     1.646
	 B18 ->     1.187
	 B12 ->     1.039
	 B06 ->     1.016
	 B03 ->     1.015


In [12]:
preds = LR.predict(X_test)
printClassResults(y_test, preds)

The Accuracy is:  0.8760
The Precision is:  0.8760
The Recall is:  0.8760
The F1 score is:  0.8760
The Matthews correlation coefficient is:  0.8427

This is the Confusion Matrix


Unnamed: 0,0,1,2,3,4
0,652,8,46,5,15
1,1,296,17,5,33
2,48,23,368,5,0
3,12,5,3,409,30
4,9,30,11,4,465


In [13]:
RC = RidgeClassifier(random_state=0).fit(X_train, y_train)

print("The bias is: ",  RC.intercept_[0])
print("The other parameters are: ")
for i, beta in enumerate(RC.coef_[0]):
    print("\t B%02d -> %9.3f"% (i+1, beta))

The bias is:  -0.44053333333333333
The other parameters are: 
	 B01 ->     0.341
	 B02 ->     0.033
	 B03 ->    -0.120
	 B04 ->     0.011
	 B05 ->     0.016
	 B06 ->    -0.137
	 B07 ->    -0.019
	 B08 ->     0.047
	 B09 ->    -0.134
	 B10 ->    -0.006
	 B11 ->     0.006
	 B12 ->    -0.146
	 B13 ->    -0.021
	 B14 ->    -0.021
	 B15 ->    -0.102
	 B16 ->    -0.038
	 B17 ->     0.008
	 B18 ->    -0.212
	 B19 ->     0.033
	 B20 ->     0.034
	 B21 ->    -0.103
	 B22 ->    -0.047
	 B23 ->     0.082
	 B24 ->    -0.158
	 B25 ->     0.057
	 B26 ->     0.005
	 B27 ->     0.053
	 B28 ->    -0.098
	 B29 ->    -0.015
	 B30 ->     0.034
	 B31 ->    -0.114


In [14]:
coefs=[(abs(beta),i) for i, beta in enumerate(RC.coef_[0])]
coefs.sort()
coefs.reverse()
for beta, i in coefs[:5]:
    print("\t B%02d -> %9.3f"% (i+1, beta))

	 B01 ->     0.341
	 B18 ->     0.212
	 B24 ->     0.158
	 B12 ->     0.146
	 B06 ->     0.137


In [15]:
preds = RC.predict(X_test)
printClassResults(y_test, preds)

#Linear Models Section still needs to be refined

The Accuracy is:  0.8324
The Precision is:  0.8324
The Recall is:  0.8324
The F1 score is:  0.8324
The Matthews correlation coefficient is:  0.7878

This is the Confusion Matrix


Unnamed: 0,0,1,2,3,4
0,628,17,45,22,14
1,0,290,24,1,37
2,65,32,341,1,5
3,12,9,1,417,20
4,5,24,37,48,405


# 4) Naive Bayes

In [16]:
# Scale the data so it can be used in Naive Bayes Models
scaler = MinMaxScaler() #Maybe use different Scalers (See info in TP06)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#Column Names (a.k.a Possible Classes)
classes = np.array(['Class 1','Class 2','Class 3','Class 4','Class 5'])

# Create a Gaussian Naive Bayes Model with the scaled data
gnb=GaussianNB()
gnb.fit(X_train, y_train)

# Present the Results
preds=gnb.predict(X_test)
print("The Accuracy score is: ", accuracy_score(y_test, preds))
print()
print('Confusion Matrix:')
display(pd.DataFrame(confusion_matrix(y_test, preds), columns=classes, index=classes))


# Create a Categorical Naive Bayes Model with the scaled data
cnb=CategoricalNB()
cnb.fit(X_train,y_train)

# Present the Results
preds=cnb.predict(X_test)
print("The Accuracy score is: ", accuracy_score(y_test, preds))
print()
print('Confusion Matrix:')
display(pd.DataFrame(confusion_matrix(y_test, preds), columns=classes, index=classes))

The Accuracy score is:  0.7252

Confusion Matrix:


Unnamed: 0,Class 1,Class 2,Class 3,Class 4,Class 5
Class 1,235,4,348,72,67
Class 2,0,344,4,0,4
Class 3,1,19,400,12,12
Class 4,0,7,2,438,12
Class 5,0,51,52,20,396


The Accuracy score is:  0.3016

Confusion Matrix:


Unnamed: 0,Class 1,Class 2,Class 3,Class 4,Class 5
Class 1,235,1,0,0,490
Class 2,0,2,0,0,350
Class 3,0,0,0,0,444
Class 4,0,0,0,0,459
Class 5,0,2,0,0,517


# 5) K-Nearest Neighbours

In [17]:
# Divide the whole Set into Training and Testing Sets to be scaled with a different Scaler
X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.25, random_state=25)

# Scale the data so it can be used in K-Nearest Neighbours Models
scaler = StandardScaler() #Maybe use different Scalers (See info in TP06)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

preds = knn.predict(X_test)

# Present the Results
accuracy = accuracy_score(y_test, preds)
print("Accuracy:", accuracy)

#Sholud try to make plots 

Accuracy: 0.936
