In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from statsmodels.api import OLS, add_constant

from sklearn.impute import KNNImputer, SimpleImputer

from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, PowerTransformer

#from sklearn.linear_model import LinearRegression

from sklearn.linear_model import RidgeClassifier, LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB, CategoricalNB

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef


Postures = pd.read_csv("Postures.csv")

def printClassResults(truth, preds):
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(y_test, preds,average='micro'))
    print("The Recall is: %7.4f"    % recall_score(y_test, preds,average='micro'))
    print("The F1 score is: %7.4f"  % f1_score(y_test, preds,average='micro'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(y_test, preds))
    print()
    print("This is the Confusion Matrix")
    display(pd.DataFrame(confusion_matrix(truth, preds)))

# 1) Processing the Data Set

In [2]:
# Eliminate first instance of Postures (all 0's) 
df = Postures.iloc[1:]

#Removing the variables with a proportion of missing values more than 80% 
for col in df.columns:
    proportion = (df[col] == '?').mean()*100
    if proportion > 80:
        df=df.drop(col, axis=1)
        
        
# Replace all '?' to NaN, so that the values are valid for Imputation
for col in df.columns:
    df.loc[df[col] == '?', col] = np.nan
    
    
# Extract from the Data Set the X and Y
# WARNING: For testing purposes, only work with a small sub-set of the original Data Set
#         Should be replaced for the whole Data Set in the act of Delivery 
X= df.values[0:10000,1:38]    
y= df['Class'].values[0:10000]

# Instatiate a KNN Imputater
imputer = KNNImputer(n_neighbors=2, weights="uniform")

# Acquire a new DataFrame with Imputated Values 
Xt=pd.DataFrame(imputer.fit_transform(X))
    
# Divide the whole Set into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.25, random_state=25)

# Scaling the data
scaler   = PowerTransformer()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test  = scaler.transform(X_test)

# 3) Linear Models

In [3]:
X_tr = add_constant(X_train)
mdl=OLS(y_train,X_tr, hasconst=12).fit()
mdl.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.594
Model:,OLS,Adj. R-squared:,0.592
Method:,Least Squares,F-statistic:,352.0
Date:,"Sun, 19 Nov 2023",Prob (F-statistic):,0.0
Time:,18:29:10,Log-Likelihood:,-10296.0
No. Observations:,7500,AIC:,20660.0
Df Residuals:,7468,BIC:,20880.0
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.8700,0.011,259.734,0.000,2.848,2.892
x1,-0.5180,0.014,-37.392,0.000,-0.545,-0.491
x2,-0.1859,0.015,-12.649,0.000,-0.215,-0.157
x3,0.2918,0.015,19.185,0.000,0.262,0.322
x4,-0.3486,0.018,-19.588,0.000,-0.383,-0.314
x5,-0.2216,0.014,-15.559,0.000,-0.250,-0.194
x6,0.3205,0.015,21.450,0.000,0.291,0.350
x7,-0.2953,0.018,-16.599,0.000,-0.330,-0.260
x8,-0.2767,0.014,-19.183,0.000,-0.305,-0.248

0,1,2,3
Omnibus:,402.172,Durbin-Watson:,2.052
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1243.25
Skew:,0.228,Prob(JB):,1.08e-270
Kurtosis:,4.942,Cond. No.,4.24


## 3.1) Logistic Regression

In [4]:
# Create various lists with different Hyperparameter values
solvers = ['newton-cg', 'lbfgs', 'liblinear','sag','saga']
penalty = ['l2','l1','elasticnet', None]
c_values = [100, 10, 1.0, 0.1, 0.01]

best_solver = ''
best_solver_value = 0

best_C = ''
best_C_value = 0

# Find the best Solver Hyperparameter
for solver in solvers:
    # Create Logistic Regression Model using a specific Solver
    LR = LogisticRegression(max_iter=100000, solver=solver).fit(X_train, y_train)

    # Get Accuracy
    preds = LR.predict(X_test)
    a = accuracy_score(y_test, preds)
    if a > best_solver_value:
        best_solver_value = a
        best_solver = solver
    print("Solver:", solver, "| accuracy_score:", a)

print('»Best Solver:', best_solver, ':', best_solver_value)
print('###')  


for c in c_values:
    # Create Logistic Regression Model using a specific C value
    LR = LogisticRegression(max_iter=100000, C=c).fit(X_train, y_train)

    # Get Accuracy
    preds = LR.predict(X_test)
    a = accuracy_score(y_test, preds)
    if a > best_C_value:
        best_C_value = a
        best_C = c
    print("C:", c, "| accuracy_score:", a)
    
print('»Best C:', best_C, ':', best_C_value)
print('###')

# Create a Logistic Regression Model with the best Hyperparameters found
LR = LogisticRegression(max_iter=100000, solver =best_solver, C=best_C).fit(X_train, y_train)

# Get Accuracy
preds = LR.predict(X_test)
print('»» Best(?) Model Accuracy:' , accuracy_score(y_test, preds), '|Solver:', best_solver, '|C value:', best_C)

Solver: newton-cg | accuracy_score: 0.8776
Solver: lbfgs | accuracy_score: 0.8776
Solver: liblinear | accuracy_score: 0.868
Solver: sag | accuracy_score: 0.8776
Solver: saga | accuracy_score: 0.8776
»Best Solver: newton-cg : 0.8776
###
C: 100 | accuracy_score: 0.8784
C: 10 | accuracy_score: 0.878
C: 1.0 | accuracy_score: 0.8776
C: 0.1 | accuracy_score: 0.8768
C: 0.01 | accuracy_score: 0.8704
»Best C: 100 : 0.8784
###
»» Best(?) Model Accuracy: 0.8784 |Solver: newton-cg |C value: 100


In [5]:
# Create the final Logistic Regression Model
LR = LogisticRegression(max_iter=100000, solver ='newton-cg', C=100).fit(X_train, y_train)

# Show all the Model Evaluation Statistics for the best Hyperparameters found  
preds = LR.predict(X_test)
printClassResults(y_test, preds)

The Accuracy is:  0.8784
The Precision is:  0.8784
The Recall is:  0.8784
The F1 score is:  0.8784
The Matthews correlation coefficient is:  0.8458

This is the Confusion Matrix


Unnamed: 0,0,1,2,3,4
0,648,13,55,4,6
1,1,304,13,4,30
2,51,23,364,6,0
3,11,5,3,412,28
4,9,27,11,4,468


In [6]:
# Present the Bias and the Coefficients of the Model 
print("The bias is: ",  LR.intercept_[0])
print("The other parameters are: ")
for i, beta in enumerate(LR.coef_[0]):
    print("\t B%02d -> %9.3f"% (i+1, beta))

The bias is:  1.5024085649957584
The other parameters are: 
	 B01 ->     2.588
	 B02 ->     0.246
	 B03 ->    -0.997
	 B04 ->     0.371
	 B05 ->     0.060
	 B06 ->    -1.029
	 B07 ->     0.064
	 B08 ->     0.062
	 B09 ->    -0.988
	 B10 ->     0.013
	 B11 ->    -0.228
	 B12 ->    -1.043
	 B13 ->    -0.318
	 B14 ->    -0.208
	 B15 ->    -0.860
	 B16 ->    -0.362
	 B17 ->     0.109
	 B18 ->    -1.177
	 B19 ->    -0.159
	 B20 ->     0.028
	 B21 ->    -0.733
	 B22 ->    -0.244
	 B23 ->     0.007
	 B24 ->    -0.771
	 B25 ->    -0.096
	 B26 ->     0.012
	 B27 ->     0.463
	 B28 ->    -0.070
	 B29 ->     0.061
	 B30 ->     0.202
	 B31 ->    -0.149


In [7]:
# Present the Coefficients with the greatest impact
coefs=[(abs(beta),i) for i, beta in enumerate(LR.coef_[0])]
coefs.sort()
coefs.reverse()
for beta, i in coefs[:5]:
    print("\t B%02d -> %9.3f"% (i+1, beta))

	 B01 ->     2.588
	 B18 ->     1.177
	 B12 ->     1.043
	 B06 ->     1.029
	 B03 ->     0.997


## 3.2) Ridge Classifier

In [8]:
# Create various lists with different Hyperparameter values

solvers = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
alphas = [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100, 1000, 10000]
          
best_solver = ''
best_solver_value = 0
          
best_alpha = ''
best_alpha_value = 0

# Find the best Solver Hyperparameter
for solver in solvers:
    # Create a Ridge Classifier Model using a specific Solver
    RC = RidgeClassifier(solver=solver).fit(X_train, y_train)

    # Get Accuracy
    preds = RC.predict(X_test)
    a = accuracy_score(y_test, preds)
    if a > best_solver_value:
        best_solver_value = a
        best_solver = solver
    print("Solver:", solver, "| accuracy_score:", a)

print('»Best Solver:', best_solver, ':', best_solver_value)
print('###')  


for alpha in alphas:
    # Create a Ridge Classifier Model using a specific Alpha Value
    RC = RidgeClassifier(alpha=alpha).fit(X_train, y_train)

    # Get Accuracy
    preds = RC.predict(X_test)
    a = accuracy_score(y_test, preds)
    if a > best_alpha_value:
        best_alpha_value = a
        best_alpha = alpha
    print("Alpha:", alpha, "| accuracy_score:", a)
    
print('»Best Alpha:', best_alpha, ':', best_alpha_value)
print('###')

# Create a Ridge Classifier Model with the best Hyperparameters found
RC = RidgeClassifier(solver =best_solver, alpha=best_alpha).fit(X_train, y_train)

# Get Accuracy
preds = RC.predict(X_test)
print('»» Best(?) Model Accuracy:' , accuracy_score(y_test, preds), '|Solver:', best_solver, '|Alpha value:', best_alpha)

Solver: auto | accuracy_score: 0.8316
Solver: svd | accuracy_score: 0.8316
Solver: cholesky | accuracy_score: 0.8316
Solver: lsqr | accuracy_score: 0.8308
Solver: sparse_cg | accuracy_score: 0.8316
Solver: sag | accuracy_score: 0.8316
Solver: saga | accuracy_score: 0.8316
»Best Solver: auto : 0.8316
###
Alpha: 0.0001 | accuracy_score: 0.8316
Alpha: 0.001 | accuracy_score: 0.8316
Alpha: 0.01 | accuracy_score: 0.8316
Alpha: 0.1 | accuracy_score: 0.8316
Alpha: 1.0 | accuracy_score: 0.8316
Alpha: 10 | accuracy_score: 0.8316
Alpha: 100 | accuracy_score: 0.8316
Alpha: 1000 | accuracy_score: 0.8364
Alpha: 10000 | accuracy_score: 0.8264
»Best Alpha: 1000 : 0.8364
###
»» Best(?) Model Accuracy: 0.8364 |Solver: auto |Alpha value: 1000


In [9]:
# Create the final Ridge Classifier Model
RC = RidgeClassifier(solver=best_solver, alpha=best_alpha).fit(X_train, y_train)

# Show all the Model Evaluation Statistics for the best Hyperparameters found  
preds = RC.predict(X_test)
printClassResults(y_test, preds)

The Accuracy is:  0.8364
The Precision is:  0.8364
The Recall is:  0.8364
The F1 score is:  0.8364
The Matthews correlation coefficient is:  0.7932

This is the Confusion Matrix


Unnamed: 0,0,1,2,3,4
0,633,19,45,20,9
1,0,298,20,2,32
2,66,36,338,1,3
3,13,8,1,421,16
4,4,38,32,44,401


In [10]:
# Present the Bias and the Coefficients of the Model
print("The bias is: ",  RC.intercept_[0])
print("The other parameters are: ")
for i, beta in enumerate(RC.coef_[0]):
    print("\t B%02d -> %9.3f"% (i+1, beta))

The bias is:  -0.4405333333333332
The other parameters are: 
	 B01 ->     0.304
	 B02 ->     0.030
	 B03 ->    -0.109
	 B04 ->     0.024
	 B05 ->     0.015
	 B06 ->    -0.131
	 B07 ->     0.001
	 B08 ->     0.047
	 B09 ->    -0.126
	 B10 ->    -0.001
	 B11 ->     0.015
	 B12 ->    -0.133
	 B13 ->    -0.018
	 B14 ->    -0.009
	 B15 ->    -0.102
	 B16 ->    -0.028
	 B17 ->     0.011
	 B18 ->    -0.195
	 B19 ->     0.013
	 B20 ->     0.031
	 B21 ->    -0.093
	 B22 ->    -0.054
	 B23 ->     0.067
	 B24 ->    -0.130
	 B25 ->     0.031
	 B26 ->     0.006
	 B27 ->     0.048
	 B28 ->    -0.081
	 B29 ->    -0.013
	 B30 ->     0.029
	 B31 ->    -0.093


In [11]:
# Present the Coefficients with the greatest impact 
coefs=[(abs(beta),i) for i, beta in enumerate(RC.coef_[0])]
coefs.sort()
coefs.reverse()
for beta, i in coefs[:5]:
    print("\t B%02d -> %9.3f"% (i+1, beta))

	 B01 ->     0.304
	 B18 ->     0.195
	 B12 ->     0.133
	 B06 ->     0.131
	 B24 ->     0.130
