# **Machine Learning Project #2**
## Evaluation of Logistic Regression Performance in Multi-Class Classification

---

Mohsen Shayeghi

**under the supervision of : Dr.Khosravi**

# imported libraries that are needed

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier , OneVsOneClassifier
from sklearn.feature_selection import SelectKBest
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


# dataset imported

In [50]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
car_evaluation = fetch_ucirepo(id=19)

# data (as pandas dataframes)
X = car_evaluation.data.features
y = car_evaluation.data.targets

# metadata
print(car_evaluation.metadata)

# variable information
print(car_evaluation.variables)


{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'ID': 249, 'type': 'NATIVE', 'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'venue': '8th Intl Workshop on Expert Systems and their Applications, 

# Detecting missing values

In [47]:
F_null_values = X.isnull().sum()
print(F_null_values)
T_null_values = y.isnull().sum()
print(T_null_values)

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
dtype: int64
class    0
dtype: int64


# dropping null values


In [48]:
X = X.dropna()
y = y.dropna()

# Label Encoding

In [51]:
X_Encoded = X.copy()
y_Encoded = y.copy()
X_Encoded['buying'] = X_Encoded['buying'].replace({'vhigh': 4, 'high': 3, 'med': 2, 'low': 1})
X_Encoded['maint'] = X_Encoded['maint'].replace({'vhigh': 4, 'high': 3, 'med': 2, 'low': 1})
X_Encoded['doors'] = X_Encoded['doors'].replace({'2': 2, '3': 3, '4': 4, '5more': 5})
X_Encoded['persons'] = X_Encoded['persons'].replace({'2': 2, '4': 4, 'more': 6})
X_Encoded['lug_boot'] = X_Encoded['lug_boot'].replace({'small': 1, 'med': 2, 'big': 3})
X_Encoded['safety'] = X_Encoded['safety'].replace({'low': 1, 'med': 2, 'high': 3})
y_Encoded['class'] = y_Encoded['class'].replace({'unacc': 0, 'acc': 1, 'good': 2, 'vgood': 3})

  X_Encoded['buying'] = X_Encoded['buying'].replace({'vhigh': 4, 'high': 3, 'med': 2, 'low': 1})
  X_Encoded['maint'] = X_Encoded['maint'].replace({'vhigh': 4, 'high': 3, 'med': 2, 'low': 1})
  X_Encoded['doors'] = X_Encoded['doors'].replace({'2': 2, '3': 3, '4': 4, '5more': 5})
  X_Encoded['persons'] = X_Encoded['persons'].replace({'2': 2, '4': 4, 'more': 6})
  X_Encoded['lug_boot'] = X_Encoded['lug_boot'].replace({'small': 1, 'med': 2, 'big': 3})
  X_Encoded['safety'] = X_Encoded['safety'].replace({'low': 1, 'med': 2, 'high': 3})
  y_Encoded['class'] = y_Encoded['class'].replace({'unacc': 0, 'acc': 1, 'good': 2, 'vgood': 3})


# Removing outliers using Z-score

In [52]:
from scipy import stats
print(f"number of instances Before Removing outliers: {X_Encoded.shape[0]}")
X_zscore = X_Encoded[(np.abs(stats.zscore(X_Encoded)) < 3).all(axis=1)] # Setting threshold and filter outliers
# remove outliers from both datasets
X_Encoded = X_Encoded[X_Encoded.index.isin(X_zscore.index)]
y_Encoded = y_Encoded[y_Encoded.index.isin(X_zscore.index)]
print(f"number of instances After Removing outliers: {X_Encoded.shape[0]}")


number of instances Before Removing outliers: 1728
number of instances After Removing outliers: 1728


# Feature Engineering

In [53]:

X_Encoded['total_cost'] = X_Encoded['buying'] + X_Encoded['maint']


# Data Normalization using Min-Max Scaler

In [54]:
scaler = MinMaxScaler()
X_Normalized = scaler.fit_transform(X_Encoded)

print(X_Normalized[10:50])

[[1.         1.         0.         0.5        0.         0.5
  1.        ]
 [1.         1.         0.         0.5        0.         1.
  1.        ]
 [1.         1.         0.         0.5        0.5        0.
  1.        ]
 [1.         1.         0.         0.5        0.5        0.5
  1.        ]
 [1.         1.         0.         0.5        0.5        1.
  1.        ]
 [1.         1.         0.         0.5        1.         0.
  1.        ]
 [1.         1.         0.         0.5        1.         0.5
  1.        ]
 [1.         1.         0.         0.5        1.         1.
  1.        ]
 [1.         1.         0.         1.         0.         0.
  1.        ]
 [1.         1.         0.         1.         0.         0.5
  1.        ]
 [1.         1.         0.         1.         0.         1.
  1.        ]
 [1.         1.         0.         1.         0.5        0.
  1.        ]
 [1.         1.         0.         1.         0.5        0.5
  1.        ]
 [1.         1.         0.       

# Feature Selection

In [55]:
from sklearn.feature_selection import  chi2

selector = SelectKBest(chi2, k=6)
X_New = selector.fit_transform(X_Normalized, y_Encoded)
selected_feature_indices = selector.get_support(indices=True)
selected_features = X_Encoded.columns[selected_feature_indices]
print("Selected Features:", selected_features)
print("All Features:",X_Encoded.columns)
print("Scores: ", selector.scores_)

Selected Features: Index(['buying', 'maint', 'persons', 'lug_boot', 'safety', 'total_cost'], dtype='object')
All Features: Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety',
       'total_cost'],
      dtype='object')
Scores:  [ 44.85155054  33.41757442   2.29798983  89.0207604   17.13189085
 130.15180111  38.5332109 ]


# Split data for testing and training

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X_New, y_Encoded, test_size=0.3, random_state=42)


# Training model with OvR , OvO and Softmax Regression

In [57]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

Softmax_model = LogisticRegression(multi_class='multinomial', solver='saga')
Softmax_model.fit(X_train, y_train)
ovr_model = OneVsRestClassifier(LogisticRegression(multi_class='ovr'))
ovr_model.fit(X_train, y_train)
ovo_model = OneVsOneClassifier(LogisticRegression())
ovo_model.fit(X_train, y_train)

# Hyperparameter Tuning using GridSearchCV
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(LogisticRegression(solver='saga'), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best Parameters:", best_params)



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Parameters: {'C': 100, 'penalty': 'l1'}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


# Evaluation criteria

In [58]:
softmax_accuracy = accuracy_score(y_test, Softmax_model.predict(X_test))
print("Softmax Accuracy:", softmax_accuracy * 100)
softmax_precision = precision_score(y_test, Softmax_model.predict(X_test), average='weighted')
print("Softmax Precision:", softmax_precision * 100)
softmax_recall = recall_score(y_test, Softmax_model.predict(X_test), average='weighted')
print("Softmax Recall:", softmax_recall * 100)
softmax_f1 = f1_score(y_test, Softmax_model.predict(X_test), average='weighted')
print("Softmax F1 Score:", softmax_f1 * 100)

ovr_accuracy = accuracy_score(y_test, ovr_model.predict(X_test))
print("OvR Accuracy:", ovr_accuracy * 100)
ovr_precision = precision_score(y_test, ovr_model.predict(X_test), average='weighted')
print("OvR Precision:", ovr_precision * 100)
ovr_recall = recall_score(y_test, ovr_model.predict(X_test), average='weighted')
print("OvR Recall:", ovr_recall * 100)
ovr_f1 = f1_score(y_test, ovr_model.predict(X_test), average='weighted')
print("OvR F1 Score:", ovr_f1 * 100)

ovo_accuracy = accuracy_score(y_test, ovo_model.predict(X_test))
print("OvO Accuracy:", ovo_accuracy * 100)
ovo_precision = precision_score(y_test, ovo_model.predict(X_test), average='weighted')
print("OvO Precision:", ovo_precision * 100)
ovo_recall = recall_score(y_test, ovo_model.predict(X_test), average='weighted')
print("OvO Recall:", ovo_recall * 100)
ovo_f1 = f1_score(y_test, ovo_model.predict(X_test), average='weighted')
print("OvO F1 Score:", ovo_f1 * 100)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Best Model Accuracy:", accuracy * 100)
precision = precision_score(y_test, y_pred, average='weighted')
print("Best Model Precision:", precision * 100)
recall = recall_score(y_test, y_pred, average='weighted')
print("Best Model Recall:", recall * 100)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Best Model F1 Score:", f1 * 100)


Softmax Accuracy: 80.34682080924856
Softmax Precision: 79.08405699951942
Softmax Recall: 80.34682080924856
Softmax F1 Score: 79.3281262532769
OvR Accuracy: 78.03468208092485
OvR Precision: 77.23707142566603
OvR Recall: 78.03468208092485
OvR F1 Score: 75.1363146654321
OvO Accuracy: 81.5028901734104
OvO Precision: 80.62366079527419
OvO Recall: 81.5028901734104
OvO F1 Score: 80.52510211968908
Best Model Accuracy: 83.04431599229287
Best Model Precision: 82.34153592072667
Best Model Recall: 83.04431599229287
Best Model F1 Score: 82.5608768922256


# Confusion Matrix

In [59]:

softmax_cm = confusion_matrix(y_test, Softmax_model.predict(X_test))
print("Softmax Confusion Matrix:")
print(softmax_cm)
ovr_cm = confusion_matrix(y_test, ovr_model.predict(X_test))
print("OvR Confusion Matrix:")
print(ovr_cm)
ovo_cm = confusion_matrix(y_test, ovo_model.predict(X_test))
print("OvO Confusion Matrix:")
print(ovo_cm)
best_model_cm = confusion_matrix(y_test, best_model.predict(X_test))
print("Best Model Confusion Matrix:")
print(best_model_cm)


Softmax Confusion Matrix:
[[334  17   3   4]
 [ 48  68   2   0]
 [  2  11   6   0]
 [  0  14   1   9]]
OvR Confusion Matrix:
[[341  15   0   2]
 [ 58  60   0   0]
 [  3  15   1   0]
 [  0  21   0   3]]
OvO Confusion Matrix:
[[337  16   3   2]
 [ 47  69   2   0]
 [  2   9   8   0]
 [  0  14   1   9]]
Best Model Confusion Matrix:
[[333  18   3   4]
 [ 41  72   2   3]
 [  1   8   9   1]
 [  0   7   0  17]]


# Classification Report

In [60]:

softmax_cr = classification_report(y_test, Softmax_model.predict(X_test))
print("Softmax Classification Report:")
print(softmax_cr)
ovr_cr = classification_report(y_test, ovr_model.predict(X_test))
print("OvR Classification Report:")
print(ovr_cr)
ovo_cr = classification_report(y_test, ovo_model.predict(X_test))
print("OvO Classification Report:")
print(ovo_cr)
best_model_cr = classification_report(y_test, best_model.predict(X_test))
print("Best Model Classification Report:")
print(best_model_cr)

Softmax Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90       358
           1       0.62      0.58      0.60       118
           2       0.50      0.32      0.39        19
           3       0.69      0.38      0.49        24

    accuracy                           0.80       519
   macro avg       0.67      0.55      0.59       519
weighted avg       0.79      0.80      0.79       519

OvR Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.95      0.90       358
           1       0.54      0.51      0.52       118
           2       1.00      0.05      0.10        19
           3       0.60      0.12      0.21        24

    accuracy                           0.78       519
   macro avg       0.75      0.41      0.43       519
weighted avg       0.77      0.78      0.75       519

OvO Classification Report:
              precision    recall  f1-score  

# To enhance the performance of my model, I employed several optimization techniques:


 * ###  **Feature Engineering:** I engineered new features based on domain knowledge and interactions between existing features

 * ###  **Hyperparameter Tuning:** I utilized GridSearchCV to systematically explore and select optimal hyperparameters for my model.

 * ###  **Training Size Variation:** I experimented with different training set sizes to assess their impact on model performance.

### These efforts collectively resulted in a **5% increase in accuracy**, demonstrating the effectiveness of the applied optimization strategies.


