In [183]:
import pandas as pd
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, log_loss
import numpy as np
from imblearn.over_sampling import SMOTE


In [184]:
''''Reads the csv file and loads it into a pandas dataframe, then visualizes the first 5 rows of the dataframe'''

wine_data = pd.read_csv('red.csv', delimiter=';')
wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [185]:
'''Dropps the duplicates from the dataframe in order to avoid overfitting, then splits the data into the features and the target variable
    The the data gets split into training and testing data using the train_test_split function from the sklearn library
'''
wine_data.drop_duplicates(inplace=True)

X = wine_data.drop(columns="quality")
y = wine_data["quality"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=None)


In [186]:
''' The data gets scaled using the StandardScaler from the sklearn library, then the data gets transformed using the fit_transform method
    Then the values of the target variable are counted and printed in order to compare it with the balanced data in task 7'''
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(y_train.value_counts())


quality
5    471
6    431
7    124
4     38
8     15
3      8
Name: count, dtype: int64


In [187]:
'''Declair the two classifiers (Linear Discriminant Analysis and Random Forest) that will be used in the cross validation, then the cross validation is performed using the RepeatedKFold
    function from the sklearn library, in order to determin the best classifier for the data. The results are stored in a dictionary.'''

classifiers = {
    "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
    "Random Forest": RandomForestClassifier()
}
cv = RepeatedKFold(n_splits=3, n_repeats=10, random_state=42)

results = {}
for name, clf in classifiers.items():
    scores = cross_val_score(clf, X_train_scaled, y_train, cv=cv, scoring='accuracy')
    results[name] = {
        "mean_accuracy": np.mean(scores),
        "std_accuracy": np.std(scores)
    }

In [188]:
'''visual representation of the results in order to compare the classifiers. also the best classifier is determined by comparing the mean accuracy of the classifiers. 
    The mean value of the best classifier is saved in a variable in order to compare it with the balanced data in task 7'''

for name, result in results.items():
    print(name + ":")
    print("Mean Accuracy:",result['mean_accuracy'])
    print("Standard Devation Accuracy:",result['std_accuracy'],"\n")
best_mean = results["Random Forest"]["mean_accuracy"]
if results["Linear Discriminant Analysis"]["mean_accuracy"] > results["Random Forest"]["mean_accuracy"]:
    best_mean = results["Linear Discriminant Analysis"]["mean_accuracy"]

best_classifier = max(results, key=lambda k: results[k]['mean_accuracy'])
print(f"Best Classifier: {best_classifier}")

Linear Discriminant Analysis:
Mean Accuracy: 0.5759891735029856
Standard Devation Accuracy: 0.02173949069154749 

Random Forest:
Mean Accuracy: 0.591067125296156
Standard Devation Accuracy: 0.027559844375016067 

Best Classifier: Random Forest


In [189]:
'''The best classifier is then used to fit the training data and predict the test data, then the accuracy of the model is calculated and printed. A classification report is also crated in order to view the results'''

best_classifier = classifiers[best_classifier]  

final_model = best_classifier.fit(X_train_scaled, y_train)
y_pred = best_classifier.predict(X_test_scaled) 
test_accuracy = final_model.score(X_test_scaled, y_test)

print(f"Test Accuracy of the final model ({best_classifier}): {test_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Test Accuracy of the final model (RandomForestClassifier()): 0.5699

Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        15
           5       0.59      0.75      0.66       106
           6       0.52      0.62      0.57       104
           7       0.77      0.23      0.36        43
           8       0.00      0.00      0.00         2

    accuracy                           0.57       272
   macro avg       0.31      0.27      0.27       272
weighted avg       0.55      0.57      0.53       272



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [193]:
''' The data is run again but this time balanced using the SMOTE function from the imblearn library, then the data is fitted to the classifiers and the cross validation is performed again in order to determine the best classifier.
    The results are stored in a dictionary and then compared with the previous results in order to determine if the balancing of the data improved the accuracy of the model'''

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

classifiers = {
    "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
    "Random Forest": RandomForestClassifier()
}

cv = RepeatedKFold(n_splits=3, n_repeats=10, random_state=42)

results = {}
for name, clf in classifiers.items():
    scores = cross_val_score(clf, X_train_res, y_train_res, cv=cv, scoring='accuracy')
    results[name] = {
        "mean_accuracy": np.mean(scores),
        "std_accuracy": np.std(scores)
    }

for name, result in results.items():
    print("Balanced "+ name + ":")
    print("Mean Accuracy:",result['mean_accuracy'])
    print("Standard Devation Accuracy:",result['std_accuracy'],"\n")
best_mean_res = results["Random Forest"]["mean_accuracy"]
if results["Linear Discriminant Analysis"]["mean_accuracy"] > results["Random Forest"]["mean_accuracy"]:
    best_mean_res = results["Linear Discriminant Analysis"]["mean_accuracy"]


best_classifier = max(results, key=lambda k: results[k]['mean_accuracy'])
print(f"Best Classifier: {best_classifier}")

best_classifier = classifiers[best_classifier]  

final_model = best_classifier.fit(X_train_res, y_train_res)


test_accuracy_res = final_model.score(X_test_scaled, y_test)
print(f"Test Accuracy of the final balanced model ({best_classifier}): {test_accuracy_res:.4f}\n")

if test_accuracy_res > test_accuracy:
    print("By balanceing the scaled train set the predict accuracy improved by", test_accuracy_res - test_accuracy)
else:
    print("By balanceing the scaled train set the predict accuracy decreased by", test_accuracy - test_accuracy_res)
if best_mean_res > best_mean:
    print("By balanceing the scaled train set the Mean Accuracy accuracy improved by", best_mean_res - best_mean)
else:
    print("By balanceing the scaled train set the Mean Accuracy accuracy decreased by", best_mean - best_mean_res)

Balanced Linear Discriminant Analysis:
Mean Accuracy: 0.5906935598018401
Standard Devation Accuracy: 0.01584003410382199 

Balanced Random Forest:
Mean Accuracy: 0.826963906581741
Standard Devation Accuracy: 0.010759814572162274 

Best Classifier: Random Forest
Test Accuracy of the final balanced model (RandomForestClassifier()): 0.5441

By balanceing the scaled train set the predict accuracy decreased by 0.02573529411764708
By balanceing the scaled train set the Mean Accuracy accuracy improved by 0.235896781285585
