## DV2599 Assignment 1
Group 8


***
Viktor Fransson

vifr22@student.bth.se

***

Tobias Gustafsson

togu22@student.bth.se
***

#### Initialize

In [56]:
# Import packages
import pandas as pd
import matplotlib.pyplot as mpl # Version 3.8.4
import seaborn as sb
import numpy as np
import scipy.stats as stats
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv('winequality-white.csv', delimiter=";")

#### 1. Inspect the dataset

In [21]:
# Describe
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


#### 2. Split into train and test sets

In [22]:
# Set quality as target
y = df['quality']
x = df.drop('quality', axis=1)

# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

#### 3. Scaling

In [23]:
# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform train set
x_train_scaled = scaler.fit_transform(x_train)

# Transform test set
x_test_scaled = scaler.transform(x_test)

# Turn back into pandas dataframes
x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns, index=x_test.index)

# Describe
x_train_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0
mean,0.285688,0.193317,0.20039,0.185527,0.100683,0.229902,0.385328,0.296579,0.425932,0.32055,0.406953
std,0.081568,0.097071,0.073807,0.161064,0.064056,0.115929,0.125734,0.125522,0.136486,0.135772,0.198644
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.23301,0.127451,0.156627,0.035484,0.071856,0.145329,0.295522,0.19793,0.336364,0.22619,0.241935
50%,0.281553,0.176471,0.186747,0.145161,0.092814,0.221453,0.373134,0.285468,0.418182,0.297619,0.387097
75%,0.330097,0.235294,0.228916,0.3,0.113772,0.297578,0.471642,0.383355,0.509091,0.380952,0.548387
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### 4. Repeated k-Fold Cross Validation

In [24]:
# Initialize Repeated k-Fold Cross Validator
rkf = RepeatedKFold(n_splits=3, n_repeats=10)

# Random forest classifier and Logistic regression
classifiers = {
    "Random Forest" : RandomForestClassifier(),
    "Decision Tree" : DecisionTreeClassifier()
}

# Perform cross-validation
results = {}
for name, clf in classifiers.items():
    cv_scores = cross_val_score(clf, x_train_scaled, y_train, cv=rkf, scoring="accuracy")

    results[name] = {
        "average" : round(cv_scores.mean(), 4),
        "standard deviation" : round(cv_scores.std(), 4)
    }

results

{'Random Forest': {'average': 0.6428, 'standard deviation': 0.0096},
 'Decision Tree': {'average': 0.5498, 'standard deviation': 0.0145}}

#### 5. Final model

In [25]:
# Random forest was the best classifier
rf_model = RandomForestClassifier()

rf_model.fit(x_train_scaled, y_train)

#### 6. Performance on test set

In [26]:
# Run model on test set
y_pred = rf_model.predict(x_test_scaled)

# Check accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.6806122448979591

#### 7. Balance scaled train set

In [37]:
# Apply SMOTE to train set to balance
smote = SMOTE(k_neighbors=3) # k_neigbors < smallest class size (4)

x_train_bal, y_train_bal = smote.fit_resample(x_train_scaled, y_train)

# Check class distribution
class_counts = pd.Series(y_train).value_counts()
# print(class_counts)
balanced_class_counts = pd.Series(y_train_bal).value_counts()
# print(balanced_class_counts)

# Calculate class ratios
class_ratios = class_counts / len(y_train)
bal_class_ratios = balanced_class_counts / len(y_train)
print("\nClass ratios:")
print(class_ratios)
print("\nBalanced:")
print(bal_class_ratios)


Class ratios:
quality
6    0.449464
5    0.295048
7    0.179428
8    0.037264
4    0.033691
3    0.004084
9    0.001021
Name: count, dtype: float64

Balanced:
quality
5    0.449464
6    0.449464
7    0.449464
4    0.449464
8    0.449464
9    0.449464
3    0.449464
Name: count, dtype: float64


#### 8. Validation and fitting on balanced set

In [38]:
# Perform cross-validation
results = {}
for name, clf in classifiers.items():
    cv_scores = cross_val_score(clf, x_train_bal, y_train_bal, cv=rkf, scoring="accuracy")

    results[name] = {
        "average" : round(cv_scores.mean(), 4),
        "standard deviation" : round(cv_scores.std(), 4)
    }

results

{'Random Forest': {'average': 0.8916, 'standard deviation': 0.0041},
 'Decision Tree': {'average': 0.8163, 'standard deviation': 0.0071}}

In [39]:
# Random forest was the best classifier again
rf_model_bal = RandomForestClassifier()

rf_model_bal.fit(x_train_scaled, y_train)

#### 9. Performance of model trained on balanced set

In [None]:
# Run model on test set
y_pred = rf_model_bal.predict(x_test_scaled)

# Check for improvement
accuracy = accuracy_score(y_pred, y_test)
# f1 = f1_score(y_pred, y_test, average="weighted")
# recall = recall_score(y_pred, y_test, average="weighted")
# precision = precision_score(y_pred, y_test, average="weighted")
print(accuracy)

0.6908163265306122 0.7025709953110856 0.6908163265306122 0.7346570667393626


  _warn_prf(average, modifier, msg_start, len(result))
