## DV2599 Assignment 1
Group 8


***
Viktor Fransson

vifr22@student.bth.se

***

Tobias Gustafsson

togu22@student.bth.se
***

#### Initialize

In [60]:
# Import packages
import pandas as pd
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv('winequality-white.csv', delimiter=";")

#### 1. Inspect the dataset

In [61]:
# Describe
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


#### 2. Split into train and test sets

In [62]:
# Set quality as target
y = df['quality']
x = df.drop('quality', axis=1)

# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

#### 3. Scaling

In [63]:
# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform train set
x_train_scaled = scaler.fit_transform(x_train)

# Transform test set
x_test_scaled = scaler.transform(x_test)

# Turn back into pandas dataframes
x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns, index=x_test.index)

# Describe
x_train_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0
mean,0.294824,0.19337,0.202485,0.089292,0.108702,0.115714,0.298371,0.133107,0.424934,0.306417,0.407514
std,0.082088,0.099893,0.07298,0.07848,0.064605,0.059143,0.098782,0.058076,0.137544,0.13598,0.197636
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.240385,0.127451,0.162651,0.016871,0.080119,0.073171,0.230233,0.088573,0.336364,0.211765,0.241935
50%,0.288462,0.176471,0.192771,0.070552,0.10089,0.111498,0.288372,0.12758,0.409091,0.282353,0.387097
75%,0.336538,0.235294,0.23494,0.142638,0.121662,0.15331,0.367442,0.172999,0.509091,0.376471,0.548387
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### 4. Repeated k-Fold Cross Validation

In [64]:
# Initialize Repeated k-Fold Cross Validator
rkf = RepeatedKFold(n_splits=3, n_repeats=10)

# Random forest classifier and Logistic regression
classifiers = {
    "Random Forest" : RandomForestClassifier(),
    "Decision Tree" : DecisionTreeClassifier()
}

# Perform cross-validation
results = {}
for name, clf in classifiers.items():
    cv_scores = cross_val_score(clf, x_train_scaled, y_train, cv=rkf, scoring="accuracy")

    results[name] = {
        "average" : round(cv_scores.mean(), 4),
        "standard deviation" : round(cv_scores.std(), 4)
    }

results

{'Random Forest': {'average': 0.6372, 'standard deviation': 0.0109},
 'Decision Tree': {'average': 0.5503, 'standard deviation': 0.0148}}

#### 5. Final model

In [65]:
# Random forest was the best classifier
rf_model = RandomForestClassifier()

rf_model.fit(x_train_scaled, y_train)

#### 6. Performance on test set

In [None]:
# Run model on test set
y_pred = rf_model.predict(x_test_scaled)

# Check accuracy
accuracy = accuracy_score(y_pred, y_test)
accuracy

0.7112244897959183

#### 7. Balance scaled train set

In [67]:
# Apply SMOTE to train set to balance
smote = SMOTE(k_neighbors=3) # k_neigbors < smallest class size (4)

x_train_bal, y_train_bal = smote.fit_resample(x_train_scaled, y_train)

# Check class distribution
class_counts = pd.Series(y_train).value_counts()
# print(class_counts)
balanced_class_counts = pd.Series(y_train_bal).value_counts()
# print(balanced_class_counts)

# Calculate class ratios
class_ratios = class_counts / len(y_train)
bal_class_ratios = balanced_class_counts / len(y_train)
print("\nClass ratios:")
print(class_ratios)
print("\nBalanced:")
print(bal_class_ratios)


Class ratios:
quality
6    0.451251
5    0.292241
7    0.181215
8    0.036753
4    0.033435
3    0.004084
9    0.001021
Name: count, dtype: float64

Balanced:
quality
8    0.451251
7    0.451251
6    0.451251
5    0.451251
4    0.451251
3    0.451251
9    0.451251
Name: count, dtype: float64


#### 8. Validation and fitting on balanced set

In [68]:
# Perform cross-validation (may take a while)
results = {}
for name, clf in classifiers.items():
    cv_scores = cross_val_score(clf, x_train_bal, y_train_bal, cv=rkf, scoring="accuracy")

    results[name] = {
        "average" : round(cv_scores.mean(), 4),
        "standard deviation" : round(cv_scores.std(), 4)
    }

results

{'Random Forest': {'average': 0.8888, 'standard deviation': 0.0046},
 'Decision Tree': {'average': 0.8127, 'standard deviation': 0.0064}}

In [69]:
# Random forest was the best classifier again
rf_model_bal = RandomForestClassifier()

rf_model_bal.fit(x_train_scaled, y_train)

#### 9. Performance of model trained on balanced set

In [71]:
# Run model on test set
y_pred = rf_model_bal.predict(x_test_scaled)

# Check for improvement
accuracy_bal = accuracy_score(y_pred, y_test)
print("Accuracy improvement:", round(accuracy_bal - accuracy, 3))

Accuracy improvement: -0.011
