## DV2599 Assignment 1
Group 8


***
Viktor Fransson

vifr22@student.bth.se

***

Tobias Gustafsson

togu22@student.bth.se
***

#### Initialize

In [57]:
# Import packages
import pandas as pd
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv('winequality-white.csv', delimiter=";")

#### 1. Inspect the dataset

In [None]:
# Describe
# df.describe()
# df.info()

# Calculate ratios of quality classes
class_counts = df['quality'].value_counts().sort_index()

print("Quality Class Ratios:")
for class_label, count in class_counts.items():
    ratio = count / len(df)
    print(f"Class {class_label}: {ratio:.2%}")


# Calculate ratio of feature values per quality class
# Get all feature columns
feature_columns = [col for col in df.columns if col != 'quality']

# Create copy of df to change and keep original intact
df_discretisized = df.copy()

for feature in feature_columns:
    df_discretisized[feature] = pd.qcut(df[feature], q=3, labels=['low', 'medium', 'high'])

# Analyze each target class
print("\nTarget Class Feature Analysis:")
for class_label in sorted(df_discretisized['quality'].unique()):
    print(f"\nClass {class_label}:")
    
    # Subset df for the current class
    class_df = df_discretisized[df_discretisized['quality'] == class_label]
    
    # Find the most frequent feature value for each feature
    for feature in feature_columns:
        top_values = class_df[feature].value_counts().head(2)
        print(f"  {feature}:")
        for value, count in top_values.items():
            print(f"    {value}: {count/len(class_df):.1%}")

Quality Class Ratios:
Class 3: 0.41%
Class 4: 3.33%
Class 5: 29.75%
Class 6: 44.88%
Class 7: 17.97%
Class 8: 3.57%
Class 9: 0.10%

Target Class Feature Analysis:

Class 3:
  fixed acidity:
    high: 50.0%
    low: 25.0%
  volatile acidity:
    high: 45.0%
    medium: 30.0%
  citric acid:
    high: 40.0%
    low: 30.0%
  residual sugar:
    high: 40.0%
    low: 35.0%
  chlorides:
    low: 40.0%
    high: 35.0%
  free sulfur dioxide:
    low: 45.0%
    high: 30.0%
  total sulfur dioxide:
    high: 55.0%
    low: 35.0%
  density:
    high: 40.0%
    medium: 35.0%
  pH:
    low: 35.0%
    medium: 35.0%
  sulphates:
    low: 45.0%
    high: 30.0%
  alcohol:
    medium: 45.0%
    low: 35.0%

Class 4:
  fixed acidity:
    high: 39.9%
    low: 34.4%
  volatile acidity:
    high: 57.7%
    medium: 27.0%
  citric acid:
    low: 48.5%
    high: 31.9%
  residual sugar:
    low: 48.5%
    medium: 30.1%
  chlorides:
    high: 42.9%
    medium: 30.1%
  free sulfur dioxide:
    low: 69.3%
    medium: 

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,medium,medium,medium,high,medium,high,high,high,low,medium,low,6
1,low,medium,medium,low,high,low,medium,medium,high,medium,low,6
2,high,medium,high,medium,high,medium,low,medium,high,medium,medium,6
3,high,low,medium,high,high,high,high,high,medium,low,medium,6
4,high,low,medium,high,high,high,high,high,medium,low,medium,6
5,high,medium,high,medium,high,medium,low,medium,high,medium,medium,6
6,low,high,low,medium,medium,medium,medium,medium,medium,medium,low,6
7,medium,medium,medium,high,medium,high,high,high,low,medium,low,6
8,low,medium,medium,low,high,low,medium,medium,high,medium,low,6
9,high,low,high,low,medium,medium,medium,medium,medium,medium,medium,6


#### 2. Split into train and test sets

In [59]:
# Set quality as target
y = df['quality']
x = df.drop('quality', axis=1)

# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

#### 3. Scaling

In [60]:
# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform train set
x_train_scaled = scaler.fit_transform(x_train)

# Transform test set
x_test_scaled = scaler.transform(x_test)

# Turn back into pandas dataframes
x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns, index=x_test.index)

# Describe
x_train_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0
mean,0.294008,0.194108,0.201289,0.088367,0.108259,0.112437,0.300131,0.133222,0.426748,0.307444,0.405957
std,0.081416,0.099829,0.07285,0.07796,0.062373,0.059622,0.099879,0.057976,0.137018,0.135005,0.197941
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.240385,0.127451,0.162651,0.016871,0.080119,0.06993,0.229698,0.08849,0.336364,0.211765,0.241935
50%,0.288462,0.176471,0.192771,0.070169,0.10089,0.108392,0.290023,0.127048,0.418182,0.294118,0.387097
75%,0.336538,0.235294,0.228916,0.141104,0.121662,0.15035,0.366589,0.173318,0.509091,0.376471,0.548387
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### 4. Repeated k-Fold Cross Validation

In [61]:
# Initialize Repeated k-Fold Cross Validator
rkf = RepeatedKFold(n_splits=3, n_repeats=10)

# Random forest classifier and Logistic regression
classifiers = {
    "Random Forest" : RandomForestClassifier(),
    "Decision Tree" : DecisionTreeClassifier()
}

# Perform cross-validation
results = {}
for name, clf in classifiers.items():
    cv_scores = cross_val_score(clf, x_train_scaled, y_train, cv=rkf, scoring="accuracy")

    results[name] = {
        "average" : round(cv_scores.mean(), 4),
        "standard deviation" : round(cv_scores.std(), 4)
    }

results

{'Random Forest': {'average': 0.6452, 'standard deviation': 0.0104},
 'Decision Tree': {'average': 0.5574, 'standard deviation': 0.0129}}

#### 5. Final model

In [62]:
# Random forest was the best classifier
rf_model = RandomForestClassifier()

rf_model.fit(x_train_scaled, y_train)

#### 6. Performance on test set

In [63]:
# Run model on test set
y_pred = rf_model.predict(x_test_scaled)

# Check accuracy
accuracy = accuracy_score(y_pred, y_test)
accuracy

0.6846938775510204

#### 7. Balance scaled train set

In [64]:
# Apply SMOTE to train set to balance
smote = SMOTE(k_neighbors=3) # k_neigbors < smallest class size (4)

x_train_bal, y_train_bal = smote.fit_resample(x_train_scaled, y_train)

# Check class distribution
class_counts = pd.Series(y_train).value_counts()
# print(class_counts)
balanced_class_counts = pd.Series(y_train_bal).value_counts()
# print(balanced_class_counts)

# Calculate class ratios
class_ratios = class_counts / len(y_train)
bal_class_ratios = balanced_class_counts / len(y_train)
print("\nClass ratios:")
print(class_ratios)
print("\nBalanced:")
print(bal_class_ratios)


Class ratios:
quality
6    0.448188
5    0.296325
7    0.182746
8    0.034201
4    0.032925
3    0.004594
9    0.001021
Name: count, dtype: float64

Balanced:
quality
6    0.448188
8    0.448188
7    0.448188
5    0.448188
4    0.448188
9    0.448188
3    0.448188
Name: count, dtype: float64


#### 8. Validation and fitting on balanced set

In [65]:
# Perform cross-validation (may take a while)
results = {}
for name, clf in classifiers.items():
    cv_scores = cross_val_score(clf, x_train_bal, y_train_bal, cv=rkf, scoring="accuracy")

    results[name] = {
        "average" : round(cv_scores.mean(), 4),
        "standard deviation" : round(cv_scores.std(), 4)
    }

results

{'Random Forest': {'average': 0.8886, 'standard deviation': 0.0044},
 'Decision Tree': {'average': 0.8096, 'standard deviation': 0.0058}}

In [66]:
# Random forest was the best classifier again
rf_model_bal = RandomForestClassifier()

rf_model_bal.fit(x_train_scaled, y_train)

#### 9. Performance of model trained on balanced set

In [67]:
# Run model on test set
y_pred = rf_model_bal.predict(x_test_scaled)

# Check for improvement
accuracy_bal = accuracy_score(y_pred, y_test)
print("Accuracy improvement:", round(accuracy_bal - accuracy, 3))

Accuracy improvement: -0.004
