In [183]:
import kagglehub
import sklearn.preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Kaggle dataset handle for the balanced wine-quality classification data.
DATASET_HANDLE = "taweilo/wine-quality-dataset-balanced-classification"

# Download the latest version; `path` is reused below to build the CSV location.
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/wine-quality-dataset-balanced-classification


In [184]:
import pandas as pd
# Location of the CSV inside the downloaded dataset directory.
wine_csv = path + "/wine_data.csv"
wine_data = pd.read_csv(wine_csv)

# Schema / null-count overview (printed), then a preview of the first rows
# (rendered as the cell's output because it is the last expression).
wine_data.info()
wine_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         21000 non-null  float64
 1   volatile_acidity      21000 non-null  float64
 2   citric_acid           21000 non-null  float64
 3   residual_sugar        21000 non-null  float64
 4   chlorides             21000 non-null  float64
 5   free_sulfur_dioxide   21000 non-null  float64
 6   total_sulfur_dioxide  21000 non-null  float64
 7   density               21000 non-null  float64
 8   pH                    21000 non-null  float64
 9   sulphates             21000 non-null  float64
 10  alcohol               21000 non-null  float64
 11  quality               21000 non-null  int64  
dtypes: float64(11), int64(1)
memory usage: 1.9 MB


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,11.6,0.58,0.66,2.2,0.074,10.0,47.0,1.0008,3.25,0.57,9.0,3
1,10.4,0.61,0.49,2.1,0.2,5.0,16.0,0.9994,3.16,0.63,8.4,3
2,7.4,1.185,0.0,4.25,0.097,5.0,14.0,0.9966,3.63,0.54,10.7,3
3,10.4,0.44,0.42,1.5,0.145,34.0,48.0,0.99832,3.38,0.86,9.9,3
4,8.3,1.02,0.02,3.4,0.084,6.0,11.0,0.99892,3.48,0.49,11.0,3


In [185]:
# Summary statistics of the table as loaded (before de-duplication).
# Kept in a variable so it can be the cell's last expression: a bare
# `describe()` call in the middle of a cell is computed and then discarded.
raw_summary = wine_data.describe()

# Drop exact duplicate rows, keeping the first occurrence of each.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run.
# The original index labels are kept (no reset_index), so the index has gaps.
wine_data = wine_data.drop_duplicates(keep='first')

raw_summary


In [186]:
# Derived features, all computed from the original measurement columns only.
# The dict preserves insertion order, so the new columns are appended in the
# same order as before and column positions stay unchanged.
engineered_features = {
    # Acidity ratios
    "acidity_ratio": wine_data["fixed_acidity"] / wine_data["volatile_acidity"],
    "citric_acid_ratio": wine_data["citric_acid"] / wine_data["fixed_acidity"],
    "pH_acidity_interaction": wine_data["pH"] * wine_data["fixed_acidity"],
    # Sugar and alcohol interactions
    "sugar_alcohol_interaction": wine_data["residual_sugar"] * wine_data["alcohol"],
    "sugar_sulfite_ratio": wine_data["residual_sugar"] / wine_data["total_sulfur_dioxide"],
    # Sulfur dioxide effects
    "sulfur_dioxide_ratio": wine_data["free_sulfur_dioxide"] / wine_data["total_sulfur_dioxide"],
    "sulfur_density_ratio": wine_data["total_sulfur_dioxide"] / wine_data["density"],
}
for feature_name, feature_values in engineered_features.items():
    wine_data[feature_name] = feature_values

# Collapse the numeric quality scores into three coarse buckets.
quality_buckets = {
    9: 'Good',
    8: 'Good',
    7: 'Good',
    6: 'Middle',
    5: 'Middle',
    4: 'Bad',
    3: 'Bad',
}
wine_data = wine_data.replace({'quality': quality_buckets})
wine_data.head()  # not rendered: only a cell's last expression is displayed

# Turn the bucket names into integer class ids for the classifier.
# The fitted encoder is kept so ids can be mapped back to names later.
my_encoder = LabelEncoder()
encoded_quality = my_encoder.fit_transform(wine_data['quality'])
wine_data['quality'] = encoded_quality

In [187]:
# The ratio features divide by measured columns, so a zero denominator would
# produce +/-inf. Map those to NaN and then fill every missing value with 0.
# A float NaN is used rather than pd.NA: putting pd.NA into a float64 column
# can turn it into an object column, which the later scaling/tensor steps
# would only handle through implicit (deprecated) downcasting.
wine_data = (
    wine_data
    .replace([float('inf'), -float('inf')], float('nan'))
    .fillna(0)
)

wine_data.info()
wine_data.head()

<class 'pandas.core.frame.DataFrame'>
Index: 14940 entries, 0 to 20994
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   fixed_acidity              14940 non-null  float64
 1   volatile_acidity           14940 non-null  float64
 2   citric_acid                14940 non-null  float64
 3   residual_sugar             14940 non-null  float64
 4   chlorides                  14940 non-null  float64
 5   free_sulfur_dioxide        14940 non-null  float64
 6   total_sulfur_dioxide       14940 non-null  float64
 7   density                    14940 non-null  float64
 8   pH                         14940 non-null  float64
 9   sulphates                  14940 non-null  float64
 10  alcohol                    14940 non-null  float64
 11  quality                    14940 non-null  int64  
 12  acidity_ratio              14940 non-null  float64
 13  citric_acid_ratio          14940 non-null  float64


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,acidity_ratio,citric_acid_ratio,pH_acidity_interaction,sugar_alcohol_interaction,sugar_sulfite_ratio,sulfur_dioxide_ratio,sulfur_density_ratio
0,11.6,0.58,0.66,2.2,0.074,10.0,47.0,1.0008,3.25,0.57,9.0,0,20.0,0.056897,37.7,19.8,0.046809,0.212766,46.96243
1,10.4,0.61,0.49,2.1,0.2,5.0,16.0,0.9994,3.16,0.63,8.4,0,17.04918,0.047115,32.864,17.64,0.13125,0.3125,16.009606
2,7.4,1.185,0.0,4.25,0.097,5.0,14.0,0.9966,3.63,0.54,10.7,0,6.244726,0.0,26.862,45.475,0.303571,0.357143,14.047762
3,10.4,0.44,0.42,1.5,0.145,34.0,48.0,0.99832,3.38,0.86,9.9,0,23.636364,0.040385,35.152,14.85,0.03125,0.708333,48.080776
4,8.3,1.02,0.02,3.4,0.084,6.0,11.0,0.99892,3.48,0.49,11.0,0,8.137255,0.00241,28.884,37.4,0.309091,0.545455,11.011893


In [188]:
from sklearn.model_selection import train_test_split

# Target: the encoded quality class. Features: every other column.
y = wine_data["quality"]
feature_columns = wine_data.columns.drop("quality")
X = wine_data[feature_columns]

# Hold out 20% of the rows for testing; fixed seed for a repeatable split.
X_learn, X_test, y_learn, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [189]:
from sklearn.preprocessing import StandardScaler
# Learn the per-feature mean / standard deviation from the training split only,
# so no statistics leak from the held-out rows.
sc = StandardScaler()
sc.fit(X_learn)

# Apply the same (training-derived) scaling to both splits.
X_learn, X_test = sc.transform(X_learn), sc.transform(X_test)

In [190]:
import torch
from torch import nn

# Prefer a CUDA GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using {device} device")

Using cpu device


In [191]:
class WineClassifier(nn.Module):
    """Fully connected network that maps a feature vector to class logits.

    The stack is three hidden Linear layers with LeakyReLU activations
    (dropout after the first two) followed by a Linear output layer.

    Args:
        in_features: Number of input features per sample. Defaults to 18,
            the number of columns left after dropping ``quality``.
        hidden_size: Width of each hidden layer. Defaults to 256.
        num_classes: Number of output logits. Defaults to 3.
        dropout: Dropout probability used after the first two hidden
            layers. Defaults to 0.3.
    """

    def __init__(self, in_features=18, hidden_size=256, num_classes=3, dropout=0.3):
        super().__init__()
        # Attribute name and layer order are kept as-is so printed summaries
        # and state_dict keys stay the same with the default arguments.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_size),
            nn.LeakyReLU(),
            nn.Dropout(dropout),

            nn.Linear(hidden_size, hidden_size),
            nn.LeakyReLU(),
            nn.Dropout(dropout),

            nn.Linear(hidden_size, hidden_size),
            nn.LeakyReLU(),
            nn.Linear(hidden_size, num_classes),
        )

    def forward(self, x):
        """Return raw (un-normalised) class logits for a batch ``x``.

        No softmax is applied here; the notebook trains with
        ``nn.CrossEntropyLoss``, which works on logits.
        """
        return self.linear_relu_stack(x)

# Build the network, then move its parameters to the selected device
# (Module.to works in place on the module and returns the same object).
model = WineClassifier()
model = model.to(device)
print(model)

WineClassifier(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=18, out_features=256, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): LeakyReLU(negative_slope=0.01)
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=256, out_features=256, bias=True)
    (7): LeakyReLU(negative_slope=0.01)
    (8): Linear(in_features=256, out_features=3, bias=True)
  )
)


In [192]:
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

# Features: convert the scaled arrays straight to float32 tensors instead of
# going array -> nested Python list -> tensor. float32 is also what
# torch.tensor() produced from Python floats, so downstream dtypes are unchanged.
# NOTE(review): assumes X_learn / X_test are the arrays returned by the scaler.
X_test = torch.as_tensor(X_test, dtype=torch.float32)
X_train = torch.as_tensor(X_learn, dtype=torch.float32)

# Labels: integer class ids (int64), the target type CrossEntropyLoss expects.
y_test = torch.tensor(y_test.tolist())
y_train = torch.tensor(y_learn.tolist())

train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Shuffle only the training batches. Accuracy does not depend on batch order,
# and shuffling the evaluation data would just consume random numbers and make
# the training shuffles harder to reproduce.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

# Multi-class loss on raw logits, and Adam over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

In [193]:
def test_accuracy():
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device).float(), labels.to(device)
            outputs = model(inputs)
            # Get predictions: the index of the highest probability
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return f"Accuracy on test data: {100 * correct / total:.2f}%"

In [194]:
def init_weights(m):
    """Re-initialise a single module in place; meant for ``model.apply``.

    Only ``nn.Linear`` modules are touched: the weight gets Kaiming (He)
    normal initialisation with PyTorch's default arguments and the bias,
    when the layer has one, is set to zero. Every other module type is
    left unchanged.

    Args:
        m: The module handed in by ``nn.Module.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    # He initialisation using the function's defaults.
    nn.init.kaiming_normal_(m.weight)
    # Layers created with bias=False have m.bias set to None.
    if m.bias is not None:
        nn.init.zeros_(m.bias)

# Seed torch's global RNG so weight initialisation, batch shuffling and dropout
# are repeatable from run to run (same seed value as the train/test split).
torch.manual_seed(42)

# Re-initialise every Linear layer before training starts.
# NOTE: the optimizer is created in an earlier cell; if only this cell is
# re-run, the weights are reset here but the optimizer object is reused.
model.apply(init_weights)

num_epochs = 50
for epoch in range(num_epochs):
    # Make sure dropout is active for training, even if an evaluation step
    # left the model in eval mode.
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device).float(), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Mean training loss over the epoch's batches, plus held-out accuracy.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f} ; " + test_accuracy())

Epoch [1/50], Loss: 0.9662 ; Accuracy on test data: 59.20%
Epoch [2/50], Loss: 0.8239 ; Accuracy on test data: 60.17%
Epoch [3/50], Loss: 0.8173 ; Accuracy on test data: 59.57%
Epoch [4/50], Loss: 0.8110 ; Accuracy on test data: 59.47%
Epoch [5/50], Loss: 0.8068 ; Accuracy on test data: 60.07%
Epoch [6/50], Loss: 0.8024 ; Accuracy on test data: 60.78%
Epoch [7/50], Loss: 0.7968 ; Accuracy on test data: 59.67%
Epoch [8/50], Loss: 0.7973 ; Accuracy on test data: 61.21%
Epoch [9/50], Loss: 0.7961 ; Accuracy on test data: 60.01%
Epoch [10/50], Loss: 0.7926 ; Accuracy on test data: 61.38%
Epoch [11/50], Loss: 0.7917 ; Accuracy on test data: 61.51%
Epoch [12/50], Loss: 0.7904 ; Accuracy on test data: 60.91%
Epoch [13/50], Loss: 0.7903 ; Accuracy on test data: 61.08%
Epoch [14/50], Loss: 0.7881 ; Accuracy on test data: 60.78%
Epoch [15/50], Loss: 0.7869 ; Accuracy on test data: 60.24%
Epoch [16/50], Loss: 0.7852 ; Accuracy on test data: 60.24%
Epoch [17/50], Loss: 0.7853 ; Accuracy on test da