In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import Callback
from sklearn.metrics import f1_score, accuracy_score  # Added accuracy_score import

# Step 1: Load the dataset
df = pd.read_csv("WineQT.csv")

# Step 2: Drop the 'Id' column (not needed)
df.drop(columns=["Id"], inplace=True)

# Step 3: Separate features (X) and target (y)
X = df.drop(columns=["quality"])  # Features
y = df["quality"]  # Target variable (discrete classes)

# Step 4: Map target labels to a range of [0, num_classes - 1]
unique_classes = y.unique()
num_classes = len(unique_classes)
label_mapping = {label: idx for idx, label in enumerate(sorted(unique_classes))}
y_mapped = y.map(label_mapping)

# Step 5: Perform Z-score normalization (standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert the scaled features back to a DataFrame
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Step 6: Split the data into training and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y_mapped, test_size=0.2, random_state=42)

# Step 7: Define the base model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),  # Input layer
    Dense(32, activation='relu'),  # Hidden layer
    Dense(16, activation='relu'),  # Hidden layer
    Dense(num_classes, activation='softmax')  # Output layer (num_classes)
])

# Step 8: Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Step 9: Define a custom callback to track F1-score
class F1ScoreCallback(Callback):
    def __init__(self, X_train, y_train, X_val, y_val):
        super(F1ScoreCallback, self).__init__()
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.train_f1_scores = []
        self.val_f1_scores = []

    def on_epoch_end(self, epoch, logs=None):
        # Predict on training data
        y_train_pred = self.model.predict(self.X_train)
        y_train_pred = tf.argmax(y_train_pred, axis=1).numpy()
        train_f1 = f1_score(self.y_train, y_train_pred, average='weighted')
        self.train_f1_scores.append(train_f1)

        # Predict on validation data
        y_val_pred = self.model.predict(self.X_val)
        y_val_pred = tf.argmax(y_val_pred, axis=1).numpy()
        val_f1 = f1_score(self.y_val, y_val_pred, average='weighted')
        self.val_f1_scores.append(val_f1)

        # Print F1-scores
        print(f"Epoch {epoch+1}: Train F1-Score = {train_f1:.4f}, Val F1-Score = {val_f1:.4f}")

# Step 10: Split training data into training and validation sets
X_train_train, X_val, y_train_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 11: Initialize the F1-score callback
f1_callback = F1ScoreCallback(X_train_train, y_train_train, X_val, y_val)

# Step 12: Train the model with the F1-score callback
history = model.fit(
    X_train_train, y_train_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[f1_callback],
    verbose=1
)

# Step 13: Evaluate the model on the test set
y_pred = model.predict(X_test)
y_pred = tf.argmax(y_pred, axis=1).numpy()
accuracy = accuracy_score(y_test, y_pred)  # Now this will work
test_f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1-Score: {test_f1:.4f}")

Epoch 1/50
Epoch 1: Train F1-Score = 0.4131, Val F1-Score = 0.3817
Epoch 2/50
Epoch 2: Train F1-Score = 0.4974, Val F1-Score = 0.4269
Epoch 3/50
Epoch 3: Train F1-Score = 0.5218, Val F1-Score = 0.4371
Epoch 4/50
Epoch 4: Train F1-Score = 0.5276, Val F1-Score = 0.4438
Epoch 5/50
Epoch 5: Train F1-Score = 0.5733, Val F1-Score = 0.5326
Epoch 6/50
Epoch 6: Train F1-Score = 0.6108, Val F1-Score = 0.5517
Epoch 7/50
Epoch 7: Train F1-Score = 0.5889, Val F1-Score = 0.5498
Epoch 8/50
Epoch 8: Train F1-Score = 0.6192, Val F1-Score = 0.5569
Epoch 9/50
Epoch 9: Train F1-Score = 0.6177, Val F1-Score = 0.5518
Epoch 10/50
Epoch 10: Train F1-Score = 0.6305, Val F1-Score = 0.5415
Epoch 11/50
Epoch 11: Train F1-Score = 0.6253, Val F1-Score = 0.5451
Epoch 12/50
Epoch 12: Train F1-Score = 0.6230, Val F1-Score = 0.5431
Epoch 13/50
Epoch 13: Train F1-Score = 0.6494, Val F1-Score = 0.5588
Epoch 14/50
Epoch 14: Train F1-Score = 0.6362, Val F1-Score = 0.5478
Epoch 15/50
Epoch 15: Train F1-Score = 0.6422, Val F


### **Answer to Question 13: F1-Score as a New Metric**

The **F1-score** is a valuable metric for evaluating classification models, especially when dealing with imbalanced datasets like the Wine Quality dataset. Unlike accuracy, which only measures the overall correctness of predictions, the F1-score balances **precision** (how many predicted positives are actually correct) and **recall** (how many actual positives are correctly predicted). This makes it particularly useful for identifying performance issues with minority classes, which are often overlooked when using accuracy alone.

#### **Training and Validation F1-Score**
During training, the model’s F1-score on the training data improved significantly, starting at **0.4131** in the first epoch and reaching **0.7651** by the 50th epoch. This indicates that the model is learning and improving its ability to correctly classify the training data. However, the validation F1-score, which measures performance on unseen data, started at **0.3817** and only reached **0.5639** by the final epoch. The gap between the training and validation F1-scores suggests that the model is **overfitting**—it performs well on the training data but struggles to generalize to new data.

#### **Test Set Performance**
On the test set, the model achieved an **accuracy of 57.64%** and an **F1-score of 56.16%**. The slightly lower F1-score compared to accuracy indicates that the model’s performance is not uniform across all classes. Specifically, it struggles with minority classes (e.g., wines with very high or very low quality scores), where precision and recall are lower. This is a common issue in imbalanced datasets, where the model tends to favor the majority classes.

#### **Why F1-Score Matters**
The F1-score provides a more nuanced evaluation of the model’s performance than accuracy. For example:
- A high accuracy but low F1-score suggests the model is good at predicting majority classes but performs poorly on minority classes.
- A balanced F1-score indicates that the model is performing well across all classes, even if the overall accuracy is lower.

#### **Conclusion**
By using the **F1-score** as a metric, we gain a deeper understanding of the model’s performance, particularly for imbalanced datasets. While the model achieves reasonable accuracy, the F1-score reveals areas for improvement, especially in handling minority classes. Addressing these issues through techniques like regularization, class weighting, and hyperparameter tuning can lead to a more robust and balanced model.
