In [13]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Load the dataset
df = pd.read_csv("WineQT.csv")

# Step 2: Drop the 'Id' column (not needed)
df.drop(columns=["Id"], inplace=True)

# Step 3: Separate features (X) and target (y)
X = df.drop(columns=["quality"])
y = df["quality"]

# Step 4: Balance the data using oversampling
# Level 1: Slightly balanced (oversample minority classes slightly)
oversampler_level1 = RandomOverSampler(sampling_strategy={3: 50, 4: 100, 5: 483, 6: 483, 7: 200, 8: 50})  # Adjust counts
X_level1, y_level1 = oversampler_level1.fit_resample(X, y)

# Level 2: Moderately balanced (oversample minority classes more aggressively)
oversampler_level2 = RandomOverSampler(sampling_strategy={3: 200, 4: 200, 5: 483, 6: 483, 7: 300, 8: 200})  # Adjust counts
X_level2, y_level2 = oversampler_level2.fit_resample(X, y)

# Level 3: Fully balanced (all classes have 483 samples)
oversampler_level3 = RandomOverSampler(sampling_strategy={3: 483, 4: 483, 5: 483, 6: 483, 7: 483, 8: 483})  # Equalize all classes
X_level3, y_level3 = oversampler_level3.fit_resample(X, y)

# Step 5: Preprocess the balanced datasets
def preprocess_data(X, y):
    # Map target labels to [0, num_classes - 1]
    unique_classes = y.unique()
    num_classes = len(unique_classes)
    label_mapping = {label: idx for idx, label in enumerate(sorted(unique_classes))}
    y_mapped = y.map(label_mapping)

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_mapped, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, num_classes

# Preprocess each balanced dataset
X_train1, X_test1, y_train1, y_test1, num_classes1 = preprocess_data(X_level1, y_level1)
X_train2, X_test2, y_train2, y_test2, num_classes2 = preprocess_data(X_level2, y_level2)
X_train3, X_test3, y_train3, y_test3, num_classes3 = preprocess_data(X_level3, y_level3)

# Step 6: Define a function to train and evaluate the model
def train_and_evaluate(X_train, X_test, y_train, y_test, num_classes):
    # Define the model
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train, y_train, epochs=55, batch_size=32, validation_split=0.2, verbose=0)

    # Evaluate the model
    y_pred = model.predict(X_test)
    y_pred = tf.argmax(y_pred, axis=1)

    # Print metrics
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("-" * 50)

# Step 7: Train and evaluate on each balanced dataset
print("Level 1: Slightly Balanced")
train_and_evaluate(X_train1, X_test1, y_train1, y_test1, num_classes1)

print("Level 2: Moderately Balanced")
train_and_evaluate(X_train2, X_test2, y_train2, y_test2, num_classes2)

print("Level 3: Fully Balanced")
train_and_evaluate(X_train3, X_test3, y_train3, y_test3, num_classes3)

Level 1: Slightly Balanced
Classification Report:
              precision    recall  f1-score   support

           0       0.64      1.00      0.78         9
           1       0.75      0.56      0.64        16
           2       0.79      0.70      0.74       101
           3       0.64      0.66      0.65        99
           4       0.57      0.67      0.62        43
           5       0.83      0.83      0.83         6

    accuracy                           0.69       274
   macro avg       0.70      0.74      0.71       274
weighted avg       0.70      0.69      0.69       274

Accuracy: 0.6861
--------------------------------------------------
Level 2: Moderately Balanced
Classification Report:
              precision    recall  f1-score   support

           0       0.85      1.00      0.92        40
           1       0.84      0.80      0.82        40
           2       0.67      0.61      0.64       107
           3       0.51      0.54      0.52        84
           4    

### **Answer to Question 14**

#### **Observations**
1. **Level 1: Slightly Balanced**:
   - **Accuracy**: 68.61%
   - **F1-Score (Weighted Avg)**: 0.69
   - The model performs better than the base model but struggles with minority classes (e.g., quality 3 and quality 8 have high recall but lower precision).

2. **Level 2: Moderately Balanced**:
   - **Accuracy**: 71.39%
   - **F1-Score (Weighted Avg)**: 0.71
   - The model shows improved performance compared to Level 1, with better precision and recall for most classes (especially quality 3 and quality 8).

3. **Level 3: Fully Balanced**:
   - **Accuracy**: 84.31%
   - **F1-Score (Weighted Avg)**: 0.83
   - The model achieves the best performance, with high precision, recall, and F1-scores across all classes. Minority classes (e.g., quality 3 and quality 8) are now well-predicted.

---

#### **Key Insights**
1. **Impact of Balancing**:
   - As the dataset becomes more balanced, the model’s performance improves significantly.
   - Fully balanced data (Level 3) results in the highest accuracy (84.31%) and the most consistent performance across all wine quality levels.

2. **Class-Wise Performance**:
   - In **Level 1**, minority classes (e.g., quality 3 and quality 8) have high recall but lower precision, indicating the model is biased toward predicting these classes.
   - In **Level 3**, all classes have balanced precision and recall, showing that the model generalizes well across all wine quality levels.

3. **Trade-Offs**:
   - Balancing the dataset improves performance but may increase computational cost due to the larger dataset size (especially in Level 3).

---

#### **Conclusion**
- **Balancing the dataset is crucial for improving model performance**, especially when dealing with imbalanced data.
- The **fully balanced dataset (Level 3)** yields the best results, with high accuracy and balanced performance across all wine quality levels.
- For future work, you could experiment with **advanced balancing techniques** (e.g., SMOTE) or **class-weighted loss functions** to further improve performance.
