In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

# --- Data Loading and Preprocessing ---

# Read 'WineQT.csv' from the current working directory and discard the 'Id'
# column so it is never treated as a feature further down.
try:
    wine_df = pd.read_csv('WineQT.csv').drop(columns='Id')
except FileNotFoundError:
    print("Dataset file not found. Make sure 'WineQT.csv' is in the project's main directory.")
    # Re-raise so execution stops here rather than continuing without a frame.
    raise

# Binarize the target: quality >= 6 becomes 'good', anything lower becomes 'bad'.
# The new 'quality_category' column is appended last, and the numeric 'quality'
# column it was derived from is then removed.
wine_df = (
    wine_df
    .assign(quality_category=wine_df['quality'].ge(6).map({True: 'good', False: 'bad'}))
    .drop(columns='quality')
)

# --- Exploratory Data Analysis (EDA) ---
# Heading and the leading rows of the prepared frame, one per line.
print("First 5 rows of the processed dataset:", wine_df.head(), sep="\n")

# info() emits its own column / dtype / non-null summary, so it is not wrapped in print().
print("\nDataset Information:")
wine_df.info()

# --- Feature Scaling and Data Splitting ---
# Predictors are every column except the binary target.
X = wine_df.drop('quality_category', axis=1)
y = wine_df['quality_category']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# held-out rows contribute to the mean/variance used to transform the training
# rows (data leakage), which makes test-set evaluation optimistic.
# test_size and random_state are unchanged from the previous version.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply those
# same statistics to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix transformed with the train-fitted scaler. Kept so that any
# other cell still referring to `X_scaled` keeps working; do not use it for fitting.
X_scaled = scaler.transform(X)

# --- Model Training ---
# Two candidate classifiers are compared by their mean 5-fold cross-validation
# score (the estimators' default scoring) on the training split.
# NOTE(review): X_train is already scaled when the folds are formed, so every
# fold shares the same scaler statistics. Wrap scaler + estimator in a Pipeline
# if strict per-fold isolation is required.
log_reg = LogisticRegression(random_state=42)
rand_forest = RandomForestClassifier(random_state=42, n_estimators=100)

# Candidate 1: Logistic Regression
log_reg_scores = cross_val_score(estimator=log_reg, X=X_train, y=y_train, cv=5)
print(f"\nLogistic Regression Cross-Validation Accuracy: {log_reg_scores.mean():.4f}")

# Candidate 2: Random Forest
rand_forest_scores = cross_val_score(estimator=rand_forest, X=X_train, y=y_train, cv=5)
print(f"Random Forest Cross-Validation Accuracy: {rand_forest_scores.mean():.4f}")

# --- Model Selection and Saving ---
# Random Forest is the chosen model: a fresh instance with the same settings as
# the cross-validated candidate is fit on the whole training split.
final_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Pickle the fitted model and the fitted scaler to the working directory, model
# first. The scaler is saved alongside the model, presumably so new inputs can
# be transformed the same way before prediction.
for artifact_path, artifact in (('model.pkl', final_model), ('scaler.pkl', scaler)):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

# Confirmation messages, one per line.
print(
    "\nModel training complete.",
    "The model has been saved as model.pkl.",
    "The feature scaler has been saved as scaler.pkl.",
    sep="\n",
)

First 5 rows of the processed dataset:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol quality_category  
0      9.4       