In [2]:
# Environment setup: install the plotting dependency into the kernel's environment
%pip install seaborn

# Data handling and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Model-related utilities (import from models.py)
# load_saved_model is needed by the save/reload cell further down.
from models import (
    load_data,
    load_saved_model,
    predict,
    preprocess_data,
    save_model,
    train_model,
)


Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


ModuleNotFoundError: No module named 'models'

In [None]:
# Read the wine-quality CSV through the loader imported from models.py
csv_path = 'data/wine_quality.csv'
df = load_data(csv_path)

# Preview the leading rows as the cell's output
df.head()


In [None]:
# Basic info: info() prints its report directly, so it needs no display call
df.info()

# Summary statistics. Only a cell's final expression is rendered automatically,
# so this frame has to be displayed explicitly or its output is silently dropped.
display(df.describe())

# Check for any missing values (final expression, rendered as the cell output)
df.isnull().sum()


In [None]:
# Bar chart counting the rows at each value of the 'quality' column
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='quality', data=df, ax=ax)
ax.set_title('Distribution of Wine Quality')
plt.show()


In [None]:
# Correlation matrix over the numeric columns only: since pandas 2.0,
# DataFrame.corr() raises on non-numeric columns instead of dropping them,
# so a single string column (e.g. a category label) would break this cell.
corr_matrix = df.select_dtypes(include='number').corr()

# Plot heatmap with each coefficient annotated to two decimals
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()


In [None]:
# Pairwise scatter/distribution grid for a small subset of columns
pair_columns = ['fixed acidity', 'citric acid', 'residual sugar', 'quality']
sns.pairplot(df[pair_columns])

# y slightly above 1 lifts the title clear of the top row of panels
plt.suptitle('Pairplot of Selected Features', y=1.02)
plt.show()


In [None]:
# Run the project's preprocessing step; presumably X holds the features and
# y the target (preprocess_data lives in models.py - confirm there)
X, y = preprocess_data(df)

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition
for label, subset in (("Training set size:", X_train), ("Test set size:", X_test)):
    print(label, subset.shape)


In [None]:
# Fit a model on the training split via the helper from models.py
model = train_model(X_train, y_train)

# Score the fitted model on the held-out split
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Absolute Error on Test Set: {mae}')


In [None]:
# Get feature importances
importances = model.feature_importances_

# Importances are indexed by the features the model was fitted on (X_train),
# not by the raw frame `df`, which may still contain the target column or be
# ordered differently. Fall back to generic labels when the features carry no
# column names (e.g. a plain array).
# NOTE(review): assumes train_model fits directly on X_train's columns - confirm.
feature_names = np.asarray(
    getattr(X_train, 'columns', [f'feature {i}' for i in range(len(importances))])
)

# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]

# Plot at most 10 features; a fixed range(10) would raise a shape mismatch
# whenever the model has fewer than 10 features.
top_n = min(10, len(importances))
top_indices = indices[:top_n]

plt.figure(figsize=(10, 6))
plt.barh(range(top_n), importances[top_indices], align='center')
plt.yticks(range(top_n), feature_names[top_indices])
plt.title(f'Top {top_n} Feature Importances')
plt.show()


In [None]:
# Save the trained model under an explicit path so that the save and the
# reload below are guaranteed to refer to the same file, rather than relying
# on save_model's default filename matching the hard-coded load path.
model_path = 'wine_quality_model.pkl'
save_model(model, filename=model_path)

# Reload the model (load_saved_model is imported from models.py in the first cell)
loaded_model = load_saved_model(model_path)

# Make predictions using the loaded model and show the first ten
predictions = predict(loaded_model, X_test)
print(f"Predictions: {predictions[:10]}")


In [None]:
# Second experiment: 200 trees with depth capped at 10, seeded for reproducibility
tuned_params = {'n_estimators': 200, 'max_depth': 10, 'random_state': 42}
model_tuned = RandomForestRegressor(**tuned_params).fit(X_train, y_train)

# Score the tuned model on the same held-out split
y_pred_tuned = model_tuned.predict(X_test)
mae_tuned = mean_absolute_error(y_true=y_test, y_pred=y_pred_tuned)
print(f'Mean Absolute Error after Hyperparameter Tuning: {mae_tuned}')


In [None]:
# Persist the tuned model under its own filename
tuned_model_path = 'wine_quality_model_tuned.pkl'
save_model(model_tuned, filename=tuned_model_path)
