In [1]:
#importing libraries
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error


In [2]:
# Step 1: Load the Dataset
file_path = r'C:\Users\aryan\wine1\winequality-red.csv'

# Check if the file exists
if not os.path.exists(file_path):
    print(f"File not found at: {file_path}. Please check the file path.")
else:
    print(f"File found: {file_path}")

# Load the dataset
df = pd.read_csv(file_path, sep=',')
print(f"Dataset loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns.")


File not found at: C:\Users\aryan\wine1\winequality-red.csv. Please check the file path.


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\aryan\\wine1\\winequality-red.csv'

In [3]:
# Step 2: Check for Missing Values
print("Missing values per column:")
print(df.isnull().sum())


Missing values per column:
type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64


In [4]:
# Step 3: Encode Categorical Data
df['type'] = LabelEncoder().fit_transform(df['type'])
print("Categorical column 'type' encoded successfully.")


Categorical column 'type' encoded successfully.


In [5]:
# Step 4: Handle Missing Values
numeric_imputer = SimpleImputer(strategy='mean')
df[df.columns] = numeric_imputer.fit_transform(df)
print("Numeric columns imputed with mean values.")


Numeric columns imputed with mean values.


In [6]:
# Step 5: Prepare Features (X) and Target (y)
X = df.drop('quality', axis=1)
y = df['quality']
print(f"Number of features in X: {X.shape[1]}")


Number of features in X: 12


In [7]:
# Step 6: Scale the Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [8]:
# Step 7: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [9]:
# Step 8: Train the Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [10]:
# Step 9: Evaluate the Model
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))  # Manual RMSE calculation
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"Training RMSE: {train_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")


Training RMSE: 0.2295
Test RMSE: 0.5619


In [11]:
# Step 10: Save the Model and Scaler
joblib.dump(model, 'wine_quality_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("Model and scaler saved successfully.")


Model and scaler saved successfully.


In [8]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the Dataset
file_path = 'winequality-red.csv'  # Update this path with your dataset path

# Load the dataset
df = pd.read_csv(file_path)
print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns.")

# Step 2: Handle Missing Values for Numeric Columns Only
numeric_cols = df.select_dtypes(include=['number']).columns  # Get only numeric columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())  # Fill missing values for numeric columns

# Step 3: Encode Categorical Data
df['type'] = LabelEncoder().fit_transform(df['type'])  # Encode the 'type' column

# Step 4: Prepare Features (X) and Target (y)
X = df.drop('quality', axis=1)  # Drop the target column 'quality' from features
y = df['quality']  # Target is the 'quality' column

# Step 5: Scale the Features
scaler = StandardScaler()  # Initialize StandardScaler
X_scaled = scaler.fit_transform(X)  # Scale the features

# Step 6: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)  # Train-test split

# Define a dictionary of models to test
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostRegressor(n_estimators=100, random_state=42),
    'KNN': KNeighborsRegressor(n_neighbors=5),
    'MLP': MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
}

# Function to calculate RMSE
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)  # Train the model
    y_train_pred = model.predict(X_train)  # Predict on train data
    y_test_pred = model.predict(X_test)  # Predict on test data
    
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))  # Calculate RMSE for train set
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))  # Calculate RMSE for test set
    
    return train_rmse, test_rmse

# Dictionary to store results
results = {}

# Step 7: Evaluate each model
for model_name, model in models.items():
    train_rmse, test_rmse = evaluate_model(model, X_train, y_train, X_test, y_test)
    results[model_name] = {'Train RMSE': train_rmse, 'Test RMSE': test_rmse}

# Step 8: Display results in a more readable format and sort by Test RMSE
results_df = pd.DataFrame(results).T  # Convert results dictionary to DataFrame

# Sorting the results by Test RMSE in ascending order to compare model performance
results_df = results_df.sort_values(by='Test RMSE', ascending=True)

# Print a more detailed comparison of model performance
print("\nModel Comparison (Sorted by Test RMSE):")
print(results_df)

# Optionally: Save the results to a CSV file for further analysis
results_df.to_csv('model_comparison_results.csv', index=True)


Dataset loaded with 6497 rows and 13 columns.




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000630 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1569
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 12
[LightGBM] [Info] Start training from score 5.817779

Model Comparison (Sorted by Test RMSE):
                   Train RMSE  Test RMSE
Random Forest        0.229475   0.561871
LightGBM             0.476253   0.599385
XGBoost              0.279353   0.603564
Gradient Boosting    0.639749   0.642847
MLP                  0.616611   0.650296
AdaBoost             0.713917   0.677228
KNN                  0.569016   0.680633


NameError: name 'LogisticRegression' is not defined