In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [2]:
# Load the dataset.
# The CSV location defaults to the original local path, but it can be
# overridden by setting the WINE_DATA_PATH environment variable so the
# notebook runs on other machines without editing this cell. An unset or
# empty variable falls back to the default.
file_path = os.environ.get("WINE_DATA_PATH") or "F:/wine 1/winequalityN.csv"
wine_data = pd.read_csv(file_path)

# Display the first 5 rows of the dataset
wine_data.head()



Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
# Count the missing (NaN) entries in each column
wine_data.isna().sum()



type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

In [4]:
# Separate the response from the predictors: 'quality' is the target,
# every remaining column (including the categorical 'type') is a feature.
y = wine_data['quality']
X = wine_data.drop('quality', axis='columns')


In [5]:
# Column-wise preprocessing applied ahead of the regressor.

# Numeric features are all columns of X other than 'type'
# (Index.difference returns them sorted by name).
numeric_feature_names = X.columns.difference(['type'])

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# 'type' is one-hot encoded; everything else goes through the numeric branch.
# Both branches together already cover every column of X, so
# remainder='passthrough' has nothing left to pass through.
preprocessor = ColumnTransformer(
    [
        ('cat', OneHotEncoder(), ['type']),
        ('num', numeric_transformer, numeric_feature_names),
    ],
    remainder='passthrough',
)


In [6]:
# Hold out 20% of the rows for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Full model: preprocessing followed by a seeded random forest regressor
pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Fit on the training portion only
pipeline_rf.fit(X_train, y_train)


In [19]:
# Score the baseline forest on the held-out test rows
y_pred_rf = pipeline_rf.predict(X_test)
rf_test_mse = mean_squared_error(y_test, y_pred_rf)
print("Random Forest Mean Squared Error:", rf_test_mse)


Random Forest Mean Squared Error: 0.3169946153846154


In [None]:
# Hyperparameter tuning using RandomizedSearchCV (evaluates a random sample of the parameter combinations)

In [10]:
# RandomizedSearchCV is imported in the notebook's top import cell.

# Candidate hyperparameter values for the forest.
# Keys use the '<step>__<parameter>' form so they are routed to the
# 'regressor' step of the pipeline.
param_grid = {
    'regressor__n_estimators': [100, 150, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2]
}

# Initialize RandomizedSearchCV: 10 of the 3*3*2*2 = 36 combinations are
# sampled and each is scored with 5-fold cross-validation.
randomized_search = RandomizedSearchCV(
    pipeline_rf,
    param_distributions=param_grid,
    n_iter=10,  # Number of parameter settings to sample
    cv=5,
    scoring='neg_mean_squared_error',  # Negated MSE, so larger is better
    random_state=42,  # Makes the sampled settings repeatable
    n_jobs=-1,  # Use all available cores
    verbose=10
)

# Fit RandomizedSearchCV on the training split only
randomized_search.fit(X_train, y_train)

# Display the best parameters and the corresponding score.
# best_score_ is a negated MSE, so flip the sign to report a plain MSE.
print("Best parameters found: ", randomized_search.best_params_)
print("Best MSE: ", -randomized_search.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found:  {'regressor__n_estimators': 200, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 1, 'regressor__max_depth': 20}
Best MSE:  0.39791233650195423


In [20]:
# mean_squared_error is already imported in the notebook's top import cell,
# so it is not re-imported here.

# Get the best model from RandomizedSearchCV: best_estimator_ is the pipeline
# refit on the whole training split with the best sampled parameters
# (the search uses the default refit behaviour).
best_model = randomized_search.best_estimator_

# Make predictions with the best model on the held-out test rows
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
print("Best Model Mean Squared Error using RandomizedSearchCV:", mean_squared_error(y_test, y_pred_best))

Best Model Mean Squared Error using RandomizedSearchCV: 0.31597839027529306


In [None]:
# Hyperparameter tuning using GridSearchCV (exhaustively evaluates every parameter combination)

In [14]:
# Search space for the exhaustive search: 3 * 3 * 2 * 2 = 36 combinations.
# Each keyword is '<step>__<parameter>', targeting the pipeline's 'regressor' step.
param_grid = dict(
    regressor__n_estimators=[100, 150, 200],
    regressor__max_depth=[None, 10, 20],
    regressor__min_samples_split=[2, 5],
    regressor__min_samples_leaf=[1, 2],
)

# Every combination is scored with 5-fold cross-validation on negated MSE
grid_search = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,   # run the fits on all available cores
    verbose=10,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Report the winning combination; the score is a negated MSE, so flip its sign
print("Best parameters found: ", grid_search.best_params_)
print("Best MSE: ", -grid_search.best_score_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found:  {'regressor__max_depth': None, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}
Best MSE:  0.39652459381246763


In [21]:
# Take the best pipeline found by GridSearchCV and score it on the held-out test rows.
# Note: this rebinds best_model / y_pred_best, replacing any earlier values.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

grid_test_mse = mean_squared_error(y_test, y_pred_best)
print("Best Model Mean Squared Error using GridSearch:", grid_test_mse)


Best Model Mean Squared Error using GridSearch: 0.315087576923077
