In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the red-wine quality CSV. The default is the path used when this
# notebook was first written; set the WINE_DATA_PATH environment variable to
# point at your own copy instead of editing this cell.
DATA_PATH = os.environ.get(
    "WINE_DATA_PATH",
    "C:/Users/HP/Downloads/RedWine/winequality-red.csv",
)
df = pd.read_csv(DATA_PATH)

# Quick sanity checks: dimensions, column names, and per-column null counts.
print(df.shape)
print(df.columns)
print(df.isnull().sum())


(1599, 12)
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [3]:
# State the prediction target, then report how many nulls each column holds.
print("\nTarget variable: quality")

print("\nMissing values per column:")
print(df.isna().sum())  # isna() is the same check as isnull()


Target variable: quality

Missing values per column:
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [4]:
# "quality" is the regression target; every other column is used as a feature.
y = df["quality"]
X = df.drop("quality", axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train/Test split completed.")
print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Train/Test split completed.
Training samples: 1279
Testing samples: 320


In [5]:
# EXP-01: plain least-squares baseline on the raw (unscaled) features.
lr = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator

# Score the held-out rows.
pred1 = lr.predict(X_test)
mse1, r21 = mean_squared_error(y_test, pred1), r2_score(y_test, pred1)

print("EXP-01: Linear Regression (No Preprocessing)")
print("MSE:", mse1)
print("R² Score:", r21)


EXP-01: Linear Regression (No Preprocessing)
MSE: 0.3900251439643167
R² Score: 0.4031803412790683


In [6]:
# EXP-02: standardize the features, then fit the same linear model.
# The scaler lives inside the pipeline, so it is fitted on the training rows
# only and merely applied to the test rows at predict time.
# NOTE: least-squares predictions do not change under feature standardization
# (beyond floating-point rounding), so metrics matching EXP-01 are expected.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("lr", LinearRegression()),
    ]
).fit(X_train, y_train)

pred2 = pipe.predict(X_test)
mse2, r22 = mean_squared_error(y_test, pred2), r2_score(y_test, pred2)

print("EXP-02: Linear Regression + StandardScaler")
print("MSE:", mse2)
print("R² Score:", r22)


EXP-02: Linear Regression + StandardScaler
MSE: 0.3900251439643171
R² Score: 0.40318034127906766


In [7]:
# EXP-03: smaller forest (50 trees, each capped at depth 10). The fixed
# random_state makes the fitted forest reproducible.
rf1 = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42).fit(
    X_train, y_train
)

# Evaluate on the 80/20 hold-out set.
pred3 = rf1.predict(X_test)
mse3, r23 = mean_squared_error(y_test, pred3), r2_score(y_test, pred3)

print("EXP-03: Random Forest (50 trees, depth=10)")
print("MSE:", mse3)
print("R² Score:", r23)


EXP-03: Random Forest (50 trees, depth=10)
MSE: 0.3269784521147545
R² Score: 0.4996549037410771


In [8]:
# EXP-04: larger forest (100 trees, each capped at depth 15), same seed and
# same 80/20 split as the previous experiments.
rf2 = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42).fit(
    X_train, y_train
)

# Evaluate on the 80/20 hold-out set.
pred4 = rf2.predict(X_test)
mse4, r24 = mean_squared_error(y_test, pred4), r2_score(y_test, pred4)

print("EXP-04: Random Forest (100 trees, depth=15)")
print("MSE:", mse4)
print("R² Score:", r24)


EXP-04: Random Forest (100 trees, depth=15)
MSE: 0.3021208068591839
R² Score: 0.5376922754018973


In [9]:
# EXP-05: same forest settings as rf2 (100 trees, depth 15), but trained on a
# 70/30 split instead of 80/20.
# NOTE(review): the hold-out rows here differ from X_test/y_test, so these
# metrics are not a strictly like-for-like comparison with EXP-01..EXP-04.
X_train_70, X_test_30, y_train_70, y_test_30 = train_test_split(
    X, y, test_size=0.3, random_state=42
)

rf_70 = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42).fit(
    X_train_70, y_train_70
)

# Evaluate on the 30% hold-out set.
pred_70 = rf_70.predict(X_test_30)
mse_70, r2_70 = mean_squared_error(y_test_30, pred_70), r2_score(y_test_30, pred_70)

print("EXP-05: Random Forest (100 trees, depth=15) with 70/30 split")
print("MSE:", mse_70)
print("R² Score:", r2_70)


EXP-05: Random Forest (100 trees, depth=15) with 70/30 split
MSE: 0.34852263619253543
R² Score: 0.45030007476255884
