In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# problem statment 

# Data Gathering

In [2]:
df = pd.read_csv("WineQT.csv")
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2


# Exploratary Data Analysis

In [3]:
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
dtypes: float64(11)
memory usage: 98.4 KB


# Model Training

In [5]:
x = df.drop("alcohol",axis =1)
y = df["alcohol"]
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size = 0.8, random_state = 5)

# Simple Linear Regression

In [6]:
from sklearn.linear_model import LinearRegression
LR_model = LinearRegression()
LR_model.fit(x_train,y_train)

In [7]:
def model_eval(actual, pred):
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    mse = mean_squared_error(actual, pred)
    print(f"MSE : {mse}")
    print(f"RMSE : {np.sqrt(mse)}")
    
    mae = mean_absolute_error(actual, pred)
    print(f"MAE : {mae}")
    
    r2 = r2_score(actual, pred)
    print(f"R2 Score : {r2}")
    return "Accuracy Score"

In [8]:
y_pred_test = LR_model.predict(x_test)
model_eval(y_test, y_pred_test)

MSE : 0.41174154566154225
RMSE : 0.6416709013673149
MAE : 0.4815138013334602
R2 Score : 0.6540363254860526


'Accuracy Score'

In [9]:
y_pred_train = LR_model.predict(x_train)
model_eval(y_train,y_pred_train)

MSE : 0.3569635675452582
RMSE : 0.5974642813970206
MAE : 0.46046762760665977
R2 Score : 0.693579314537567


'Accuracy Score'

# KNN 

In [53]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

In [78]:
norm = MinMaxScaler()
normalized_x = norm.fit_transform(x)
df_x_norm = pd.DataFrame(normalized_x, columns = x.columns)
df_x_norm
norm_x = norm.fit_transform(x_train)

In [79]:
knn_model_hyp = KNeighborsRegressor()
hyp = {"n_neighbors":np.arange(2,20),
       "p":[1,2]}
gscv = GridSearchCV(knn_model_hyp, hyp)
gscv.fit(norm_x, y_train)
gscv.best_estimator_

# Model Training

In [81]:
x_train,x_test, y_train,y_test = train_test_split(df_x_norm, y, random_state=10, train_size=0.8)
knn = KNeighborsRegressor(n_neighbors=7)
knn.fit(x_train,y_train)

# Testing Data Evaluation

In [82]:
y_pred_test = knn.predict(x_test)
model_eval(y_test, y_pred_test)

MSE : 0.32937881848377554
RMSE : 0.5739153408681245
MAE : 0.43985236015595763
R2 Score : 0.7093445701978934


'Accuracy Score'

In [83]:
y_pred_train = knn.predict(x_train)
model_eval(y_train, y_pred_train)

MSE : 0.3270035402945432
RMSE : 0.5718422337450629
MAE : 0.43305720537230386
R2 Score : 0.7226950145722699


'Accuracy Score'

# Decision Tree

In [15]:
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [16]:
model = DecisionTreeRegressor(random_state=20)
model.fit(x_train,y_train)

In [17]:
y_pred_test = model.predict(x_test)
model_eval(y_test, y_pred_test)

MSE : 0.49364628820960704
RMSE : 0.7025996642538388
MAE : 0.45065502183406114
R2 Score : 0.564389189534829


'Accuracy Score'

In [18]:
y_pred_train = model.predict(x_train)
model_eval(y_train,y_pred_train)

MSE : 7.597859287624679e-07
RMSE : 0.0008716570017859479
MAE : 5.470459190373791e-05
R2 Score : 0.9999993556876304


'Accuracy Score'

# Random Forest

In [19]:
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

In [20]:
model_comparision = {}

In [21]:
rf_model = RandomForestRegressor()
rf_model.fit(x_train,y_train)

In [22]:
y_pred_test = rf_model.predict(x_test)
model_eval(y_test,y_pred_test)

MSE : 0.2459389165499878
RMSE : 0.49592228882153283
MAE : 0.3552874939403749
R2 Score : 0.7829748681959576


'Accuracy Score'

In [23]:
y_pred_train = rf_model.predict(x_train)
model_eval(y_train,y_pred_train)


MSE : 0.041064471788379665
RMSE : 0.20264370651066285
MAE : 0.14736670617832218
R2 Score : 0.9651765765574982


'Accuracy Score'

In [72]:
rf_model_hyp = RandomForestRegressor(random_state=3)
hyp = {'n_estimators':np.arange(10,200,10),
    'criterion':['squared_error','absolute_error'],
    'max_depth':np.arange(5,10),
    'min_samples_split':np.arange(2,10),
    'min_samples_leaf':np.arange(2,10)
}
rscv = RandomizedSearchCV(rf_model_hyp, hyp, cv =4)
rscv.fit(x_train,y_train)
rscv.best_estimator_

In [75]:
rf_tune = RandomForestRegressor(max_depth=9, min_samples_leaf=3, min_samples_split=4,
                      n_estimators=91, random_state=3)
rf_tune.fit(x_train,y_train)

In [76]:
y_pred_test = rf_tune.predict(x_test)
model_eval(y_test,y_pred_test)

MSE : 0.25622898948787215
RMSE : 0.5061906651528376
MAE : 0.37488565517467154
R2 Score : 0.7738945466797666


'Accuracy Score'

In [77]:
y_pred_train = rf_tune.predict(x_train)
model_eval(y_train,y_pred_train)

MSE : 0.110319943208509
RMSE : 0.3321444613545573
MAE : 0.2456762412717235
R2 Score : 0.9064466696101574


'Accuracy Score'

In [85]:
dict1 = {"Test_ACC":{"Simple Linear Regression": 0.65,"KNN":0.70,"Decision Tree":0.56, "Random Forest":0.78,"Random Forest Hyperparameter":0.77 },
            "Train_ACC" :{"Simple Linear Regression": 0.69,"KNN":0.72,"Decision Tree":0.99,"Random Forest":0.96, "Random Forest Hyperparameter": 0.90}}
         
Model_comp_df = pd.DataFrame(dict1)
Model_comp_df.style.format("{:.2%}").background_gradient(cmap='Blues')

Unnamed: 0,Test_ACC,Train_ACC
Simple Linear Regression,65.00%,69.00%
KNN,70.00%,72.00%
Decision Tree,56.00%,99.00%
Random Forest,78.00%,96.00%
Random Forest Hyperparameter,77.00%,90.00%


In [86]:
import pickle
with open("model_pkl.pkl", "wb") as file:
    pickle.dump(rf_tune, file)

In [89]:
with open("model_pkl.pkl", "rb") as file:
    model = pickle.load(file)


In [99]:
model.predict(np.array([0.23,0.18,0.31,0.10,0.10,0.11,0.14,0.45,0.52,0.12],ndmin=2))[0]



10.18439620074527

In [106]:
model.predict(x.iloc[[2]])[0]

10.272043496279753

In [108]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [107]:
x.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65


In [96]:
y.head(50)

0      9.4
1      9.8
2      9.8
3      9.8
4      9.4
5      9.4
6      9.4
7     10.0
8      9.5
9      9.2
10     9.9
11     9.1
12    10.5
13     9.2
14     9.7
15     9.5
16     9.4
17     9.7
18     9.3
19     9.5
20     9.4
21     9.8
22    10.1
23     9.8
24     9.2
25     9.6
26    10.8
27     9.7
28    10.5
29     9.3
30    10.5
31    10.3
32    13.1
33     9.2
34     9.2
35     9.4
36     9.4
37     9.4
38    10.2
39     9.6
40    10.0
41     9.4
42     9.2
43     9.3
44     9.8
45    10.9
46    10.9
47    10.7
48    10.5
49     9.5
Name: alcohol, dtype: float64