In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# problem statment 

# Data Gathering

In [2]:
df = pd.read_csv("WineQT.csv")
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2


# Exploratary Data Analysis

In [3]:
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
dtypes: float64(11)
memory usage: 98.4 KB


# Model Training

In [5]:
x = df.drop("alcohol",axis =1)
y = df["alcohol"]
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size = 0.8, random_state = 5)

# Simple Linear Regression

In [6]:
from sklearn.linear_model import LinearRegression
LR_model = LinearRegression()
LR_model.fit(x_train,y_train)

In [7]:
def model_eval(actual, pred):
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    mse = mean_squared_error(actual, pred)
    print(f"MSE : {mse}")
    print(f"RMSE : {np.sqrt(mse)}")
    
    mae = mean_absolute_error(actual, pred)
    print(f"MAE : {mae}")
    
    r2 = r2_score(actual, pred)
    print(f"R2 Score : {r2}")
    return "Accuracy Score"

In [8]:
y_pred_test = LR_model.predict(x_test)
model_eval(y_test, y_pred_test)

MSE : 0.41174154566154225
RMSE : 0.6416709013673149
MAE : 0.4815138013334602
R2 Score : 0.6540363254860526


'Accuracy Score'

In [9]:
y_pred_train = LR_model.predict(x_train)
model_eval(y_train,y_pred_train)

MSE : 0.3569635675452582
RMSE : 0.5974642813970206
MAE : 0.46046762760665977
R2 Score : 0.693579314537567


'Accuracy Score'

# KNN 

In [10]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

In [11]:
norm = MinMaxScaler()
normalized_x = norm.fit_transform(x)
df_x_norm = pd.DataFrame(normalized_x, columns = x.columns)
df_x_norm

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
0,0.247788,0.397260,0.00,0.068493,0.106845,0.149254,0.098940,0.567548,0.606299,0.137725
1,0.283186,0.520548,0.00,0.116438,0.143573,0.358209,0.215548,0.494126,0.362205,0.209581
2,0.283186,0.438356,0.04,0.095890,0.133556,0.208955,0.169611,0.508811,0.409449,0.191617
3,0.584071,0.109589,0.56,0.068493,0.105175,0.238806,0.190813,0.582232,0.330709,0.149701
4,0.247788,0.397260,0.00,0.068493,0.106845,0.149254,0.098940,0.567548,0.606299,0.137725
...,...,...,...,...,...,...,...,...,...,...
1138,0.150442,0.267123,0.13,0.095890,0.106845,0.417910,0.120141,0.416300,0.535433,0.251497
1139,0.194690,0.342466,0.08,0.068493,0.093489,0.402985,0.113074,0.472834,0.535433,0.293413
1140,0.141593,0.328767,0.08,0.075342,0.130217,0.462687,0.134276,0.354626,0.559055,0.149701
1141,0.115044,0.294521,0.10,0.089041,0.083472,0.567164,0.159011,0.370778,0.614173,0.257485


# Model Training

In [12]:
x_train,x_test, y_train,y_test = train_test_split(df_x_norm, y, random_state=10, train_size=0.8)
knn = KNeighborsRegressor()
knn.fit(x_train,y_train)

# Testing Data Evaluation

In [13]:
y_pred_test = knn.predict(x_test)
model_eval(y_test, y_pred_test)

MSE : 0.32433551673758393
RMSE : 0.5695046239826187
MAE : 0.42365356622707434
R2 Score : 0.7137949566659998


'Accuracy Score'

In [14]:
y_pred_train = knn.predict(x_train)
model_eval(y_train, y_pred_train)

MSE : 0.2837106248512001
RMSE : 0.5326449331883296
MAE : 0.40068563093172865
R2 Score : 0.7594081990085197


'Accuracy Score'

# Decision Tree

In [15]:
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [16]:
model = DecisionTreeRegressor(random_state=20)
model.fit(x_train,y_train)

In [17]:
y_pred_test = model.predict(x_test)
model_eval(y_test, y_pred_test)

MSE : 0.49364628820960704
RMSE : 0.7025996642538388
MAE : 0.45065502183406114
R2 Score : 0.564389189534829


'Accuracy Score'

In [None]:
y_pred_train = model.predict(x_train)
model_eval(y_train,y_pred_train)

In [19]:
# import warnings
# warnings.filterwarnings("ignore")

In [None]:
hyp_model = DecisionTreeRegressor()
hyp = {
    'criterion': ['squared_error','absolute_error'], 
    'max_depth' : np.arange(3,8),
    'min_samples_split' :np.arange(5,15),
    'min_samples_leaf' : np.arange(2,8)
}

gscv = GridSearchCV(hyp_model, hyp, cv=5)
gscv.fit(x_train,y_train)

In [None]:
gscv.best_params_