In [109]:
import os

import pandas as pd

# Location of the wine CSV. The original machine-specific path is kept as the
# default so existing runs behave the same; set the WINE_CSV_PATH environment
# variable to run the notebook on another machine without editing the code.
WINE_CSV_PATH = os.environ.get(
    "WINE_CSV_PATH",
    r"C:\Users\USER\.vscode\Python\Data science\wine.csv",
)

# Load the dataset into the DataFrame that every later cell works from.
data = pd.read_csv(WINE_CSV_PATH)


In [110]:
# Preview the first five rows to check that the columns loaded as expected.
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [111]:
# Dataset dimensions as (rows, columns).
data.shape

(1599, 12)

In [112]:
# Per-column dtypes and non-null counts, to spot missing values before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [113]:
# Separate the target from the predictors: "quality" is the value the models
# learn to predict, and every other column is used as a feature.
y = data["quality"]
x = data.drop(columns=["quality"])

In [114]:
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [115]:
# List the distinct quality labels present in the target.
y.unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [116]:
def mae(x, y, model, train_size=0.8, random_state=42):
    """Fit ``model`` on a train split and return its MAE on the held-out rows.

    Parameters
    ----------
    x : features, passed straight to ``train_test_split``.
    y : target values aligned with ``x``.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
        fitted in place on the training portion.
    train_size : share of the rows used for training. Defaults to 0.8, the
        value that used to be hard-coded.
    random_state : seed forwarded to ``train_test_split``. The fixed default
        means every model is scored on the same hold-out rows and the score
        is reproducible; pass ``None`` to get a fresh random split per call.

    Returns
    -------
    The mean absolute error between the held-out targets and the model's
    predictions for them.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=train_size, random_state=random_state
    )
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # Return directly instead of binding a local named ``mae``, which would
    # shadow this function's own name inside its body.
    return mean_absolute_error(y_test, y_pred)

# Note: the cross-validation helper below negates scikit-learn's "neg_mean_squared_error", so it reports plain MSE and a lower score is better.

In [117]:
import numpy as np
def score(x, y, model):
    """Return the mean 5-fold cross-validated MSE of ``model`` on ``x``/``y``.

    scikit-learn reports this metric negated ("neg_mean_squared_error"), so
    the per-fold values are flipped back to positive before averaging.
    A lower result is better.
    """
    fold_scores = cross_val_score(
        model, x, y, scoring="neg_mean_squared_error", cv=5
    )
    per_fold_mse = -1 * fold_scores
    return np.mean(per_fold_mse)

In [118]:
# Candidate regressors. The forest and the tree are randomised estimators, so
# they get a fixed seed; without it the scores printed below change on every
# run and cannot be compared between runs.
models = [
    LinearRegression(),
    RandomForestRegressor(random_state=42),
    DecisionTreeRegressor(random_state=42),
]

# One entry per model, in the same order as `models`.
mae_scores = []    # hold-out mean absolute error returned by mae()
cross_scores = []  # mean cross-validated MSE returned by score()
for model in models:
    mae_score = mae(x, y, model)
    cross_score = score(x, y, model)
    mae_scores.append(mae_score)
    cross_scores.append(cross_score)

print(mae_scores)
print(cross_scores)


[0.5047743204901325, 0.38690625, 0.465625]
[0.4366293880320983, 0.42744501547805636, 0.8499549373040752]


In [119]:
# Summarise both metrics in one table: one row per metric, one column per
# model. The names are listed in the same order as the `models` list.
model_names = ["Linear Regression", "Random Forest", "Decision Tree"]
dta = pd.DataFrame.from_dict(
    {"MAE": mae_scores, "CROSS": cross_scores},
    orient="index",
    columns=model_names,
)
dta

Unnamed: 0,Linear Regression,Random Forest,Decision Tree
MAE,0.504774,0.386906,0.465625
CROSS,0.436629,0.427445,0.849955


In this comparison, Random Forest performs best on both metrics: it has the lowest hold-out MAE and the lowest cross-validated MSE.

In [120]:
# Refit a random forest on an 80/20 split for the final evaluation. Both the
# split and the forest are seeded so the held-out score and the model that is
# pickled later are reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=42
)
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)

In [121]:
# R^2 (coefficient of determination) of the fitted forest on the held-out split.
model.score(x_test,y_test)

0.5151226731810454

In [122]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

# Hyper-parameter values the random search samples from
# (3 * 4 * 3 * 3 = 108 possible combinations).
param_distributions = {
    "n_estimators": [10, 100, 1000],
    "criterion": ["friedman_mse", "squared_error", "poisson", "absolute_error"],
    "min_samples_leaf": [1, 5, 10],
    "max_leaf_nodes": [10, 100, 200],
}

# Note: despite the name `grid`, this is a randomised search. With n_iter=2
# only two of the combinations are tried; raise n_iter for a more thorough
# (and slower) search. The search and the forest are both seeded so the
# sampled candidates and their scores are the same on every run.
grid = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions,
    n_iter=2,
    cv=5,
    return_train_score=False,
    random_state=42,
)
grid.fit(x, y)

In [124]:
# Raw per-candidate results of the search (timings, sampled parameters, scores).
grid.cv_results_

{'mean_fit_time': array([0.16044788, 0.73367124]),
 'std_fit_time': array([0.05380911, 0.02284134]),
 'mean_score_time': array([0.00394688, 0.01051502]),
 'std_score_time': array([0.00091214, 0.00183306]),
 'param_n_estimators': masked_array(data=[10, 100],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[10, 5],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_max_leaf_nodes': masked_array(data=[200, 10],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_criterion': masked_array(data=['poisson', 'friedman_mse'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 10,
   'min_samples_leaf': 10,
   'max_leaf_nodes': 200,
   'criterion': 'poisson'},
  {'n_estimators': 100,
   'min_samples_leaf': 5,
   'max_leaf_nodes': 10,
   'criterion': 'friedman_mse'

In [127]:
# Condense the search results to the sampled hyper-parameters and the mean
# cross-validated test score of each candidate.
best = pd.DataFrame(grid.cv_results_).loc[
    :,
    [
        "param_n_estimators",
        "param_min_samples_leaf",
        "param_max_leaf_nodes",
        "param_criterion",
        "mean_test_score",
    ],
]
best

Unnamed: 0,param_n_estimators,param_min_samples_leaf,param_max_leaf_nodes,param_criterion,mean_test_score
0,10,10,200,poisson,0.309726
1,100,5,10,friedman_mse,0.301364


In [128]:
# Show the estimator currently bound to `model`: the forest fitted on the
# train split earlier, not the result of the hyper-parameter search (`grid`).
model

In [130]:
import pickle

# Persist the fitted model to disk. The context manager ensures the file
# handle is flushed and closed even if pickling raises; a bare open() passed
# straight to pickle.dump is never explicitly closed.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [132]:
 # Display the DataFrame; pandas elides the middle rows in the rendered table.
 data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5
