In [1]:
import warnings
# Silence every warning for the rest of the session. This runs before the
# remaining imports, so warnings raised while importing are hidden as well.
# NOTE(review): a blanket "ignore" also hides deprecation notices — consider a
# narrower filter (specific category or module).
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler

In [2]:
# Base location of the course datasets. By default the files are read straight
# from the mlcourse.ai GitHub repository; to save Internet traffic, point this
# at the data/ folder of a local clone of https://github.com/Yorko/mlcourse.ai.
DATA_PATH = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/master/data/"

In [3]:
# Load the white-wine quality table; the file separates fields with ";".
data = pd.read_csv(
    DATA_PATH + "winequality-white.csv",
    sep=";",
)

In [4]:
# Preview the first five rows to confirm the columns were parsed as expected.
data.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
# Summarise column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [6]:
#X_train = data.iloc[:3428, :11].values
#y_train = data.iloc[:3428, 11].values

In [7]:
#X_train.shape
# print(x_train)

In [8]:
#y_train.shape
# print(y_train)

In [9]:
#data["quality"].count()

In [10]:
#X_test = data.iloc[3428:, :11].values
#y_test = data.iloc[3428:, 11].values

In [11]:
#X_test.shape

In [12]:
#y_test.shape

In [13]:
# Separate the target column ("quality") from the feature matrix, which is
# made of every remaining column.
X = data.drop(columns="quality")
y = data["quality"]

In [14]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [15]:
# Learn each feature's mean and standard deviation from the training split
# only, then standardise the training features with those statistics.
# StandardScaler is unsupervised and ignores any target, so none is passed.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [16]:
# Inspect the standardised training features (NumPy elides the middle of large arrays).
print(X_train_scaled)

[[-0.99273     0.58375489  2.64116886 ... -0.32100149  0.10300887
  -1.41119853]
 [-0.41141461  0.38747125 -0.84407222 ... -0.05551066 -0.25313273
  -0.51729161]
 [ 0.16990079  0.28932942  1.50643921 ... -0.78561044 -0.52023894
  -0.11097029]
 ...
 [-1.22525616 -0.00509605  0.04750108 ...  1.33831618 -0.87638054
   1.83937207]
 [ 2.72768852 -0.49580516  0.1285532  ... -0.65286502 -1.49962834
  -0.67982014]
 [-1.10899308  0.1911876  -0.27670739 ...  1.47106159 -0.69830974
   1.92063633]]


In [17]:
# Reuse the statistics learned on the training split; the scaler is not refitted here.
X_test_scaled = scaler.transform(X=X_test)

In [18]:
# Inspect the test features after applying the training-set scaling.
print(X_test_scaled)

[[ 0.40242694 -0.29952151 -0.43881163 ... -0.4537469  -0.96541594
  -1.24867   ]
 [-0.29515153 -0.29952151 -0.11460316 ... -1.51571021  0.28107967
   0.45787957]
 [ 0.16990079 -1.57536521 -0.43881163 ...  2.06841596 -0.43120354
  -0.76108441]
 ...
 [-0.52767769 -0.20137969 -0.7630201  ...  0.20998017  0.01397347
   1.67684354]
 [-2.27162387  0.6328258   0.04750108 ...  0.34272558 -0.78734514
  -0.35476308]
 [-2.27162387  0.53468398 -1.57354128 ...  1.86929784 -0.25313273
  -0.05679411]]


In [19]:
# Ordinary least-squares baseline on the standardised training features.
# `fit` returns the estimator it was called on, so `linreg` and `linreg_model`
# are two names for the same fitted object.
linreg = LinearRegression()
linreg.fit(X_train_scaled, y_train)
linreg_model = linreg

In [20]:
# Training-set score of the linear model (R^2 for scikit-learn regressors).
linreg_model.score(X=X_train_scaled, y=y_train)

0.2904316386901199

# Task 1

Mean squared error of linear regression on the training dataset.

In [21]:
# Linear-model predictions for the rows it was trained on.
y_train_predicted = linreg_model.predict(X=X_train_scaled)

In [22]:
# Quick look at the predicted quality values for the training rows.
print(y_train_predicted)

[5.57047067 5.43683851 5.56141314 ... 6.87916139 5.48987436 6.84181167]


In [23]:
# Task 1: mean squared error of linear regression on the training split.
mean_squared_error(y_true=y_train, y_pred=y_train_predicted)

0.5580606489803572

In [24]:
# Held-out score of the linear model (R^2 for scikit-learn regressors).
linreg_model.score(X=X_test_scaled, y=y_test)

0.24987105940272625

In [25]:
# Linear-model predictions for the held-out rows.
y_test_predicted = linreg_model.predict(X=X_test_scaled)

In [26]:
# Task 1: mean squared error of linear regression on the test split.
mean_squared_error(y_true=y_test, y_pred=y_test_predicted)

0.5842473102404545

The value above is the mean squared error of linear regression on the test dataset.

# Task 2

In [27]:
# Show the hyper-parameters the linear model was built with (all defaults here).
linreg_model.get_params(deep=True)

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': 'deprecated',
 'positive': False}

In [28]:
# One row per feature holding the weight the linear model learned for it.
# The features were standardised, so the weights are on a comparable scale.
linreg_coeffs = pd.DataFrame(
    {"Coefficients": linreg.coef_},
    index=X.columns,
)

In [29]:
# Plain-text dump of the linear-regression weights, in the original feature order.
print(linreg_coeffs)

                      Coefficients
fixed acidity             0.097822
volatile acidity         -0.192260
citric acid              -0.000183
residual sugar            0.538164
chlorides                 0.008127
free sulfur dioxide       0.042180
total sulfur dioxide      0.014304
density                  -0.665720
pH                        0.150036
sulphates                 0.062053
alcohol                   0.129533


In [30]:
# Rank the features from the largest to the smallest weight.
# NOTE(review): the ordering uses signed values, so a strongly negative weight
# ends up at the bottom; compare magnitudes when judging influence.
linreg_coeffs.sort_values("Coefficients", ascending=False)

Unnamed: 0,Coefficients
residual sugar,0.538164
pH,0.150036
alcohol,0.129533
fixed acidity,0.097822
sulphates,0.062053
free sulfur dioxide,0.04218
total sulfur dioxide,0.014304
chlorides,0.008127
citric acid,-0.000183
volatile acidity,-0.19226


Density has the strongest influence on wine quality: its coefficient has the largest magnitude (≈ −0.67), and the effect is negative.

Alcohol also influences wine quality, in a positive direction, but less strongly than density.

# Task 3 

In [31]:
# Lasso regression with a fixed L1 penalty of 0.01, fitted on the standardised
# training features. The trailing expression displays the fitted estimator.
lasso1 = Lasso(alpha=0.01, random_state=17).fit(X_train_scaled, y_train)
lasso1

Lasso(alpha=0.01, random_state=17)

In [32]:
# Training-set score of the fixed-alpha Lasso (R^2 for scikit-learn regressors).
lasso1.score(X=X_train_scaled, y=y_train)

0.28315074467977797

In [33]:
# One row per feature holding the weight learned by the fixed-alpha Lasso.
lasso1_coeffs = pd.DataFrame(
    {"Coefficients": lasso1.coef_},
    index=X.columns,
)

In [34]:
# Plain-text dump of the Lasso weights, in the original feature order.
print(lasso1_coeffs)

                      Coefficients
fixed acidity            -0.000000
volatile acidity         -0.188479
citric acid              -0.000000
residual sugar            0.256363
chlorides                -0.002747
free sulfur dioxide       0.043088
total sulfur dioxide     -0.000000
density                  -0.235492
pH                        0.067277
sulphates                 0.029722
alcohol                   0.322425


In [35]:
# Rank the features by signed Lasso weight, largest first; features whose
# weight was shrunk to zero sit between the positive and negative ones.
lasso1_coeffs.sort_values("Coefficients", ascending=False)

Unnamed: 0,Coefficients
alcohol,0.322425
residual sugar,0.256363
pH,0.067277
free sulfur dioxide,0.043088
sulphates,0.029722
fixed acidity,-0.0
citric acid,-0.0
total sulfur dioxide,-0.0
chlorides,-0.002747
volatile acidity,-0.188479


With alpha = 0.01, Lasso shrinks the coefficients of fixed acidity, citric acid and total sulfur dioxide to exactly zero, so these features have the least impact on wine quality.

In [36]:
# Let cross-validation pick the L1 penalty from 200 log-spaced candidates
# between 1e-6 and 1e2. The trailing expression displays the fitted estimator.
alphas = np.logspace(start=-6, stop=2, num=200)
lasso_cv = LassoCV(alphas=alphas, random_state=17).fit(X_train_scaled, y_train)
lasso_cv

LassoCV(alphas=array([1.00000000e-06, 1.09698580e-06, 1.20337784e-06, 1.32008840e-06,
       1.44811823e-06, 1.58856513e-06, 1.74263339e-06, 1.91164408e-06,
       2.09704640e-06, 2.30043012e-06, 2.52353917e-06, 2.76828663e-06,
       3.03677112e-06, 3.33129479e-06, 3.65438307e-06, 4.00880633e-06,
       4.39760361e-06, 4.82410870e-06, 5.29197874e-06, 5.80522552e-06,
       6.36824994e-06, 6.98587975e-0...
       1.18953407e+01, 1.30490198e+01, 1.43145894e+01, 1.57029012e+01,
       1.72258597e+01, 1.88965234e+01, 2.07292178e+01, 2.27396575e+01,
       2.49450814e+01, 2.73644000e+01, 3.00183581e+01, 3.29297126e+01,
       3.61234270e+01, 3.96268864e+01, 4.34701316e+01, 4.76861170e+01,
       5.23109931e+01, 5.73844165e+01, 6.29498899e+01, 6.90551352e+01,
       7.57525026e+01, 8.30994195e+01, 9.11588830e+01, 1.00000000e+02]),
        random_state=17)

In [37]:
# Penalty strength selected by cross-validation.
lasso_cv.alpha_

0.0002833096101839324

In [38]:
# Weights of the Lasso refitted with the cross-validated penalty, one row per feature.
lasso_cv_coeffs = pd.DataFrame(
    {"Coefficients": lasso_cv.coef_},
    index=X.columns,
)
print(lasso_cv_coeffs)

                      Coefficients
fixed acidity             0.093295
volatile acidity         -0.192049
citric acid              -0.000000
residual sugar            0.526883
chlorides                 0.006933
free sulfur dioxide       0.042698
total sulfur dioxide      0.012969
density                  -0.648161
pH                        0.146549
sulphates                 0.060939
alcohol                   0.137115


In [39]:
# Rank the features by signed weight of the cross-validated Lasso, largest first.
lasso_cv_coeffs.sort_values("Coefficients", ascending=False)

Unnamed: 0,Coefficients
residual sugar,0.526883
pH,0.146549
alcohol,0.137115
fixed acidity,0.093295
sulphates,0.060939
free sulfur dioxide,0.042698
total sulfur dioxide,0.012969
chlorides,0.006933
citric acid,-0.0
volatile acidity,-0.192049


With the cross-validated alpha, citric acid is the only feature whose coefficient is zero, so it is the least informative.

# Task 4

In [40]:
# Cross-validated Lasso predictions for the training rows.
y_train_predicted_lasso_cv = lasso_cv.predict(X=X_train_scaled)

In [41]:
# Task 4: mean squared error of the cross-validated Lasso on the training split.
mean_squared_error(y_true=y_train, y_pred=y_train_predicted_lasso_cv)

0.558070014187378

In [42]:
# Cross-validated Lasso predictions for the held-out rows.
y_test_predicted_lasso_cv = lasso_cv.predict(X=X_test_scaled)

In [43]:
# Task 4: mean squared error of the cross-validated Lasso on the test split.
mean_squared_error(y_true=y_test, y_pred=y_test_predicted_lasso_cv)

0.5832976077860635

# Task 5

In [44]:
# Random forest with default hyper-parameters and a fixed seed, fitted on the
# training split. The trailing expression displays the fitted estimator.
forest = RandomForestRegressor(random_state=17).fit(X_train_scaled, y_train)
forest

RandomForestRegressor(random_state=17)

In [45]:
# Default-forest predictions for the rows it was trained on.
y_train_predicted_rf = forest.predict(X=X_train_scaled)

In [46]:
# Training-set score of the default forest (R^2 for scikit-learn regressors).
forest.score(X=X_train_scaled, y=y_train)

0.9331049541768827

In [47]:
# Task 5: mean squared error of the default forest on the training split.
# Arguments follow scikit-learn's (y_true, y_pred) order, matching the other
# MSE cells in this notebook; the value is the same because MSE is symmetric.
mean_squared_error(y_true=y_train, y_pred=y_train_predicted_rf)

0.05261155192532089

The value above is the mean squared error of the random forest on the training dataset.

In [48]:
# An empty grid makes the search evaluate a single candidate: the forest with
# default hyper-parameters, scored by negated MSE. The trailing expression
# displays the fitted search object.
param_grid = {}

cv_rf = GridSearchCV(
    RandomForestRegressor(random_state=17),
    param_grid,
    scoring="neg_mean_squared_error",
).fit(X_train_scaled, y_train)
cv_rf

GridSearchCV(estimator=RandomForestRegressor(random_state=17), param_grid={},
             scoring='neg_mean_squared_error')

In [49]:
# Score of the refitted search estimator on the training rows, using the
# search's scoring (negated MSE).
# NOTE(review): this equals minus the training MSE of the default forest, not a
# cross-validated estimate — the cross-validated score is in `cv_rf.best_score_`.
cv_rf.score(X=X_train_scaled, y=y_train)

-0.05261155192532089

In [50]:
# Training-row predictions from the estimator refitted by the empty-grid search.
# NOTE(review): no later cell shown in this notebook reads this variable.
y_train_predicted_cv_rf = cv_rf.predict(X=X_train_scaled)

In [66]:
# Task 5: cross-validated MSE of the default forest on the training split.
# The scorer returns negated MSEs, so absolute values are taken before averaging.
fold_scores = cross_val_score(
    forest,
    X_train_scaled,
    y_train,
    scoring="neg_mean_squared_error",
)
np.abs(fold_scores).mean()

0.4142003732204039

The value above is the cross-validated mean squared error of the random forest on the training dataset.

In [52]:
# Default-forest predictions for the held-out rows.
y_test_predicted_rf = forest.predict(X=X_test_scaled)

In [53]:
# Held-out score of the default forest (R^2 for scikit-learn regressors).
forest.score(X=X_test_scaled, y=y_test)

0.5228454960179294

In [54]:
# Task 5: mean squared error of the default forest on the test split.
# Arguments follow scikit-learn's (y_true, y_pred) order, matching the other
# MSE cells in this notebook; the value is the same because MSE is symmetric.
mean_squared_error(y_true=y_test, y_pred=y_test_predicted_rf)

0.37163775510204083

The value above is the mean squared error of the random forest on the test dataset.

# Task 6

In [57]:
# Task 6: tune tree depth and the number of features tried per split.
# 15 depths (10..24) x 6 feature counts (6..11) = 90 candidates, each scored by
# negated MSE with 5-fold cross-validation.
forest_params = {
    "max_depth": list(range(10, 25)),
    "max_features": list(range(6, 12)),
}

locally_best_forest = GridSearchCV(
    RandomForestRegressor(random_state=17, n_jobs=-1),
    forest_params,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    cv=5,
    verbose=True,
)
locally_best_forest.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(n_jobs=-1, random_state=17),
             n_jobs=-1,
             param_grid={'max_depth': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                       20, 21, 22, 23, 24],
                         'max_features': [6, 7, 8, 9, 10, 11]},
             scoring='neg_mean_squared_error', verbose=True)

In [58]:
# Winning hyper-parameter combination and its mean cross-validated score
# (negated MSE, so values closer to zero are better).
locally_best_forest.best_params_, locally_best_forest.best_score_

({'max_depth': 21, 'max_features': 6}, -0.39773288191505934)

In [62]:
# Task 6: report the tuned forest's cross-validated MSE on the training split
# and its MSE on the held-out test split.
# The scorer returns negated MSEs, so absolute values are taken before averaging.
best_forest_cv_scores = cross_val_score(
    locally_best_forest.best_estimator_,
    X_train_scaled,
    y_train,
    scoring="neg_mean_squared_error",
)
print("Mean squared error (cv): %.3f" % np.mean(np.abs(best_forest_cv_scores)))

# Arguments follow scikit-learn's (y_true, y_pred) order; the value is the same
# as before because MSE is symmetric.
best_forest_test_mse = mean_squared_error(
    y_true=y_test,
    y_pred=locally_best_forest.predict(X_test_scaled),
)
print("Mean squared error (test): %.3f" % best_forest_test_mse)

Mean squared error (cv): 0.398
Mean squared error (test): 0.366


# Task 7

In [63]:
# Task 7: feature importances of the default forest fitted earlier, one row per feature.
# NOTE(review): the column keeps the "Coefficients" label used for the linear
# models although these values are importances, and the model inspected here
# is `forest`, not the tuned estimator from the grid search — confirm intent.
rf_importance = pd.DataFrame(
    {"Coefficients": forest.feature_importances_},
    index=X.columns,
)

In [64]:
# Plain-text dump of the forest's feature importances, in the original feature order.
print(rf_importance)

                      Coefficients
fixed acidity             0.061918
volatile acidity          0.125960
citric acid               0.058904
residual sugar            0.070903
chlorides                 0.063729
free sulfur dioxide       0.116562
total sulfur dioxide      0.067204
density                   0.056540
pH                        0.071950
sulphates                 0.059056
alcohol                   0.247273


In [65]:
# Rank the features from most to least important according to the forest.
rf_importance.sort_values("Coefficients", ascending=False)

Unnamed: 0,Coefficients
alcohol,0.247273
volatile acidity,0.12596
free sulfur dioxide,0.116562
pH,0.07195
residual sugar,0.070903
total sulfur dioxide,0.067204
chlorides,0.063729
fixed acidity,0.061918
sulphates,0.059056
citric acid,0.058904


According to the random forest's feature importances, alcohol is the most important feature.

Of the models compared here, the random forest performs best on the wine quality problem: its test MSE (≈ 0.37) is well below that of linear regression and Lasso (≈ 0.58).