In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler


In [2]:
# for Jupyter-book, we copy data from GitHub, locally, to save Internet traffic,
# you can specify the data/ folder from the root of your cloned
# https://github.com/Yorko/mlcourse.ai repo, to save Internet traffic
DATA_PATH = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"

In [3]:
data = pd.read_csv(DATA_PATH + "winequality-white.csv", sep=";")


In [4]:
data.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [6]:
y = data["quality"]

X = data.drop("quality", axis=1)

X_train_scaled, X_holdout_scaled, y_train, y_holdout = train_test_split(X, y, test_size=0.3, random_state=17)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_scaled)
X_holdout_scaled = scaler.transform(X_holdout_scaled)

In [7]:
linear_regression = LinearRegression()
linear_regression.fit(X_train_scaled, y_train)

In [9]:
print("Mean squared error (train): %.3f" % mean_squared_error(y_train, linear_regression.predict(X_train_scaled)))      
print("Mean squared error (test): %.3f" % mean_squared_error(y_holdout, linear_regression.predict(X_holdout_scaled)))

Mean squared error (train): 0.558
Mean squared error (test): 0.584


In [10]:
linear_regression.coef_

array([ 9.78219223e-02, -1.92259947e-01, -1.83224449e-04,  5.38164096e-01,
        8.12724353e-03,  4.21804406e-02,  1.43040227e-02, -6.65720472e-01,
        1.50036006e-01,  6.20533605e-02,  1.29533447e-01])

In [17]:
linear_regression_coefficients = pd.concat([pd.DataFrame(X.columns), pd.DataFrame(linear_regression.coef_)], axis=1),

In [18]:
linear_regression_coefficients

(                       0         0
 0          fixed acidity  0.097822
 1       volatile acidity -0.192260
 2            citric acid -0.000183
 3         residual sugar  0.538164
 4              chlorides  0.008127
 5    free sulfur dioxide  0.042180
 6   total sulfur dioxide  0.014304
 7                density -0.665720
 8                     pH  0.150036
 9              sulphates  0.062053
 10               alcohol  0.129533,)

In [21]:
lasso_regression = Lasso(random_state=17)
lasso_regression.fit(X_train_scaled, y_train)

In [20]:
lasso_coefficients = pd.concat([pd.DataFrame(X.columns), pd.DataFrame(lasso.coef_)], axis=1)
lasso_coefficients

Unnamed: 0,0,0.1
0,fixed acidity,-0.0
1,volatile acidity,-0.0
2,citric acid,-0.0
3,residual sugar,-0.0
4,chlorides,-0.0
5,free sulfur dioxide,0.0
6,total sulfur dioxide,-0.0
7,density,-0.0
8,pH,0.0
9,sulphates,0.0


In [None]:
print("Mean squared error (train): %.3f" % mean_squared_error(y_train, lasso.predict(X_train_scaled)))      
print("Mean squared error (test): %.3f" % mean_squared_error(y_holdout, linear_regression.predict(X_holdout_scaled)))