In [1]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
# Load the red-wine quality dataset; the CSV file is semicolon-separated.
df = pd.read_csv("../data/winequality-red.csv", sep=";")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Number of rows (samples) in the dataset.
df.shape[0]

1599

In [4]:
# Preview the target column.
df["quality"]

0       5
1       5
2       5
3       6
4       5
       ..
1594    5
1595    6
1596    6
1597    5
1598    6
Name: quality, Length: 1599, dtype: int64

In [5]:
# Show the dtype of every column in the loaded frame.
df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [6]:
# Print a summary of the frame: per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [7]:
# Feature columns only: every column except the target `quality`.
df.drop(columns="quality")

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [8]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
# Features are all columns except `quality`; the target is `quality`.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="quality"),
    df["quality"],
    random_state=42,
    test_size=0.2,
)

In [9]:
# Inspect the training target produced by the split.
y_train

493     6
354     6
342     6
834     5
705     5
       ..
1130    6
1294    6
860     5
1459    7
1126    6
Name: quality, Length: 1279, dtype: int64

In [10]:
# Preprocessing shared by every model pipeline in this notebook.
# All eleven feature columns are numeric and all of them are used for training.
numeric_features = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

# StandardScaler standardises each column to zero mean and unit variance
# (it does NOT squash the values into the [0, 1] range).
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Route the listed numeric columns through the numeric transformer.
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, numeric_features)]
)

In [11]:
# Lasso regression
# Pipeline: shared preprocessing (standardisation) followed by the Lasso regressor.
pipe_lasso = Pipeline([
    ("preprocessor", preprocessor),
    ('regressor', Lasso(tol=0.01))
])
# Regularisation strengths to evaluate.
# NOTE(review): the recorded run picked alpha=1, the smallest value of this grid, so the
# optimum may lie below the grid -- consider adding smaller alphas (e.g. 0.001-0.1).
param_grid_lasso = {
    'regressor__alpha': [1,5,10,15,20]
}
# Exhaustive search over the grid with 10-fold cross-validation. No `scoring` is passed, so
# candidates are ranked by the regressor's default score (R^2); the best candidate is then
# refit on the whole training set.
lasso_grid = GridSearchCV(pipe_lasso, param_grid_lasso, cv=10, verbose=1)
lasso_grid.fit(X_train, y_train)

# Report the selected hyperparameters and the training-set error of the refit best model.
# Predict once and reuse the result. RMSE is computed as sqrt(MSE) because the `squared`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
lasso_train_pred = lasso_grid.predict(X_train)
lasso_train_mse = mean_squared_error(y_train, lasso_train_pred)
print("Best hyperparameters for LASSO: ", lasso_grid.best_params_)
print("Best MSE score for LASSO: ", lasso_train_mse)
print("Best RMSE score for LASSO: ", np.sqrt(lasso_train_mse))

Fitting 10 folds for each of 5 candidates, totalling 50 fits
Best hyperparameters for LASSO:  {'regressor__alpha': 1}
Best MSE score for LASSO:  0.6505925698157706
Best RMSE score for LASSO:  0.8065931873105368


In [12]:
# Evaluate the tuned Lasso model on the held-out test data (MSE and RMSE).
# RMSE is computed as sqrt(MSE) because the `squared` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and later removed.
lasso_test_pred = lasso_grid.predict(X_test)
lasso_test_mse = mean_squared_error(y_test, lasso_test_pred)
print("Best MSE score for LASSO: ", lasso_test_mse)
print("Best RMSE score for LASSO: ", np.sqrt(lasso_test_mse))

Best MSE score for LASSO:  0.6571600689645265
Best RMSE score for LASSO:  0.81065409945582


In [13]:
# Ridge regression
# Pipeline: shared preprocessing (standardisation) followed by the Ridge regressor.
pipe_ridge = Pipeline([
    ("preprocessor", preprocessor),
    ('regressor', Ridge())
])

# Regularisation strengths to evaluate.
# NOTE(review): the recorded run picked alpha=20, the largest value of this grid, so the
# optimum may lie above the grid -- consider extending it upwards.
param_grid_ridge = {
    'regressor__alpha': [1,5,10,15,20]
}

# 10-fold cross-validated grid search. No `scoring` is passed, so candidates are ranked by
# the regressor's default score (R^2); the best candidate is refit on the whole training set.
ridge_grid = GridSearchCV(pipe_ridge, param_grid_ridge, cv=10, verbose=1)
ridge_grid.fit(X_train, y_train)

# Report the selected hyperparameters and the training-set error of the refit best model.
# Predict once and reuse the result. RMSE is computed as sqrt(MSE) because the `squared`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
ridge_train_pred = ridge_grid.predict(X_train)
ridge_train_mse = mean_squared_error(y_train, ridge_train_pred)
print("Best hyperparameters for Ridge: ", ridge_grid.best_params_)
print("Best MSE score for Ridge: ", ridge_train_mse)
print("Best RMSE score for Ridge: ", np.sqrt(ridge_train_mse))

Fitting 10 folds for each of 5 candidates, totalling 50 fits
Best hyperparameters for Ridge:  {'regressor__alpha': 20}
Best MSE score for Ridge:  0.42425396376185603
Best RMSE score for Ridge:  0.6513478055247105


In [14]:
# Evaluate the tuned Ridge model on the held-out test data (MSE and RMSE).
# RMSE is computed as sqrt(MSE) because the `squared` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and later removed.
ridge_test_pred = ridge_grid.predict(X_test)
ridge_test_mse = mean_squared_error(y_test, ridge_test_pred)
print("Best MSE score for Ridge: ", ridge_test_mse)
print("Best RMSE score for Ridge: ", np.sqrt(ridge_test_mse))

Best MSE score for Ridge:  0.3903707095379689
Best RMSE score for Ridge:  0.6247965345118112


In [15]:
# Decision Tree regression
# A fixed random_state makes the fitted trees (and therefore the reported scores)
# reproducible across re-runs of the notebook.
pipe_dt = Pipeline([
    ("preprocessor", preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

## Tune the hyperparameters of the decision tree regressor:
## minimum samples required to split a node, and maximum tree depth.
param_grid_dt = {
    'regressor__min_samples_split': [2, 100, 250, 300],
    'regressor__max_depth': [2, 5, 10, 15]
}
# 10-fold cross-validated grid search. No `scoring` is passed, so candidates are ranked by
# the regressor's default score (R^2); the best candidate is refit on the whole training set.
dt_grid = GridSearchCV(pipe_dt, param_grid_dt, cv=10, verbose=1)
dt_grid.fit(X_train, y_train)

## Report the selected hyperparameters and the training-set error of the refit best model.
## Predict once and reuse the result. RMSE is computed as sqrt(MSE) because the `squared`
## argument of mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
dt_train_pred = dt_grid.predict(X_train)
dt_train_mse = mean_squared_error(y_train, dt_train_pred)
print("Best hyperparameters for dt: ", dt_grid.best_params_)
print("Best MSE score for dt: ", dt_train_mse)
print("Best RMSE score for dt: ", np.sqrt(dt_train_mse))

Fitting 10 folds for each of 16 candidates, totalling 160 fits
Best hyperparameters for dt:  {'regressor__max_depth': 10, 'regressor__min_samples_split': 100}
Best MSE score for dt:  0.3442557026525823
Best RMSE score for dt:  0.5867330761535285


In [16]:
# Evaluate the tuned decision tree on the held-out test data (MSE and RMSE).
# RMSE is computed as sqrt(MSE) because the `squared` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and later removed.
dt_test_pred = dt_grid.predict(X_test)
dt_test_mse = mean_squared_error(y_test, dt_test_pred)
print("Best MSE score for dt: ", dt_test_mse)
print("Best RMSE score for dt: ", np.sqrt(dt_test_mse))

Best MSE score for dt:  0.44629852759060495
Best RMSE score for dt:  0.6680557817956558


In [17]:
## Random Forest regression
# Random forests are stochastic (bootstrap samples, feature sub-sampling); a fixed
# random_state makes the search result and the reported scores reproducible.
pipe_rf = Pipeline([
    ("preprocessor", preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Maximum tree depths to evaluate; None lets the trees grow without a depth limit.
param_grid_rf = {
    'regressor__max_depth': [2, 5, 10, 15, None]
}
# 10-fold cross-validated grid search. No `scoring` is passed, so candidates are ranked by
# the regressor's default score (R^2); the best candidate is refit on the whole training set.
rf_grid = GridSearchCV(pipe_rf, param_grid_rf, cv=10, verbose=1)
rf_grid.fit(X_train, y_train)

## Report the selected hyperparameters and the training-set error of the refit best model.
## Predict once and reuse the result. RMSE is computed as sqrt(MSE) because the `squared`
## argument of mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
rf_train_pred = rf_grid.predict(X_train)
rf_train_mse = mean_squared_error(y_train, rf_train_pred)
print("Best hyperparameters for rf: ", rf_grid.best_params_)
print("Best MSE score for rf: ", rf_train_mse)
print("Best RMSE score for rf: ", np.sqrt(rf_train_mse))

Fitting 10 folds for each of 5 candidates, totalling 50 fits
Best hyperparameters for rf:  {'regressor__max_depth': 15}
Best MSE score for rf:  0.0539047691976424
Best RMSE score for rf:  0.23217400629192408


In [18]:
# Evaluate the tuned random forest on the held-out test data (MSE and RMSE).
# RMSE is computed as sqrt(MSE) because the `squared` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and later removed.
rf_test_pred = rf_grid.predict(X_test)
rf_test_mse = mean_squared_error(y_test, rf_test_pred)
print("Best MSE score for rf: ", rf_test_mse)
print("Best RMSE score for rf: ", np.sqrt(rf_test_mse))

Best MSE score for rf:  0.309928760624197
Best RMSE score for rf:  0.5567124577591174
