# Red Wine quality prediction by Random Forest

(https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn)

In [3]:
# Importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor #Random Forest model family
from sklearn.pipeline import make_pipeline # To make cross vaidation pipeline
from sklearn.model_selection import GridSearchCV # for cross-validation
from sklearn.metrics import mean_squared_error, r2_score # evaluation metrics
import joblib # for storing sklearn models (alternative to pickle package for storing large numpy arrays)


In [15]:
# Loading data
data = pd.read_csv('winequality-red.csv', sep=';')

In [16]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [17]:
data.shape

(1599, 12)

In [18]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [19]:
# Separate target and predictor variables
y = data.quality
X = data.drop('quality', axis=1)


In [20]:
# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)
# stratification by target variable helps in equal distribution of target variable in all sub-groups

In [23]:
# Scaling of data using Transformer API
""" Transformer API helps use the same means and std deviations on both training and test sets and also on any future datasets
thus giving more realistic estimate of model performance and also allows to insert into cross-validation pipeline"""

# Fitting Transformer API to training data (it'll save mean and std)
scaler = preprocessing.StandardScaler().fit(X_train)

# Applying to training data
X_train_scaled = scaler.transform(X_train)
 
X_train_scaled.mean(axis=0) # checking mean is now zero

array([ 1.16664562e-16, -3.05550043e-17, -8.47206937e-17, -2.22218213e-17,
        2.77772766e-18, -6.38877362e-17, -4.16659149e-18, -1.20753377e-13,
       -8.70817622e-16, -4.08325966e-16, -1.16664562e-15])

In [24]:
X_train_scaled.std(axis=0) # checking std is now 1

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [25]:
# Applying Transformer API to test set
X_test_scaled = scaler.transform(X_test)

X_test_scaled.mean(axis=0) # not exactly 0 as mean and std used are from training set

array([ 0.02776704,  0.02592492, -0.03078587, -0.03137977, -0.00471876,
       -0.04413827, -0.02414174, -0.00293273, -0.00467444, -0.10894663,
        0.01043391])

In [27]:
X_test_scaled.std(axis=0) # not exactly 1 as mean and std used are from training set

array([1.02160495, 1.00135689, 0.97456598, 0.91099054, 0.86716698,
       0.94193125, 1.03673213, 1.03145119, 0.95734849, 0.83829505,
       1.0286218 ])

In [28]:
""" In practice, when we set up the cross-validation pipeline, we won't even need to manually fit the Transformer API.
Instead, we'll simply declare the class object, like so:"""

# Pipeline with pre-processing and model
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

In [30]:
# List of tunable hyperparameters
pipeline.get_params()

{'memory': None,
 'steps': [('standardscaler', StandardScaler()),
  ('randomforestregressor', RandomForestRegressor())],
 'verbose': False,
 'standardscaler': StandardScaler(),
 'randomforestregressor': RandomForestRegressor(),
 'standardscaler__copy': True,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True,
 'randomforestregressor__bootstrap': True,
 'randomforestregressor__ccp_alpha': 0.0,
 'randomforestregressor__criterion': 'mse',
 'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'auto',
 'randomforestregressor__max_leaf_nodes': None,
 'randomforestregressor__max_samples': None,
 'randomforestregressor__min_impurity_decrease': 0.0,
 'randomforestregressor__min_impurity_split': None,
 'randomforestregressor__min_samples_leaf': 1,
 'randomforestregressor__min_samples_split': 2,
 'randomforestregressor__min_weight_fraction_leaf': 0.0,
 'randomforestregressor__n_estimators': 100,
 'randomforestregressor__n_jobs': None,
 'randomforest

In [31]:
# Declare hyperparameters to tune through cross-validation
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

# Note- when it's tuned through a pipeline, you'll need to prepend  randomforestregressor__ before the parameter name
# Also format needs to be in form of dictionary (key-value pairs)

In [32]:
# sklearn cross-validation with pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestregressor',
                                        RandomForestRegressor())]),
             param_grid={'randomforestregressor__max_depth': [None, 5, 3, 1],
                         'randomforestregressor__max_features': ['auto', 'sqrt',
                                                                 'log2']})

In [33]:
clf.best_params_

{'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'log2'}

In [34]:
# Confirming model is re-trained using best hyperparameters (GridSearchCV automatically does it)
clf.refit

True

In [35]:
# Predicting new set of data (on test data)
y_pred = clf.predict(X_test)

In [36]:
# Model Performance Evaluation
r2_score(y_test, y_pred)

0.4548561050911073

In [37]:
mean_squared_error(y_test, y_pred)

0.35176687500000003

In [38]:
# Saving model for future use (to a .pkl file)
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [39]:
# To load the model again
clf2 = joblib.load('rf_regressor.pkl')
 
# Predict data set using loaded model
clf2.predict(X_test)

array([6.46, 5.75, 5.01, 5.52, 6.35, 5.51, 5.04, 4.93, 5.01, 6.05, 5.29,
       5.63, 5.98, 5.08, 5.8 , 5.64, 6.52, 5.75, 5.77, 6.94, 5.33, 5.66,
       5.08, 5.98, 5.96, 5.08, 5.49, 5.16, 5.93, 5.98, 5.82, 6.47, 5.98,
       5.07, 4.92, 5.93, 5.04, 6.08, 4.99, 6.  , 4.87, 5.95, 6.59, 5.15,
       6.2 , 5.36, 5.6 , 5.64, 5.14, 6.46, 6.15, 5.43, 5.8 , 5.23, 5.61,
       5.64, 5.44, 5.37, 4.96, 5.3 , 5.25, 5.1 , 5.05, 5.8 , 6.06, 5.19,
       6.42, 5.05, 5.18, 6.73, 5.84, 5.74, 5.07, 5.04, 5.27, 5.96, 5.31,
       5.18, 5.2 , 5.28, 6.32, 5.53, 6.17, 6.5 , 5.07, 5.95, 6.29, 6.22,
       5.74, 5.82, 5.95, 5.25, 6.29, 5.69, 5.77, 5.83, 6.72, 6.7 , 5.64,
       6.77, 5.06, 5.55, 5.16, 6.41, 5.  , 4.77, 5.75, 5.01, 5.66, 5.98,
       5.83, 5.5 , 5.97, 5.45, 5.16, 5.23, 5.97, 5.11, 5.02, 5.96, 5.87,
       5.09, 5.77, 6.17, 5.28, 5.32, 5.35, 6.03, 5.57, 5.21, 5.82, 6.1 ,
       5.22, 5.19, 5.07, 6.37, 5.03, 5.18, 6.66, 5.44, 5.15, 5.05, 5.63,
       6.06, 5.29, 5.35, 5.13, 6.47, 5.7 , 5.15, 5.