### set up directories

In [1]:
import os
os.getcwd()
os.chdir('..')
os.getcwd()

'/Users/batemecho/Desktop/Projects/wine snob'

### import libraries and and modules

In [2]:
# importing numpy and pandas
import numpy as np
import pandas as pd

# importing sampling helper
from sklearn.model_selection import train_test_split

# importing preprocessing modules
from sklearn import preprocessing

# importing random forest regressor model
from sklearn.ensemble import RandomForestRegressor

# importing cross-validation pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# importing evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score

# importing module for saving model
from sklearn.externals import joblib



### read in the data

In [3]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(url, sep = ';')

### inspect the data

In [4]:
display(data.head(5))
display(data.tail(5))

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1594,6.2,0.6,0.08,2.0,0.09,32.0,44.0,0.9949,3.45,0.58,10.5,5
1595,5.9,0.55,0.1,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5
1598,6.0,0.31,0.47,3.6,0.067,18.0,42.0,0.99549,3.39,0.66,11.0,6


In [5]:
display(data.shape)
display(data.describe())

(1599, 12)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


### check for missing values

#### if there are, we need to impute after splitting data, so as to not create any data leakage

In [6]:
data.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

### split data 

In [7]:
y = data['quality']
X = data.drop('quality',axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                   test_size=0.2,
                                                   random_state = 123,
                                                   stratify = y)

# random forest models introduce randomness by generating many different trees with the idea being that an average
# of many trees might eliminate the bias resulting from doing just one tree

# stratified sampling aims at splitting a data set so that each split is similar with respect to the response variable

### create the pipeline

In [8]:
pipeline = make_pipeline(RandomForestRegressor(n_estimators=1000)) # the number of trees in the forest
# each tree has slightly different data and each tree will give us a different regression model

### tune and fit the model

In [9]:
pipeline.get_params()

{'memory': None,
 'steps': [('randomforestregressor',
   RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                         max_depth=None, max_features='auto', max_leaf_nodes=None,
                         max_samples=None, min_impurity_decrease=0.0,
                         min_impurity_split=None, min_samples_leaf=1,
                         min_samples_split=2, min_weight_fraction_leaf=0.0,
                         n_estimators=1000, n_jobs=None, oob_score=False,
                         random_state=None, verbose=0, warm_start=False))],
 'verbose': False,
 'randomforestregressor': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                  

In [10]:
hyperparameters = {'randomforestregressor__max_features': ['auto','sqrt','log2'],
                   'randomforestregressor__max_depth'   : [None,5,3,1]}

# max features: the number of features to consider when looking for the best split
# max depth:    the maximum depth of the tree. If None, then nodes are expanded until
#               all leaves are pure or until all leaves contain less than min_samples_split samples

In [11]:
clf = GridSearchCV(pipeline,hyperparameters,cv=10) # perform a grid search using 10-fold cross validation
clf.fit(X_train, y_train)                          # fit the training data
print(clf.best_params_)
assert clf.refit==True                             # assert that the model will be refit using the best params

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}


### evaluate model on test data

In [12]:
y_pred = clf.predict(X_test)
print(mean_squared_error(y_test,y_pred))

0.33470691250000006


In [13]:
df_eval = pd.DataFrame({"actual":y_test, "prediction":y_pred})
df_eval['difference'] = df_eval['actual'] - df_eval['prediction']
display(df_eval)

Unnamed: 0,actual,prediction,difference
797,7,6.437,0.563
871,5,5.724,-0.724
1333,5,4.974,0.026
1463,6,5.448,0.552
1058,7,6.357,0.643
...,...,...,...
211,6,5.253,0.747
162,6,5.534,0.466
748,6,5.694,0.306
914,6,6.076,-0.076


### save model

In [15]:
joblib.dump(clf,'output/rf_regressor.pkl')

['output/rf_regressor.pkl']