## Directories

In [9]:
import os

# Project root that relative paths in this notebook (e.g. 'Output/...') resolve against.
# The default is the original author's location; set the WINE_SNOBS_DIR environment
# variable to run the notebook on another machine without editing this cell.
PROJECT_DIR = os.environ.get('WINE_SNOBS_DIR',
                             '/Users/Kroki/Desktop/GitHub Projects/Wine Snobs')
os.chdir(PROJECT_DIR)

# Last expression of the cell: display the directory we ended up in.
os.getcwd()

'/Users/Kroki/Desktop/GitHub Projects/Wine Snobs'

## Importing Libraries and Modules

In [10]:
#Importing numpy and pandas
import numpy as np
import pandas as pd

#Importing sampling helper
from sklearn.model_selection import train_test_split

#Importing preprocessing modules
from sklearn import preprocessing

#Importing random forest model
from sklearn.ensemble import RandomForestRegressor

#Importing cross-validation pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

#Importing evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score

#Importing module for saving model
#(sklearn.externals.joblib was removed in scikit-learn 0.23; prefer standalone joblib)
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

## Reading in the data

In [11]:
# Red-wine quality CSV; the file is semicolon-delimited, hence the explicit separator.
dataset_url = (
    'http://mlr.cs.umass.edu/ml/machine-learning-databases/'
    'wine-quality/winequality-red.csv'
)
data = pd.read_csv(dataset_url, sep=';')

## Inspecting the data

In [12]:
# Peek at the first five rows.
data.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [13]:
# Peek at the last five rows.
data.tail(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1594,6.2,0.6,0.08,2.0,0.09,32.0,44.0,0.9949,3.45,0.58,10.5,5
1595,5.9,0.55,0.1,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5
1598,6.0,0.31,0.47,3.6,0.067,18.0,42.0,0.99549,3.39,0.66,11.0,6


In [14]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(1599, 12)

In [15]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


## Split Data

In [16]:
# Target is the 'quality' column; every other column is used as a feature.
X = data.drop(labels=['quality'], axis='columns')
y = data['quality']

# Hold out 20% of the rows for testing. The split is seeded for repeatability and
# stratified on the target so both parts share the same quality distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

## Scale data and build the model pipeline

In [17]:
# Two-step pipeline: standardise the features, then fit a 100-tree random forest.
# The forest is seeded (same seed as the train/test split) so that Restart & Run All
# reproduces the same tuned parameters and test error; without random_state every
# run grows a different forest.
pipeline = make_pipeline(
    preprocessing.StandardScaler(),
    RandomForestRegressor(n_estimators=100, random_state=123),
)

## Tune & fit the model

In [18]:
# Grid of random-forest settings to search. Keys are '<pipeline step>__<parameter>'.
# max_features=1.0 means "consider all features at each split", which is what the
# former 'auto' option meant for regressors; 'auto' was deprecated in scikit-learn 1.1
# and removed in 1.3, whereas the float form is accepted by old and new versions.
hyperparameters = {
    'randomforestregressor__max_features': [1.0, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [None, 5, 3, 1],
}

# 10-fold cross-validated grid search over the whole pipeline, fitted on the
# training split only.
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
clf.fit(X_train, y_train)
print(clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'log2'}


In [19]:
# Sanity check: the grid search must have refit the best pipeline on the training
# data, otherwise clf.predict below is not available.
assert clf.refit is True, 'GridSearchCV was not configured to refit the best estimator'

## Evaluate model on test data

In [20]:
# Score the tuned pipeline on the held-out test split (mean squared error).
y_pred = clf.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(test_mse)

0.3459690625


## Save model, even though it's not great

In [21]:
# joblib.dump does not create missing directories, so make sure the output folder
# exists first (no-op when it is already there). `os` is imported in the first cell.
os.makedirs('Output', exist_ok=True)

# Persist the fitted grid search; the returned list of written file names is
# displayed as the cell output.
joblib.dump(clf, 'Output/rf_regressor.pkl')

['Output/rf_regressor.pkl']

## Load model from .pkl file and predict data set using loaded model

In [22]:
# Round-trip check: reload the pickled search object from disk and predict the
# test split with the reloaded copy.
model_path = 'Output/rf_regressor.pkl'
clf2 = joblib.load(model_path)
clf2.predict(X_test)

array([6.35, 5.74, 4.96, 5.5 , 6.34, 5.73, 4.94, 4.82, 5.  , 6.05, 5.17,
       5.73, 5.76, 5.07, 5.75, 5.67, 6.57, 5.64, 5.7 , 6.95, 5.54, 5.63,
       5.05, 6.05, 5.87, 5.05, 5.41, 5.14, 5.92, 5.94, 5.87, 6.48, 5.98,
       5.08, 4.92, 5.9 , 5.05, 6.01, 4.95, 6.1 , 4.93, 5.98, 6.74, 5.2 ,
       6.25, 5.42, 5.43, 5.61, 5.1 , 6.41, 5.97, 5.32, 5.75, 5.13, 5.64,
       5.73, 5.39, 5.32, 4.99, 5.33, 5.3 , 5.13, 5.07, 5.86, 5.94, 5.21,
       6.38, 5.01, 5.17, 6.63, 5.68, 5.7 , 5.  , 5.02, 5.29, 6.03, 5.25,
       5.13, 5.14, 5.31, 6.42, 5.59, 6.19, 6.36, 5.08, 6.01, 6.44, 6.42,
       5.76, 5.78, 6.04, 5.38, 6.32, 5.74, 5.71, 5.79, 6.58, 6.72, 5.61,
       6.73, 5.1 , 5.56, 5.17, 6.53, 5.01, 4.71, 5.6 , 5.02, 5.58, 5.8 ,
       5.77, 5.67, 6.09, 5.34, 5.01, 5.18, 5.89, 5.09, 4.91, 6.05, 5.82,
       5.13, 5.87, 6.15, 5.31, 5.38, 5.4 , 5.91, 5.52, 5.44, 5.78, 6.12,
       5.15, 5.37, 5.05, 6.38, 5.01, 5.09, 6.63, 5.56, 5.22, 5.06, 5.6 ,
       6.16, 5.32, 5.4 , 5.11, 6.48, 5.77, 5.07, 5.