# TEST

In [58]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib
from matplotlib import pyplot as plt

# display results to 3 decimal points, not in scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

#download the files with we are going to work
#dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
dataset_url = '..\dataset\dataset.csv'
data = pd.read_csv(dataset_url,sep=',' )



In [59]:
data.head()

Unnamed: 0,referenceDate,restaurant,dayOfWeek,menu,hour,occupation
0,2018-03-05,1,5,2,13_3,70
1,2018-03-05,1,5,2,12_3,74
2,2018-03-05,1,5,2,13_2,75
3,2018-03-05,1,5,2,11_3,33
4,2018-03-05,1,5,2,12_2,63


In [60]:
print ('number of rows and columns: {}'.format(data.shape))

number of rows and columns: (5473, 6)


In [62]:
print('Description of data {}' .format(data.describe()))

Description of data        restaurant  dayOfWeek     menu  occupation
count    5473.000   5473.000 5473.000    5473.000
mean        2.309      2.216    3.399      38.601
std         1.052      1.613    1.705      27.440
min         1.000      0.000    1.000       1.000
25%         1.000      1.000    2.000      14.000
50%         2.000      2.000    4.000      33.000
75%         3.000      3.000    5.000      64.000
max         4.000      6.000    6.000     100.000


In [72]:
data.dtypes


referenceDate    datetime64[ns]
restaurant                int64
dayOfWeek                 int64
menu                      int64
hour                     object
occupation                int64
dtype: object

In [76]:
data1 = data.sort_values(['referenceDate','hour','dayOfWeek'])
data1

Unnamed: 0,referenceDate,restaurant,dayOfWeek,menu,hour,occupation
25,2018-03-05,1,0,5,11_0,13
161,2018-03-05,2,0,1,11_0,2
245,2018-03-05,3,0,4,11_0,10
394,2018-03-05,4,0,5,11_0,6
38,2018-03-05,1,1,1,11_0,11
148,2018-03-05,2,1,5,11_0,6
284,2018-03-05,3,1,2,11_0,2
368,2018-03-05,4,1,1,11_0,4
51,2018-03-05,1,2,2,11_0,2
219,2018-03-05,2,2,1,11_0,4


## split data into training and test sets.

In [17]:
y = data.quality
X = data.drop('quality', axis=1)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=123,stratify=y)


## PreProcessing


In [33]:
X_train_scaled = preprocessing.scale(X_train)
print (X_train_scaled)

[[ 0.51358886  2.19680282 -0.164433   ...  1.08415147 -0.69866131
  -0.58608178]
 [-1.73698885 -0.31792985 -0.82867679 ...  1.46964764  1.2491516
   2.97009781]
 [-0.35201795  0.46443143 -0.47100705 ... -0.13658641 -0.35492962
  -0.20843439]
 ...
 [-0.98679628  1.10708533 -0.93086814 ...  0.24890976 -0.98510439
   0.35803669]
 [-0.69826067  0.46443143 -1.28853787 ...  1.08415147 -0.35492962
  -0.68049363]
 [ 3.1104093  -0.62528606  2.08377675 ... -1.61432173  0.79084268
  -0.39725809]]


In [24]:
scaler  = preprocessing.StandardScaler().fit(X_train)

Now, the scaler object has the saved means and standard deviations for each feature in the training set.

In [30]:
X_train_scaled = scaler.transform(X_train)
print (X_train_scaled.mean(axis=0))
print(X_train_scaled.std(axis=0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [29]:
X_test_scaled = scaler.transform(X_test)
print(X_test_scaled.mean(axis=0))
print(X_test_scaled.std(axis=0))


[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


In [34]:
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

## Declare hyperparameters to tune.

In [35]:

print (pipeline.get_params())

{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))], 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=Fals

In [36]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

## Tune model using a cross-validation pipeline.

In [37]:
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decr...mators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [38]:
print (clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}
