**Ashish Verma** <br>
**University of Ottawa**

## **Loading data in dataframe**

In [None]:
dataPath = '/content/drive/My Drive/Colab Notebooks/MachineLearning/Data'

In [1]:
live_data = 'Restaurant_Scores_-_LIVES_Standard.csv'

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the LIVES inspection CSV into a dataframe.
# NOTE(review): the preview printed below shows an 'Unnamed: 0' column that looks
# like a previously saved row index; no later cell drops it, so it presumably ends
# up among the model features — confirm whether index_col=0 should be passed here.
load_data = pd.read_csv(live_data)

In [5]:
#sample data view
load_data.head()

Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,business_phone_number,...,inspection_type,violation_id,violation_description,risk_category,Neighborhoods (old),Police Districts,Supervisor Districts,Fire Prevention Districts,Zip Codes,Analysis Neighborhoods
0,69618,Fancy Wheatfield Bakery,1362 Stockton St,San Francisco,CA,94133,,,,,...,Complaint,69618_20190304_103130,Inadequate sewage or wastewater disposal,Moderate Risk,,,,,,
1,97975,BREADBELLY,1408 Clement St,San Francisco,CA,94118,,,,14157240000.0,...,Routine - Unscheduled,97975_20190725_103124,Inadequately cleaned or sanitized food contact...,Moderate Risk,,,,,,
2,69487,Hakkasan San Francisco,1 Kearny St,San Francisco,CA,94108,,,,,...,Routine - Unscheduled,69487_20180418_103119,Inadequate and inaccessible handwashing facili...,Moderate Risk,,,,,,
3,91044,Chopsticks Restaurant,4615 Mission St,San Francisco,CA,94112,,,,,...,Non-inspection site visit,,,,,,,,,
4,85987,Tselogs,552 Jones St,San Francisco,CA,94102,,,,,...,Routine - Unscheduled,85987_20180412_103132,Improper thawing methods,Moderate Risk,,,,,,


## Data Preprocessing

In [None]:
#count of missing values in every column
load_data.isnull().sum()

In [None]:
#count of distinct values in every column (a missing value counts as one value)
load_data.nunique(dropna=False)

In [None]:
load_data.business_postal_code.unique().tolist()

In [None]:
#replacing typos with null values, since the real postal code is not known.
#Only the postal code column is touched: a whole-frame replace would also turn
#every 'CA' in other columns (e.g. business_state) into NaN.
replace_ca_value = dict.fromkeys(['CA', 'Ca', '941'], np.nan)
load_data = load_data.assign(
    business_postal_code=load_data['business_postal_code'].replace(replace_ca_value)
)

In [None]:
#making postal code in symmetry with 5 digits only
load_data.business_postal_code = load_data.business_postal_code.str[:5]

In [None]:
#correcting the mistyped postal code; restricted to the postal code column so that
#an identical string in any other column is left alone
load_data = load_data.assign(
    business_postal_code=load_data['business_postal_code'].replace('64110', '94110')
)

In [None]:
#dropping unwanted columns in a single call; `columns=` is used because the
#positional axis argument of DataFrame.drop is no longer accepted by pandas 2.x
dropped_data = load_data.drop(columns=[
    'business_name',
    'business_city',
    'business_state',
    'business_location',
    'business_phone_number',
    'business_address',
    'inspection_id',
])

In [None]:
#changing date to day month and year column
#work on a copy so that dropped_data is not modified through an alias
refined_data = dropped_data.copy()
#parse the whole timestamp instead of str.rstrip(' 12:00:00 AM'): rstrip removes a
#*set of characters*, so it would also eat trailing year digits (e.g. '2020', '2012')
inspection_dates = pd.to_datetime(refined_data['inspection_date'])
refined_data['year'] = inspection_dates.dt.year
refined_data['month'] = inspection_dates.dt.month
refined_data['day'] = inspection_dates.dt.day
#keyword form: the positional axis argument is no longer accepted by pandas 2.x
refined_data = refined_data.drop(columns='inspection_date')

In [None]:
#violation ids have the form business_id + date + unique_violation_id joined by '_';
#keep only the third piece
refined_data['violation_id'] = refined_data['violation_id'].str.split('_').str.get(2)

In [None]:
refined_data.describe()

In [None]:
#keep only the rows whose business_postal_code and business_latitude both differ from 0
#NOTE(review): business_postal_code was built with `.str[:5]`, so it presumably holds
#strings and the comparison with the integer 0 may never exclude anything — confirm
refined_data = refined_data[
    (refined_data['business_postal_code'] != 0)
    & (refined_data['business_latitude'] != 0)
]

In [None]:
refined_data.inspection_type.unique().tolist()

In [None]:
#dropping rows where inspection_type is null
refined_data=refined_data.dropna(subset=['inspection_type'])

In [None]:
#lookup tables that collapse the raw inspection_type labels into four groups:
#'Routine', 'New', 'Investigation' and 'Complaint'

inspection_type_routine = {
    label: 'Routine'
    for label in ['Routine - Unscheduled', 'Reinspection/Followup', 'Routine - Scheduled']
}
inspection_type_new = {
    label: 'New'
    for label in ['New Construction', 'New Ownership', 'New Ownership - Followup']
}
inspection_type_investigation = {
    label: 'Investigation'
    for label in ['Structural Inspection', 'Non-inspection site visit', 'Foodborne Illness Investigation', 'Special Event', 'Multi-agency Investigation', 'Administrative or Document Review', 'Community Health Assessment']
}
inspection_type_complaint = {
    label: 'Complaint'
    for label in ['Complaint', 'Complaint Reinspection/Followup']
}


In [None]:
#apply the four inspection-type lookup tables one after another, in the same order
for grouping in (inspection_type_routine, inspection_type_new,
                 inspection_type_investigation, inspection_type_complaint):
    refined_data = refined_data.replace(grouping)

In [None]:
refined_data['inspection_type'] = refined_data['inspection_type'].replace({'Routine':1, 'Complaint':2, 'New':3, 'Investigation':4})

In [None]:
print(refined_data['risk_category'].value_counts())

In [None]:
refined_data['risk_category'] = refined_data['risk_category'].replace({'Low Risk':1, 'Moderate Risk':2, 'High Risk':3})

In [None]:
refined_data.risk_category.unique().tolist()

In [None]:
refined_data.violation_description.unique().tolist()

In [None]:
#converting violation type to 4 categories 'Hygiene','Legal','Noncompliance', 'Lack_Infrastructure'
violation_hygiene = dict.fromkeys(['Unclean or degraded floors walls or ceilings', 'Wiping cloths not clean or properly stored or inadequate sanitizer', 'Moderate risk vermin infestation', 'Unclean nonfood contact surfaces', 'Foods not protected from contamination', 'Unclean hands or improper use of gloves', 'High risk vermin infestation', 'Inadequately cleaned or sanitized food contact surfaces', 'Low risk vermin infestation', 'Unclean or unsanitary food contact surfaces', 'Employee eating or smoking', 'Contaminated or adulterated food', 'Unsanitary employee garments hair or nails', 'Other low risk violation', 'Unclean unmaintained or improperly constructed toilet facilities', 'Other moderate risk violation', 'Sewage or wastewater contamination', 'Food in poor condition', 'Other high risk violation', 'Reservice of previously served foods', 'Discharge from employee nose mouth or eye', 'Improperly washed fruits and vegetables'], 'Hygiene')
violation_lack_infra = dict.fromkeys(['Inadequate and inaccessible handwashing facilities', 'Inadequate or unsanitary refuse containers or area or no garbage service', 'No thermometers or uncalibrated thermometers', 'Improper or defective plumbing', 'No hot water or running water', 'Inadequate ventilation or lighting', 'Inadequate warewashing facilities or equipment', 'Inadequate sewage or wastewater disposal', 'Insufficient hot water or running water'],'Lack_Infrastructure')
violation_legal = dict.fromkeys(['Food safety certificate or food handler card not available', 'Unapproved or unmaintained equipment or utensils', 'Permit license or inspection report not posted', 'No plan review or Building Permit', 'Unapproved  living quarters in food facility', 'Unpermitted food facility', 'Unapproved food source', 'Mobile food facility stored in unapproved location', 'Mobile food facility not operating with an approved commissary'],'Legal')
violation_noncompliance = dict.fromkeys(['High risk food holding temperature', 'Inadequate food safety knowledge or lack of certified food safety manager', 'Improper storage of equipment utensils or linens', 'Improper food storage', 'Improper thawing methods', 'Moderate risk food holding temperature', 'Improper cooling methods', 'Improper storage use or identification of toxic substances', 'Improper food labeling or menu misrepresentation', 'Non service animal', 'Noncompliance with shell fish tags or display', 'Noncompliance with HAACP plan or variance', 'Inadequate HACCP plan record keeping', 'Inadequate dressing rooms or improper storage of personal items', 'Improper reheating of food', 'Inadequate procedures or records for time as a public health control', 'Worker safety hazards', 'No person in charge of food facility', 'Improper cooking time or temperatures', 'Unauthorized or unsafe use of time as a public health control measure', 'Consumer advisory not provided for raw or undercooked foods', 'Noncompliance with Gulf Coast oyster regulation', 'Noncompliance with Cottage Food Operation'],'Noncompliance')

In [None]:
#apply the four violation lookup tables one after another, in the same order
for grouping in (violation_hygiene, violation_lack_infra,
                 violation_legal, violation_noncompliance):
    refined_data = refined_data.replace(grouping)

In [None]:
refined_data['violation_description'] = refined_data['violation_description'].replace({'Hygiene':1, 'Legal':2, 'Noncompliance':3, 'Lack_Infrastructure':4})

In [None]:
# refined_data = refined_data.sort_values(by=['year', 'month', 'day'], ascending=True)

In [None]:
refined_data=refined_data.dropna(subset=['business_latitude','violation_description','business_postal_code','inspection_score','Neighborhoods (old)'])

In [None]:
#removing inspection type as it has only 1 unique value
#(keyword form: the positional axis argument is no longer accepted by pandas 2.x)
refined_data = refined_data.drop(columns='inspection_type')

In [None]:
#refined_data = pd.get_dummies(refined_data, columns=['violation_description','risk_category'])

In [None]:
#postal code, violation_id and violation_description are all converted to float64
refined_data = refined_data.astype({
    'business_postal_code': 'float64',
    'violation_id': 'float64',
    'violation_description': 'float64',
})

In [None]:
# categorical_columns = [x for x in refined_data.dtypes.index]
# for col in categorical_columns:
#    print(refined_data[col].value_counts())

In [None]:
#distinct values per column (a missing value counts as one value)
refined_data.nunique(dropna=False)

In [None]:
#missing values per column
refined_data.isnull().sum()

In [None]:
#info() writes its summary itself and returns None, so wrapping it in print()
#only appended a stray "None" line to the output
refined_data.info()

In [None]:
refined_data.describe()

In [None]:
refined_data.head(3)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

#bar chart of the number of rows per postal code
refined_data['business_postal_code'].value_counts().plot(kind='bar')
plt.title('number of postal code')
plt.xlabel('postal code')
plt.ylabel('count')
#despine has to be called; the bare attribute `sns.despine` did nothing
sns.despine()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

#bar chart of the number of rows per violation id
refined_data['violation_id'].value_counts().plot(kind='bar')
plt.title('number of violation id')
plt.xlabel('violation id')
plt.ylabel('count')
#despine has to be called; the bare attribute `sns.despine` did nothing
sns.despine()

In [None]:
#jointplot creates its own figure, so no separate plt.figure() call is needed;
#`height` is the current name of the former `size` argument
joint_grid = sns.jointplot(x=refined_data.business_latitude.values,
                           y=refined_data.business_longitude.values,
                           height=10)
#x carries the latitude and y the longitude, so the axes are labelled accordingly
#(the labels were swapped before)
joint_grid.set_axis_labels('Latitude', 'Longitude', fontsize=12)
plt.show()

In [None]:
plt.scatter(refined_data.inspection_score, refined_data.business_longitude)
plt.title('inspection score vs Longitude')

In [None]:
plt.scatter(refined_data.inspection_score, refined_data.business_latitude)
plt.title('inspection score vs Latitude')

In [None]:
plt.scatter(refined_data.business_postal_code, refined_data.inspection_score)
plt.title('inspection score vs postal code')

In [None]:
plt.scatter(refined_data.inspection_score, refined_data.business_id)
plt.title('Business ID vs inspection score')

## Predict label and training data

In [None]:
#target: the encoded risk category; features: every remaining column
predict_label = refined_data['risk_category']
#keyword form: the positional axis argument is no longer accepted by pandas 2.x
train_data = refined_data.drop(columns='risk_category')

Feature Importance

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
#fixed seed so that the importance ranking, which decides the columns dropped in the
#following cells, is the same on every run
featmodel = ExtraTreesClassifier(random_state=1)
featmodel.fit(train_data,predict_label)
print(featmodel.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(featmodel.feature_importances_, index=train_data.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

In [None]:
train_data = train_data.drop(['business_postal_code','year','month'],axis=1)

In [None]:

train_data = train_data.drop(['Fire Prevention Districts'],axis=1)

In [None]:
train_data.to_csv('abc.csv')

## Creating train and test split

In [None]:
#data is shuffled by default before splitting
from sklearn.model_selection import train_test_split

#predict_label = refined_data['inspection_score']

#train_data = refined_data.drop(['business_postal_code', 'inspection_score', 'business_id'],axis=1)
#train_data = refined_data[['business_latitude', 'business_longitude'], axis=1]
x_train , x_test , y_train , y_test = train_test_split(train_data , predict_label , test_size = 0.20, random_state=1)

*Writing data to file*

In [None]:
#write every split to its own CSV file, without the index column
split_files = {
    "x_train_modified.csv": x_train,
    "y_train_modified.csv": y_train,
    "x_test_modified.csv": x_test,
    "y_test_modified.csv": y_test,
}
for file_name, split in split_files.items():
    split.to_csv(file_name, index=False)

*Loading data into dataframe from file to continue execution from here each time*

In [None]:
# import pandas as pd
# import numpy as np
# x_train = pd.read_csv('x_train_modified.csv')
# y_train = pd.read_csv('y_train_modified.csv')
# x_test = pd.read_csv('x_test_modified.csv')
# y_test = pd.read_csv('y_test_modified.csv')

## **Helper Methods**

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def displayMetrics(model, y_true, y_pred):
  """Print accuracy and the per-class precision, recall and F1 of a prediction.

  model  -- not used by the body
  y_true -- true labels
  y_pred -- predicted labels

  Precision, recall and F1 are computed with average=None, i.e. one value per class.
  """
  scorers = (
      ('Accuracy: ', accuracy_score, {}),
      ('Precision Score: ', precision_score, {'average': None}),
      ('Recall: ', recall_score, {'average': None}),
      ('F1 Score: ', f1_score, {'average': None}),
  )
  # each metric is computed and printed before the next one is evaluated
  for label, scorer, options in scorers:
    print(label, scorer(y_true, y_pred, **options))
  

In [None]:
from sklearn.metrics import auc, average_precision_score, f1_score, precision_recall_curve

def plotRecallPrecision(model, testX, testy):
  """Print F1 / PR-AUC / average precision and plot the precision-recall curve.

  model -- fitted estimator providing predict_proba and predict
  testX -- feature matrix to score
  testy -- true labels for testX

  NOTE(review): column 1 of predict_proba is used as the positive-class score and
  f1_score / precision_recall_curve are called with their defaults, which presumably
  only suits binary labels — confirm before using it with the risk_category target.
  """
  # probability of the class stored in column 1
  positive_probs = model.predict_proba(testX)[:, 1]
  # predicted class values
  predicted = model.predict(testX)
  # precision-recall curve (thresholds are not needed)
  precision, recall, _ = precision_recall_curve(testy, positive_probs)
  f1 = f1_score(testy, predicted)
  # the area is kept under its own name: assigning to `auc` turned it into a local
  # variable and the call failed with UnboundLocalError (auc was not imported either)
  pr_auc = auc(recall, precision)
  ap = average_precision_score(testy, positive_probs)
  print('f1=%.3f auc=%.3f ap=%.3f' % (f1, pr_auc, ap))
  # horizontal reference line at precision 0.5 ("no skill")
  plt.plot([0, 1], [0.5, 0.5], linestyle='--')
  # precision-recall curve of the model
  plt.plot(recall, precision, marker='.')
  plt.xlabel('Recall', fontsize=12)
  plt.ylabel('Precision', fontsize=12)
  plt.title('Precision-Recall Curve', fontsize=12)
  plt.show()

In [None]:
from sklearn.metrics import auc, average_precision_score, f1_score, precision_recall_curve

def plotRecallPrecisionSVM(model, testX, testy):
  """Print F1 / PR-AUC / average precision and plot the precision-recall curve,
  using decision_function scores instead of probabilities.

  model -- fitted estimator providing decision_function and predict
  testX -- feature matrix to score
  testy -- true labels for testX

  NOTE(review): f1_score / precision_recall_curve are called with their defaults,
  which presumably only suits binary labels — confirm before using it with the
  risk_category target.
  """
  # decision scores (not probabilities)
  scores = np.asarray(model.decision_function(testX))
  # a 1-D result is already one score per sample; indexing it with [:, 1] raised
  # IndexError. Only a 2-D result needs its column 1 selected.
  if scores.ndim > 1:
    scores = scores[:, 1]
  # predicted class values
  predicted = model.predict(testX)
  # precision-recall curve (thresholds are not needed)
  precision, recall, _ = precision_recall_curve(testy, scores)
  f1 = f1_score(testy, predicted)
  # the area is kept under its own name: assigning to `auc` turned it into a local
  # variable and the call failed with UnboundLocalError (auc was not imported either)
  pr_auc = auc(recall, precision)
  ap = average_precision_score(testy, scores)
  print('f1=%.3f auc=%.3f ap=%.3f' % (f1, pr_auc, ap))
  # horizontal reference line at precision 0.5 ("no skill")
  plt.plot([0, 1], [0.5, 0.5], linestyle='--')
  # precision-recall curve of the model
  plt.plot(recall, precision, marker='.')
  plt.xlabel('Recall', fontsize=12)
  plt.ylabel('Precision', fontsize=12)
  plt.title('Precision-Recall Curve', fontsize=12)
  plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
def printLinearModels(model, x_train, x_test, y_train, y_test):
  """Print the root mean squared error and R2 score of `model` on the test data.

  model            -- fitted estimator providing predict
  x_test, y_test   -- data the scores are computed on
  x_train, y_train -- not used by the body
  """
  y_predicted = model.predict(x_test)
  # mean_squared_error returns the MSE; take the square root so that the value
  # matches the "Root mean squared error" label it is printed under
  rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
  r2 = r2_score(y_test, y_predicted)

  # printing values
  print('Root mean squared error: ', rmse)
  print('R2 score: ', r2)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
def plotConfusionMatrix(y_true, y_pred):
  """Print a cross-tabulation of actual labels (rows) against predicted labels (columns)."""
  table = pd.crosstab(
      y_true,
      y_pred,
      rownames=['Actual'],
      colnames=['Predicted'],
  )
  print(table)

In [None]:
from sklearn import model_selection
# prepare configuration for cross validation test harness
seed = 7
scoring = 'accuracy'

def modelComparison(models, x_train, y_train):
  """Cross-validate several models and compare their scores in a box plot.

  models  -- iterable of estimators or of (name, estimator) tuples; a bare
             estimator is labelled with its class name
  x_train -- feature matrix handed to cross_val_score
  y_train -- labels handed to cross_val_score

  Every model is scored with 10 shuffled folds (seeded with the module-level
  `seed`) and the module-level `scoring` metric. Mean and standard deviation are
  printed per model, then one box per model is drawn.
  """
  results = []
  names = []
  for entry in models:
    # `name` was never defined before, so the first iteration raised NameError
    if isinstance(entry, tuple):
      name, model = entry
    else:
      name, model = type(entry).__name__, entry
    # random_state only takes effect together with shuffle=True; recent
    # scikit-learn versions reject random_state without it
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    # one label per box, otherwise set_xticklabels below receives an empty list
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
  # boxplot algorithm comparison
  fig = plt.figure()
  fig.suptitle('Algorithm Comparison')
  ax = fig.add_subplot(111)
  plt.boxplot(results)
  ax.set_xticklabels(names)
  plt.show()

*Cross validation using kfold*

In [None]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score

seed = 7
#random_state only takes effect together with shuffle=True; recent scikit-learn
#versions raise a ValueError when random_state is given without it
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

## **Linear Models**

In [None]:
results = []

Basic LinearRegression - Baseline Model 

In [None]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(x_train,y_train)
printLinearModels(reg, x_train, x_test, y_train, y_test)

In [None]:
#cross-validate on the training split: the test split is reserved for the final
#hold-out scores, and the training split gives larger folds
cv_results = model_selection.cross_val_score(reg, x_train, y_train, cv=kfold)
# results.append(cv_results)
msg = "%s: %f (%f)" % ('LinearRegression', cv_results.mean(), cv_results.std())
print(msg)

LogisticRegression - newton-cg

In [None]:
from sklearn.linear_model import LogisticRegression
#multi_class is left at its default, which is what 'auto' selected; passing the
#argument explicitly is deprecated in recent scikit-learn versions
logReg = LogisticRegression(solver='newton-cg', max_iter=1000)
logReg.fit(x_train,y_train)

In [None]:
y_pred = logReg.predict(x_test)
displayMetrics(logReg, y_test, y_pred)

In [None]:
print('\n Confusion matrix \n')
plotConfusionMatrix(y_test, y_pred)

In [None]:
#cross-validate on the training split: the test split is reserved for the final
#hold-out scores, and the training split gives larger folds
cv_results = model_selection.cross_val_score(logReg, x_train, y_train, cv=kfold)
results.append(cv_results)
msg = "%s: %f (%f)" % ('LogisticRegression-newton-cg', cv_results.mean(), cv_results.std())
print(msg)

LogisticRegression - saga

In [None]:
from sklearn.linear_model import LogisticRegression
#multi_class is left at its default, which is what 'auto' selected; passing the
#argument explicitly is deprecated in recent scikit-learn versions
logRegSaga = LogisticRegression(solver='saga', max_iter=1000)
logRegSaga.fit(x_train,y_train)

In [None]:
y_pred = logRegSaga.predict(x_test)
displayMetrics(logRegSaga, y_test, y_pred)

In [None]:
print('\n Confusion matrix \n')
plotConfusionMatrix(y_test, y_pred)

In [None]:
#cross-validate on the training split: the test split is reserved for the final
#hold-out scores, and the training split gives larger folds
cv_results = model_selection.cross_val_score(logRegSaga, x_train, y_train, cv=kfold)
results.append(cv_results)
msg = "%s: %f (%f)" % ('LogisticRegression-saga', cv_results.mean(), cv_results.std())
print(msg)

RidgeClassifier

In [None]:
from sklearn.linear_model import RidgeClassifier
ridgeCls = RidgeClassifier()
ridgeCls.fit(x_train, y_train)

In [None]:
y_pred = ridgeCls.predict(x_test)
displayMetrics(ridgeCls, y_test, y_pred)

In [None]:
refined_data.info()

In [None]:
y_train

In [None]:
print('\n Confusion matrix \n')
plotConfusionMatrix(y_test, y_pred)

In [None]:
#cross-validate on the training split: the test split is reserved for the final
#hold-out scores, and the training split gives larger folds
cv_results = model_selection.cross_val_score(ridgeCls, x_train, y_train, cv=kfold)
results.append(cv_results)
msg = "%s: %f (%f)" % ('RidgeClassifier', cv_results.mean(), cv_results.std())
print(msg)

SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier
sgdCls = SGDClassifier(max_iter=1000)
sgdCls.fit(x_train, y_train)

In [None]:
y_pred = sgdCls.predict(x_test)
displayMetrics(sgdCls, y_test, y_pred)

In [None]:
print('\n Confusion matrix \n')
plotConfusionMatrix(y_test, y_pred)

In [None]:
#cross-validate on the training split: the test split is reserved for the final
#hold-out scores, and the training split gives larger folds
cv_results = model_selection.cross_val_score(sgdCls, x_train, y_train, cv=kfold)
results.append(cv_results)
msg = "%s: %f (%f)" % ('SGDClassifier', cv_results.mean(), cv_results.std())
print(msg)

### Comparison of linear models

In [None]:
names=['LGRN','LGRS','RC','SGD']

fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

## **Geometric Models**

In [None]:
results = []

*Support Vector Machines*

In [None]:
from sklearn.svm import SVC
svcModel = SVC(gamma='auto', probability=True)
svcModel.fit(x_train,y_train)

In [None]:
y_pred = svcModel.predict(x_test)
displayMetrics(svcModel, y_test, y_pred)

In [None]:
print('\n Confusion matrix \n')
plotConfusionMatrix(y_test, y_pred)

In [None]:
#cross-validate on the training split: the test split is reserved for the final
#hold-out scores, and the training split gives larger folds
cv_results = model_selection.cross_val_score(svcModel, x_train, y_train, cv=kfold)
results.append(cv_results)
msg = "%s: %f (%f)" % ('SVC', cv_results.mean(), cv_results.std())
print(msg)

*KNearestNeighbours using manhattan distance*

In [None]:
from sklearn.neighbors import KNeighborsClassifier
#manhattan_distance
knnModel = KNeighborsClassifier(p=1, n_neighbors=8)
knnModel.fit(x_train,y_train)

In [None]:
y_pred = knnModel.predict(x_test)
displayMetrics(knnModel, y_test, y_pred)

In [None]:
print('\n Confusion matrix \n')
plotConfusionMatrix(y_test, y_pred)

In [None]:
#cross-validate on the training split: the test split is reserved for the final
#hold-out scores, and the training split gives larger folds
cv_results = model_selection.cross_val_score(knnModel, x_train, y_train, cv=kfold)
results.append(cv_results)
msg = "%s: %f (%f)" % ('KNN-M', cv_results.mean(), cv_results.std())
print(msg)

*KNearestNeighbours using euclidean distance*

In [None]:
#euclidean_distance
knnModelE = KNeighborsClassifier(p=2, n_neighbors=8)
knnModelE.fit(x_train,y_train)

In [None]:
y_pred = knnModelE.predict(x_test)
displayMetrics(knnModelE, y_test, y_pred)

In [None]:
print('\n Confusion matrix \n')
plotConfusionMatrix(y_test, y_pred)

In [None]:
#cross-validate on the training split: the test split is reserved for the final
#hold-out scores, and the training split gives larger folds
cv_results = model_selection.cross_val_score(knnModelE, x_train, y_train, cv=kfold)
results.append(cv_results)
msg = "%s: %f (%f)" % ('KNN-E', cv_results.mean(), cv_results.std())
print(msg)

### Comparison of geometric models

In [None]:
names=['SVC', 'KNN-M','KNN-E']

fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

## Probabilistic model

In [None]:
results = []

In [None]:
from sklearn.naive_bayes import GaussianNB
gnbModel = GaussianNB()
gnbModel.fit(x_train,y_train)

In [None]:
y_pred = gnbModel.predict(x_test)
displayMetrics(gnbModel, y_test, y_pred)

In [None]:
print('\n Confusion matrix \n')
plotConfusionMatrix(y_test, y_pred)

In [None]:
#cross-validate on the training split: the test split is reserved for the final
#hold-out scores, and the training split gives larger folds
cv_results = model_selection.cross_val_score(gnbModel, x_train, y_train, cv=kfold)
results.append(cv_results)
msg = "%s: %f (%f)" % ('GNB', cv_results.mean(), cv_results.std())
print(msg)

### Comparison of probabilistic models

In [None]:
names=['GNB']

fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

## **Tree Based Model**

In [None]:
results = []

In [None]:
from sklearn import tree
#'sqrt' is what max_features='auto' stood for in tree classifiers; the 'auto'
#spelling is no longer accepted by recent scikit-learn versions
treeModel = tree.DecisionTreeClassifier(max_depth=8, max_features='sqrt', min_samples_split = 4)
treeModel = treeModel.fit(x_train,y_train)

In [None]:
y_pred = treeModel.predict(x_test)
displayMetrics(treeModel, y_test, y_pred)

In [None]:
print('\n Confusion matrix \n')
plotConfusionMatrix(y_test, y_pred)

In [None]:
#cross-validate on the training split: the test split is reserved for the final
#hold-out scores, and the training split gives larger folds
cv_results = model_selection.cross_val_score(treeModel, x_train, y_train, cv=kfold)
results.append(cv_results)
msg = "%s: %f (%f)" % ('Tree', cv_results.mean(), cv_results.std())
print(msg)

In [None]:
import graphviz
dot_data = tree.export_graphviz(treeModel, out_file=None,
                               filled=True, rounded=True,
                               special_characters=True)
graph = graphviz.Source(dot_data)
graph
#graph.render("Restaurant risk indicator")


### Comparison of tree-based models

In [None]:
names=['tree']

fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

## **Ensemble**

In [None]:
results = []

*VotingClassifier*

In [None]:
from sklearn import ensemble
votModel = ensemble.VotingClassifier(estimators=[('RC', ridgeCls), ('KNN', knnModel), ('TR', treeModel)])
votModel.fit(x_train, y_train)

In [None]:
y_pred = votModel.predict(x_test)

In [None]:
# printLinearModels(reg, x_train, x_test, y_train, y_test)
displayMetrics(votModel, y_test, y_pred)

In [None]:
printLinearModels(votModel, x_train, x_test, y_train, y_test)

In [None]:
print('\n Confusion matrix \n')
plotConfusionMatrix(y_test, y_pred)

In [None]:
#cross-validate on the training split: the test split is reserved for the final
#hold-out scores, and the training split gives larger folds
cv_results = model_selection.cross_val_score(votModel, x_train, y_train, cv=kfold)
results.append(cv_results)
msg = "%s: %f (%f)" % ('Voting', cv_results.mean(), cv_results.std())
print(msg)

*BaggingClassifier*

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
baggingModel = BaggingClassifier(KNeighborsClassifier(p=1, n_neighbors=8))
baggingModel.fit(x_train,y_train)

In [None]:
y_pred = baggingModel.predict(x_test)
displayMetrics(baggingModel, y_test, y_pred)

In [None]:
print('\n Confusion matrix \n')
plotConfusionMatrix(y_test, y_pred)

In [None]:
#cross-validate on the training split: the test split is reserved for the final
#hold-out scores, and the training split gives larger folds
cv_results = model_selection.cross_val_score(baggingModel, x_train, y_train, cv=kfold)
results.append(cv_results)
msg = "%s: %f (%f)" % ('BC', cv_results.mean(), cv_results.std())
print(msg)

*RandomForestClassifier*

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfcModel = RandomForestClassifier(n_estimators=2, max_depth=6)
rfcModel.fit(x_train,y_train)

In [None]:
y_pred = rfcModel.predict(x_test)
displayMetrics(rfcModel, y_test, y_pred)

In [None]:
print('\n Confusion matrix \n')
plotConfusionMatrix(y_test, y_pred)

In [None]:
#cross-validate on the training split: the test split is reserved for the final
#hold-out scores, and the training split gives larger folds
cv_results = model_selection.cross_val_score(rfcModel, x_train, y_train, cv=kfold)
results.append(cv_results)
msg = "%s: %f (%f)" % ('RFC', cv_results.mean(), cv_results.std())
print(msg)

### Comparison of ensemble models

In [None]:
names=['VOT','BC','RFC']

fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

## Rule Based Models

*Apriori Algorithm*

In [None]:
pip install apyori

In [None]:
from apyori import apriori

In [None]:
#one transaction per training row, made of the string form of its first 7 columns.
#x_train.values is taken once: evaluating it inside the loop can rebuild the whole
#array for every single cell that is read
train_values = x_train.values
records = [
    [str(train_values[i, j]) for j in range(0, 7)]
    for i in range(0, train_values.shape[0])
]

In [None]:
association_rules = apriori(records, min_support=0.0045, min_confidence=0.2, min_lift=3, min_length=2)  
association_results = list(association_rules)  

In [None]:
print(len(association_results))
print(association_results[0])

In [None]:
for item in association_results:

    # item[2] holds the ordered statistics of the item set; the first entry carries
    # the rule direction (base items, added items) together with the confidence
    # and lift printed below. item[0] is an unordered set, so its iteration order
    # cannot be relied on to tell the base side from the added side.
    first_stat = item[2][0]
    base_items = ", ".join(sorted(first_stat[0]))
    add_items = ", ".join(sorted(first_stat[1]))
    print("Rule: " + base_items + " -> " + add_items)

    # second field of the record: support of the item set
    print("Support: " + str(item[1]))

    # confidence and lift of the same rule whose direction is printed above
    print("Confidence: " + str(first_stat[2]))
    print("Lift: " + str(first_stat[3]))
    print("=====================================")
    print("=====================================")