In [1]:
# Imports - these are all the imports needed for the assignment
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import nltk package 
#   PennTreeBank word tokenizer 
#   English language stopwords
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

# scikit-learn imports
#   SVM (Support Vector Machine) classifier
#   Vectorizer, which transforms text data into bag-of-words features
#   TF-IDF Vectorizer, which transforms text data into features that down-weight words used widely across the dataset
#   Metrics functions to evaluate performance
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [2]:
# Fetch the NLTK data packages named 'punkt' and 'stopwords'.
# NOTE(review): presumably these back the word_tokenize and stopwords imports above - confirm.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/amv036/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/amv036/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1087]:
# Load the raw CSV inputs. Identifier and ZIP columns are forced to `str` so
# that pandas does not infer a numeric dtype for them.
inspections_filepath = "inspections.csv"
inspections_df = pd.read_csv(inspections_filepath)

restaurants_filepath = "restaurants.csv"
restaurants_df = pd.read_csv(restaurants_filepath, dtype={
    "hsisid": str,
    # The column is spelled "postalcode" (it is renamed to "zip" during
    # cleaning); the previous key "postal_code" matched no column, so the
    # requested dtype was never applied.
    "postalcode": str,
})

violations_filepath = "violations.csv"
violations_df = pd.read_csv(violations_filepath, dtype={
    "hsisid": str,
})

yelp_filepath = "yelp.csv"
yelp_df = pd.read_csv(yelp_filepath, dtype={
    "zip_code": str
})

zipcodes_filepath = "zipcodes.csv"
zipcodes_df = pd.read_csv(zipcodes_filepath)

In [1088]:
def convert_restaurant_names(name_in):
    """Normalize a restaurant name so the same business compares equal across datasets.

    The name is lowercased, a trailing store number (``#NUMBER``) is removed,
    ampersands are spelled out as "and", and a set of known chains / known
    spelling variants is collapsed onto one canonical name each.

    Parameters
    ----------
    name_in : str
        Raw restaurant name.

    Returns
    -------
    str
        The normalized name with surrounding whitespace removed.
    """
    # Make the input all lowercase
    name_in = name_in.lower()

    # Remove leading and trailing whitespace (inner whitespace is kept)
    name_in = name_in.strip()

    # Chain restaurants are of the format 'chain name #NUMBER'.
    # Remove '#NUMBER' (and anything after it) if present.
    hash_index = name_in.find('#')
    if hash_index != -1:
        name_in = name_in[0:hash_index]

    name_in = name_in.replace("i pre-security", "")
    # Replace the HTML entity first so its '&' is not rewritten on its own.
    name_in = name_in.replace("&amp;", "and")
    name_in = name_in.replace("&", "and")

    # Collapse known chains / spelling variants onto one canonical name.
    # Order matters: the first matching rule wins.
    if "subway" in name_in:
        name_in = "subway"
    elif "mcdonald's" in name_in or "mcdonalds" in name_in:
        name_in = "mcdonald's"
    elif "starbuck" in name_in:
        name_in = "starbuck"
    elif "dunkin donuts" in name_in:
        name_in = "dunkin donuts"
    elif "arby's" in name_in:
        name_in = "arby's"
    elif "jersey mike" in name_in:
        name_in = "jersey mike"
    elif "wendy's" in name_in:
        name_in = "wendy's"
    elif "jamba j" in name_in:
        name_in = "jamba juice"
    elif "culver" in name_in:
        name_in = "culver's"
    elif "einstein" in name_in and "bagel" in name_in:
        name_in = "einstein bros bagel"
    elif "whataburger" in name_in:
        name_in = "whataburger"
    elif "church's chicken" in name_in:
        # Previously mapped to the misspelled "church's hicken".
        name_in = "church's chicken"
    elif "five guys" in name_in:
        name_in = "five guys"
    elif "chick-fil-a" in name_in:
        name_in = "chick-fil-a"
    elif "aubrey" in name_in and "peedi" in name_in and "grill" in name_in:
        name_in = "aubrey and peedies grill"
    elif "skipper's fish fry" in name_in or "skipper`s fish fry and market" == name_in:
        name_in = "skipper's fish fry"
    elif "dean's seafood grill" in name_in:
        name_in = "dean's seafood grill and bar"
    elif "salvio" in name_in and "pizzeria" in name_in:
        name_in = "salvio's pizzeria"
    elif "manchester" in name_in and "grill" in name_in:
        name_in = "manchester bar and grill"
    elif "sam" in name_in and "club" in name_in:
        name_in = "sam's club"
    elif "sami" in name_in and "pizza" in name_in and "more" in name_in:
        name_in = "sami's subs, pizza, and more"

    if name_in == "spring cafe 2":
        name_in = "spring cafe"

    return name_in.strip()

assert convert_restaurant_names("chick-fil-a #1573") == 'chick-fil-a'
assert convert_restaurant_names("taco bell #22798") == 'taco bell'
assert convert_restaurant_names("chick-fil-a of knightdale") == "chick-fil-a"
assert convert_restaurant_names("church's chicken #12") == "church's chicken"

In [1209]:
def convert_time(str_in):
    """Parse ``str_in`` with ``pd.to_datetime`` and return the result."""
    parsed = pd.to_datetime(str_in)
    return parsed

In [1210]:
def convert_zip(zipcode):
    """Reduce a ZIP code string to the part before its first hyphen.

    A code of the form "AAAAA-BBBB" is a more specific version of "AAAAA",
    so only the text before the first hyphen is kept. Surrounding whitespace
    is removed from the result.
    """
    base, _hyphen, _extension = zipcode.partition('-')
    return base.strip()


assert convert_zip("12345-678") == "12345"
assert convert_zip("12345") == "12345"

In [1211]:
# Build the cleaned restaurant table from the raw one:
#   * keep only rows whose facility type is "Restaurant",
#   * drop the columns that are not used,
#   * rename the remaining columns,
#   * normalize the name and ZIP code and parse the opening date.
is_restaurant = restaurants_df["facilitytype"] == "Restaurant"

restaurants_df_clean = (
    restaurants_df.loc[is_restaurant]
    .drop(columns=[
        "X.objectid",
        "name",
        "state",
        "address1",
        "address2",
        "city",
        "phonenumber",
        "geocodestatus",
        "x",
        "y",
        "facilitytype",
    ])
    .rename(columns={
        "hsisid": "id",
        "postalcode": "zip",
        "restaurantopendate": "open_date",
    })
    # Explicit copy so the column assignments below write to an independent
    # frame rather than to a slice of restaurants_df.
    .copy()
)

# The raw name was dropped above; re-add it in normalized form (aligned on the row index).
restaurants_df_clean["name"] = restaurants_df.loc[is_restaurant, "name"].apply(convert_restaurant_names)
restaurants_df_clean["zip"] = restaurants_df_clean["zip"].apply(convert_zip)
restaurants_df_clean["open_date"] = restaurants_df_clean["open_date"].apply(convert_time)

restaurant_ids = restaurants_df_clean["id"]

# Every restaurant id must appear exactly once. The previous check only
# verified that all ids shared the same count, which would also pass if
# every id were duplicated the same number of times.
assert restaurants_df_clean["id"].is_unique, "duplicate restaurant ids after cleaning"

In [1212]:
# Preview the first rows of the cleaned restaurant table.
restaurants_df_clean.head()

Unnamed: 0,id,zip,open_date,name
0,4092017230,27616,2016-05-26 00:00:00+00:00,spring cafe
2,4092014444,27587,2005-12-05 00:00:00+00:00,taco bell
3,4092015333,27601,2009-02-04 00:00:00+00:00,the remedy diner
6,4092016679,27601,2014-04-23 00:00:00+00:00,bittersweet
7,4092014493,27617,2006-01-31 00:00:00+00:00,chick-fil-a


* **DATA IS FROM NORTH CAROLINA**
* Only care about restaurants ('facilitytype' in restaurants_df)
* Use 'hsisid' ('id') to ID locations and connect to violations
* Ignore address, name, phone number
* Yelp rating: 1-5 stars
* "Violations" with penalty = 0 are not actually violations but acknowledgements of good practice
* Many restaurants may be at same address
* Assuming the Yelp "rating" is the average across all reviews (treating it as a quantitative value), rounded to the nearest half

In [1282]:
# Build the cleaned violations table in one chain: drop unused columns,
# rename the rest, parse the inspection date, then remove exact duplicate rows.
violations_df_clean = (
    violations_df
    .drop(columns=[
        "X.objectid",
        "statecode",
        "questionno",
        "violationcode",
        "inspectedby",
        "cdcriskfactor",
        "cdcdataitem",
        "violationtype",
        "observationtype",
        "count",
        "category",
    ])
    .rename(columns={
        "hsisid": "id",
        "pointvalue": "penalty",
        "shortdesc": "description",
        "inspectdate": "inspect_date",
    })
    .assign(inspect_date=lambda frame: frame["inspect_date"].apply(convert_time))
    .drop_duplicates()
)

In [1283]:
# Preview the first rows of the cleaned violations table.
violations_df_clean.head()

Unnamed: 0,id,inspect_date,critical,severity,description,comments,penalty
0,4092015279,2014-09-22 00:00:00+00:00,Yes,Priority Foundation,"Toxic substances properly identified, stored, ...",7-102.11; Priority Foundation - Found unlabele...,0
1,4092014572,2014-09-29 00:00:00+00:00,Yes,Priority Foundation,"Toxic substances properly identified, stored, ...",7-102.11; Priority Foundation; One sanitizer b...,0
2,4092015906,2014-10-01 00:00:00+00:00,Yes,Priority Foundation,"Toxic substances properly identified, stored, ...",7-102.11; Priority Foundation - Found an unlab...,1
3,4092013840,2014-10-08 00:00:00+00:00,Yes,Priority Foundation,"Toxic substances properly identified, stored, ...",7-102.11; Priority Foundation - Found unlabele...,0
4,4092021788,2014-10-09 00:00:00+00:00,Yes,Priority Foundation,"Toxic substances properly identified, stored, ...",7-102.11; Priority Foundation - Found one unla...,0


In [1284]:
# Count how many violation rows fall under each severity label.
violations_df_clean["severity"].value_counts()

Core                   111640
Priority Foundation     32366
Priority                29285
Name: severity, dtype: int64

In [1285]:
# Count how many violation rows carry each penalty value.
violations_df_clean["penalty"].value_counts()

0    130724
1     45021
2      4857
3      1020
4         6
Name: penalty, dtype: int64

In [1286]:
# Show the violation rows whose penalty equals 4.
violations_df_clean[violations_df_clean["penalty"] == 4]

Unnamed: 0,id,inspect_date,critical,severity,description,comments,penalty
57634,4092010071,2013-03-05 00:00:00+00:00,No,Core,Hands clean & properly washed,2-301.12 (C) TO avoid recontaminating their ha...,4
58414,4092012304,2012-12-06 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,P - 2-301.14 - Observed worker wash his hands ...,4
58478,4092013716,2013-02-21 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,2-301.14 An employee was observed scrubbing a...,4
67549,4092013070,2013-08-19 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,2-301.14; Priority; Employee working the gril...,4
69327,4092021677,2016-02-05 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,2-301.14; Priority; Employees must wash hands ...,4
162894,4092014362,2013-10-17 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,2-301.14; Priority; The cook handled raw chick...,4


In [1281]:
# Show the violation rows whose penalty equals 4.
# NOTE(review): this repeats the lookup in the preceding cell, and its execution
# count (1281) is lower than that of the violations cleaning cell (1282). The saved
# output was presumably captured before that cell's final version ran - re-run or
# remove this cell.
violations_df_clean[(violations_df_clean["penalty"] == 4)]

Unnamed: 0,id,inspect_date,critical,severity,description,comments,penalty
57634,4092010071,2013-03-05 00:00:00+00:00,No,Core,Hands clean & properly washed,2-301.12 (C) TO avoid recontaminating their ha...,4
58414,4092012304,2012-12-06 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,P - 2-301.14 - Observed worker wash his hands ...,4
58478,4092013716,2013-02-21 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,2-301.14 An employee was observed scrubbing a...,4
67139,4092012304,2012-12-06 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,P - 2-301.14 - Observed worker wash his hands ...,4
67549,4092013070,2013-08-19 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,2-301.14; Priority; Employee working the gril...,4
67550,4092013070,2013-08-19 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,2-301.14; Priority; Employee working the gril...,4
69327,4092021677,2016-02-05 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,2-301.14; Priority; Employees must wash hands ...,4
78139,4092021677,2016-02-05 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,2-301.14; Priority; Employees must wash hands ...,4
162811,4092013716,2013-02-21 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,2-301.14 An employee was observed scrubbing a...,4
162894,4092014362,2013-10-17 00:00:00+00:00,Yes,Priority,Hands clean & properly washed,2-301.14; Priority; The cook handled raw chick...,4


In [1216]:
# Preview the first rows of the cleaned restaurant table.
# NOTE(review): same preview as the one shown right after the restaurant cleaning cell.
restaurants_df_clean.head()

Unnamed: 0,id,zip,open_date,name
0,4092017230,27616,2016-05-26 00:00:00+00:00,spring cafe
2,4092014444,27587,2005-12-05 00:00:00+00:00,taco bell
3,4092015333,27601,2009-02-04 00:00:00+00:00,the remedy diner
6,4092016679,27601,2014-04-23 00:00:00+00:00,bittersweet
7,4092014493,27617,2006-01-31 00:00:00+00:00,chick-fil-a


In [1217]:
# Build the cleaned Yelp table. Only four columns are kept, so they are
# selected directly instead of listing every other column in a drop call.
# The earlier drop list repeated "seafood" and "price", and it dropped "id"
# right before a rename of "id" that therefore had no effect.
yelp_df_clean = (
    yelp_df[["zip_code", "name", "rating", "review_count"]]
    .rename(columns={"zip_code": "zip"})
    .assign(
        # Normalize ZIP codes and names with the same helpers used for the
        # restaurant table so the two tables can be joined on (zip, name).
        zip=lambda frame: frame["zip"].apply(convert_zip),
        name=lambda frame: frame["name"].apply(convert_restaurant_names),
    )
)

In [1218]:
# Preview the first rows of the cleaned Yelp table.
yelp_df_clean.head()

Unnamed: 0,zip,name,rating,review_count
0,27560,42nd street oyster bar - rdu airport,2.5,23
1,27519,aldi foods,4.0,17
2,27612,luciano pizzeria ristorante,5.0,2
3,27893,olive garden italian restaurant,3.0,16
4,27896,subway,3.5,3


In [1219]:
# Attach Yelp ratings to restaurants by matching on ZIP code and normalized
# name, discard restaurants without a rating, and fix the column order.
merged_df = (
    restaurants_df_clean
    .merge(yelp_df_clean, how="left", on=["zip", "name"])
    .dropna(axis="index", how="any", subset=["rating"])
)

merged_df = merged_df[["id", "zip", "open_date", "name", "rating", "review_count"]]

In [1220]:
# Inspect the merged rows for one restaurant id; the saved output shows several
# rows for the same id with different ratings and review counts.
merged_df[merged_df["id"] == "4092015626"]

Unnamed: 0,id,zip,open_date,name,rating,review_count
1516,4092015626,27529,2010-04-27 00:00:00+00:00,mcdonald's,2.0,4.0
1517,4092015626,27529,2010-04-27 00:00:00+00:00,mcdonald's,2.5,3.0
1518,4092015626,27529,2010-04-27 00:00:00+00:00,mcdonald's,3.5,4.0
1519,4092015626,27529,2010-04-27 00:00:00+00:00,mcdonald's,2.0,2.0
1520,4092015626,27529,2010-04-27 00:00:00+00:00,mcdonald's,1.5,9.0


In [1221]:
# Collect every merged row whose (name, zip) pair occurs more than once.
shares_name_and_zip = merged_df.duplicated(subset=["name", "zip"], keep=False)
duplicates = merged_df.loc[shares_name_and_zip]
duplicates

Unnamed: 0,id,zip,open_date,name,rating,review_count
16,4092014233,27526,2005-01-26 00:00:00+00:00,sheetz,2.0,3.0
17,4092014233,27526,2005-01-26 00:00:00+00:00,sheetz,4.0,1.0
18,4092014233,27526,2005-01-26 00:00:00+00:00,sheetz,4.5,8.0
19,4092014269,27526,2005-02-10 00:00:00+00:00,subway,3.0,1.0
32,4092016069,27502,2011-12-15 00:00:00+00:00,subway,4.0,2.0
...,...,...,...,...,...,...
2428,4092015870,27616,2011-01-20 00:00:00+00:00,mcdonald's,3.0,1.0
2435,4092016557,27587,2013-10-31 00:00:00+00:00,burger king,3.0,3.0
2437,4092017274,27587,2016-06-24 00:00:00+00:00,harris teeter,3.0,10.0
2438,4092017274,27587,2016-06-24 00:00:00+00:00,harris teeter,3.5,9.0


In [1222]:
# Inspect the merged rows for one restaurant id before the weighted columns are added.
merged_df[merged_df["id"] == "4092014233"]

Unnamed: 0,id,zip,open_date,name,rating,review_count
16,4092014233,27526,2005-01-26 00:00:00+00:00,sheetz,2.0,3.0
17,4092014233,27526,2005-01-26 00:00:00+00:00,sheetz,4.0,1.0
18,4092014233,27526,2005-01-26 00:00:00+00:00,sheetz,4.5,8.0


In [1223]:
# Weight each rating by its share of the restaurant's total review count:
# weighted_rating = review_count / sum(review_count for that id) * rating.
g = merged_df.groupby(by="id")
review_share = merged_df["review_count"] / g["review_count"].transform("sum")
merged_df["weighted_rating"] = review_share * merged_df["rating"]
merged_df.head()

Unnamed: 0,id,zip,open_date,name,rating,review_count,weighted_rating
0,4092017230,27616,2016-05-26 00:00:00+00:00,spring cafe,3.5,35.0,3.5
1,4092014444,27587,2005-12-05 00:00:00+00:00,taco bell,3.0,8.0,3.0
2,4092015333,27601,2009-02-04 00:00:00+00:00,the remedy diner,4.0,347.0,4.0
3,4092016679,27601,2014-04-23 00:00:00+00:00,bittersweet,4.0,150.0,4.0
4,4092014493,27617,2006-01-31 00:00:00+00:00,chick-fil-a,3.5,20.0,3.5


In [1224]:
# Inspect the same restaurant id again, now with the weighted_rating column present.
merged_df[merged_df["id"] == "4092014233"]

Unnamed: 0,id,zip,open_date,name,rating,review_count,weighted_rating
16,4092014233,27526,2005-01-26 00:00:00+00:00,sheetz,2.0,3.0,0.5
17,4092014233,27526,2005-01-26 00:00:00+00:00,sheetz,4.0,1.0,0.333333
18,4092014233,27526,2005-01-26 00:00:00+00:00,sheetz,4.5,8.0,3.0


In [1225]:
# Sum the per-row weighted ratings within each id; every row of a restaurant
# receives that restaurant's review-count-weighted mean rating.
per_restaurant = merged_df.groupby("id")
merged_df["weighted_mean"] = per_restaurant["weighted_rating"].transform("sum")
merged_df.loc[merged_df["id"] == "4092014233"]

Unnamed: 0,id,zip,open_date,name,rating,review_count,weighted_rating,weighted_mean
16,4092014233,27526,2005-01-26 00:00:00+00:00,sheetz,2.0,3.0,0.5,3.833333
17,4092014233,27526,2005-01-26 00:00:00+00:00,sheetz,4.0,1.0,0.333333,3.833333
18,4092014233,27526,2005-01-26 00:00:00+00:00,sheetz,4.5,8.0,3.0,3.833333


In [1228]:
# Collapse merged_df to one row per restaurant id.
# review_count becomes the total over all rows matched to that id, and the
# per-row rating columns are removed, leaving weighted_mean as the rating.
total_reviews = merged_df.groupby("id")["review_count"].transform("sum")

deduped_merged_df = (
    merged_df
    .drop_duplicates(subset="id")
    # assign/drop return new frames, so nothing is written to a slice of
    # merged_df; total_reviews is aligned to the kept rows by index.
    .assign(review_count=total_reviews)
    .drop(columns=["weighted_rating", "rating"])
)

deduped_merged_df.head()

Unnamed: 0,id,zip,open_date,name,review_count,weighted_mean
0,4092017230,27616,2016-05-26 00:00:00+00:00,spring cafe,35.0,3.5
1,4092014444,27587,2005-12-05 00:00:00+00:00,taco bell,8.0,3.0
2,4092015333,27601,2009-02-04 00:00:00+00:00,the remedy diner,347.0,4.0
3,4092016679,27601,2014-04-23 00:00:00+00:00,bittersweet,150.0,4.0
4,4092014493,27617,2006-01-31 00:00:00+00:00,chick-fil-a,20.0,3.5


In [1230]:
# Preview the first rows of the cleaned violations table.
# NOTE(review): the saved output lists category, violationtype and count columns,
# which the violations cleaning cell drops - the output looks stale; re-run this cell.
violations_df_clean.head()

Unnamed: 0,id,inspect_date,category,critical,severity,description,penalty,violationtype,count
0,4092015279,2014-09-22 00:00:00+00:00,Chemical,Yes,Priority Foundation,"Toxic substances properly identified, stored, ...",0,R,
1,4092014572,2014-09-29 00:00:00+00:00,Chemical,Yes,Priority Foundation,"Toxic substances properly identified, stored, ...",0,CDI,
2,4092015906,2014-10-01 00:00:00+00:00,Chemical,Yes,Priority Foundation,"Toxic substances properly identified, stored, ...",1,CDI,
3,4092013840,2014-10-08 00:00:00+00:00,Chemical,Yes,Priority Foundation,"Toxic substances properly identified, stored, ...",0,CDI,
4,4092021788,2014-10-09 00:00:00+00:00,Chemical,Yes,Priority Foundation,"Toxic substances properly identified, stored, ...",0,CDI,
