In [230]:
import folium
import warnings
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt


from IPython.display import IFrame
from mapping_functions import adding_Marker, geodataframe, chlorepleth_map
from violation_preprocessing import violation_count, violation_separator, violations_dataframe

# for logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

warnings.filterwarnings('ignore')

In [2]:
# import the cleaning dataset 
data = pd.read_csv('data/clean_dataset.csv', delimiter = ',')

# drop the unnamed column
data.drop(['Unnamed: 0'], axis = 1,inplace = True)

# convert the zip column into an str
data.zip = data.zip.astype(str)
    
# reformat the zip code writing in order to compare it with the zip code in geojson file (for vizualisation step)
data['zip'] = data['zip'].apply(lambda x : x.split('.')[0])

# show the dataframe
display(data.head(3))

Unnamed: 0,inspection_id,dba_name,aka_name,license,facility_type,risk,address,zip,inspection_date,inspection_type,results,violations,latitude,longitude,location
0,2352734,CHILI'S T-I,CHILI'S (T1-B14),34169.0,Restaurant,Risk 1 (High),11601 W TOUHY AVE,60666,2019-12-04,Canvass,Pass,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,42.008536,-87.914428,"{'latitude': '-87.91442843927047', 'longitude'..."
1,2352727,PORTAGE PARK DAY NURSERY,MOSAIC EARLY CHILDHOOD ACADEMY,2215815.0,Children's Services Facility,Risk 1 (High),5332-5334 W ADDISON ST,60641,2019-12-04,Canvass,Pass,,41.946065,-87.760722,"{'latitude': '-87.76072227616888', 'longitude'..."
2,2352738,AMARIT RESTAURANT,AMARIT RESTAURANT,1801618.0,Restaurant,Risk 1 (High),600 S DEARBORN ST,60605,2019-12-04,Canvass Re-Inspection,Pass,,41.874481,-87.629357,"{'latitude': '-87.62935653990546', 'longitude'..."


In [3]:
# creating our geodataframe based on the basic dataframe
gdf = geodataframe(data)
gdf.head(3)

Unnamed: 0,objectid,shape_area,shape_len,zip,geometry,centroid_lon,centroid_lat,facility_number_per_zip
0,33,106052287.488,42720.0444058,60647,(POLYGON ((-87.67762151065281 41.9177578010629...,-87.702259,41.921098,4628
1,34,127476050.762,48103.7827213,60639,(POLYGON ((-87.72683253163021 41.9226462671259...,-87.755996,41.920456,3436
2,35,45069038.4783,27288.6096123,60707,(POLYGON ((-87.78500237831095 41.9091478547167...,-87.795738,41.919948,735


### Creating a poisoning dataset

In [4]:
# creating a dataset with only the inspections due to poisoning
poisoning_data = data[data['inspection_type'] == 'Suspected Food Poisoning']

# counting the number of fail per zip code
poisoning_count_per_zip = pd.DataFrame(poisoning_data.groupby('zip')['results'].count()).reset_index()
poisoning_count_per_zip.rename(columns = {'results': 'poisoning_inpections_count'}, inplace = True)

In [5]:
# merge with gdf dataframe and facility count dataframe 
poisoning_gdf = pd.merge(poisoning_count_per_zip,gdf, on = 'zip')

# proportion of facility that passed the investigation per zip code
poisoning_gdf['poisoning_proportion'] = poisoning_gdf['poisoning_inpections_count'].divide(poisoning_gdf['facility_number_per_zip'])

### Getting e-coli data

In [6]:
ecoli_df = pd.read_csv('data/beach-e.-coli-predictions.csv',delimiter = ',')

# filter to only keep values over value recommended by US Environmental Protection Agency (USEPA) 
data_filtered = ecoli_df[ecoli_df['Predicted Level'] > 235]

# counting the number of reports of high concentration on each beach
high_ecoli_concentration = pd.DataFrame(data_filtered.groupby('Beach Name')['Predicted Level'].count())
high_ecoli_concentration.reset_index(inplace = True)

In [7]:
# merging the data_filtered dataframe with the high_ecoli_concentration dataframe to have beaches' location
high_ecoli_concentration_location = pd.merge(data_filtered,high_ecoli_concentration,on = 'Beach Name', how = 'inner')

# groupby beaches and creating a dataframe
high_ecoli_concentration_location = high_ecoli_concentration_location.groupby('Beach Name').first()

high_ecoli_concentration_location = pd.DataFrame(high_ecoli_concentration_location)
high_ecoli_concentration_location.rename(columns = {'Predicted Level_y': 'High measure count'}, inplace=True)
high_ecoli_concentration_location.reset_index(inplace = True)
high_ecoli_concentration_location.sort_values(by=['High measure count'], ascending = False)

high_ecoli_concentration_location.head(3)

Unnamed: 0,Beach Name,Date,Prediction Source,Predicted Level_x,RecordID,Latitude,Longitude,Location,High measure count
0,12th Street,2017-06-05T00:00:00,DNA Model,240.8,12thStreet20170605,41.8638,-87.6082,"{'needs_recoding': False, 'longitude': '-87.60...",47
1,57th Street,2017-07-20T00:00:00,DNA Model,312.6,57thStreet20170720,41.7911,-87.5797,"{'needs_recoding': False, 'longitude': '-87.57...",9
2,Foster,2017-05-26T00:00:00,DNA Model,334.7,Foster20170526,41.9785,-87.6515,"{'needs_recoding': False, 'longitude': '-87.65...",49


In [8]:
# map showing e-coli concentration and food poisoning occurences
IFrame(src = 'maps/ecoli_wpoisoning.html', width = 700, height = 600)

# Observational study


> From the map we can see that there are some zip codes that have a high amount of food inspections due to food poisoning and at the same time have high concentration of e-coli. We now want to see what differentiates the facilities that have a lower frequency of food inspections due to food poisoning, but where the concentration of e-coli is still high. 

In [9]:
# creating map with markers to get the zip codes that we want to look at. 
zip_poisoning = chlorepleth_map ('Poisoning proportion',poisoning_gdf,['zip','poisoning_proportion'],'Suspected poisoning proportion','RdPu')

for i in range(len(poisoning_gdf)):
    popup = str(poisoning_gdf['zip'].values[i]) 
    colour = 'purple'
    adding_Marker(zip_poisoning, poisoning_gdf.centroid_lon.values[i], poisoning_gdf.centroid_lat.values[i], popup, colour)

#zip_poisoning.save('zip_poisoning.html')
IFrame(src = 'maps/zip_poisoning.html', width = 700, height = 600)

for comparison we will first look at zip code: 60660 which have a low occurence of inspections due to food poisoning but a high concentration of e-coli and zip-code: 60616 which has a high occurence of inspections due to food poisoning, and a high concentration of e-coli.

What differentiates the two areas?

In [321]:
# getting zip with low occurence of food inspections
zip_60660_low = data[data['zip'] == '60660']

# getting zip with high occurence of food inspections
zip_60616_high = data[data['zip'] == '60616']

# gather all the data in new dataframes
data_zips = dict()

zip_60660_low.head(2)

Unnamed: 0,inspection_id,dba_name,aka_name,license,facility_type,risk,address,zip,inspection_date,inspection_type,results,violations,latitude,longitude,location
294,2345717,DEVON MARKET,DEVON MARKET,1042888.0,Grocery Store,Risk 1 (High),1440 W DEVON AVE,60660,2019-11-15,Short Form Complaint,Fail,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,41.998228,-87.667104,"{'latitude': '-87.66710446646866', 'longitude'..."
463,2321186,WHITE CASTLE #61,WHITE CASTLE #61,28483.0,Restaurant,Risk 2 (Medium),5940 N RIDGE AVE,60660,2019-11-06,Complaint Re-Inspection,Pass,"55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ...",41.989677,-87.670207,"{'latitude': '-87.6702068164878', 'longitude':..."


In [322]:
# first look at number of inspections in total
data_zips['#inspections'] = [len(zip_60660_low), len(zip_60616_high)]

# look at number of facilities
data_zips['#facilities'] = [zip_60660_low.license.count(), zip_60616_high.license.count()]

# look at type of facilites with count
fac_60660 = dict(zip_60660_low.facility_type.value_counts())
fac_60616 = dict(zip_60616_high.facility_type.value_counts())
data_zips['facility_types'] = [fac_60660, fac_60616]

# compare risk levels, list with risk levels from high risk (3) to low risk (1)
data_zips['risk_levels'] = [list(zip_60660_low['risk'].value_counts()), list(zip_60616_high['risk'].value_counts())]

# compare results: list with following order: Pass, fail, Pass w/conditions
data_zips['results'] = [list(zip_60660_low['results'].value_counts()), list(zip_60616_high['risk'].value_counts())]

# compare inspection types
data_zips['inspection_types'] = [dict(zip_60660_low['inspection_type'].value_counts()), dict(zip_60616_high['inspection_type'].value_counts())]

# get the amount of the different violations for each zip code
violation_60660 = violation_count(zip_60660_low.violations)
violation_60616 = violation_count(zip_60616_high.violations)
data_zips['violations'] = [violation_60660, violation_60660]

In [437]:
obs_df = pd.DataFrame(data_zips, index = ['60660_low', '60616_high'])
obs_df

Unnamed: 0,#inspections,#facilities,facility_types,risk_levels,results,inspection_types,violations
60660_low,2102,2102,"{'Restaurant': 1540, 'Grocery Store': 224, 'Sc...","[1650, 361, 91]","[1216, 478, 408]","{'Canvass': 1268, 'Canvass Re-Inspection': 401...","{'#5': 164, '#14': 29, '#25': 60, '#51': 56, '..."
60616_high,3425,3425,"{'Restaurant': 2450, 'Grocery Store': 277, 'Sc...","[2623, 733, 69]","[2623, 733, 69]","{'Canvass': 1917, 'Complaint': 504, 'Canvass R...","{'#5': 164, '#14': 29, '#25': 60, '#51': 56, '..."


## Logistic Regression 

In [416]:
# need to have the two datasets 
ml_60660_low = zip_60660_low[['risk', 'zip', 'inspection_type', 'results', 'violations', 'inspection_id', 'license', 'facility_type']]
ml_60616_high = zip_60616_high[['risk', 'zip', 'inspection_type', 'results', 'violations', 'inspection_id', 'license', 'facility_type']]

In [417]:
# setting the target values 
ml_60660_low['e_coli'] = [0]*len(ml_60660_low)
ml_60616_high['e_coli'] = [1]*len(ml_60616_high)

# combining into one single dataset
ml_data = ml_60616_high.append(ml_60660_low)

In [418]:
# get the violations as a truth table 
violation_separated = ml_data.violations.apply(violation_separator).fillna(0)

In [419]:
# we create our dataframe with all the violations counts, we will use the truth-table
violations_thruth, violations_dataframe_df = violations_dataframe(violation_separated, ml_data)

In [420]:
ml_data = pd.concat([violations_thruth, ml_data], axis=1)

In [421]:
def facility_thruth(dataframe):
    '''
    Function that gives representation of the different facility counts for the different facilities. 

    Parameters
    ----------

    dataframe: pandas.Dataframe
        dataframe with facilities

    Returns
    -------
    df_with_fac_count: pandas.Dataframe
        dataframe with facility counts 
    '''            
    
    # putting everything in lowercase for comparison
    dataframe['facility_type'] = dataframe['facility_type'].str.lower()
    
    # getting all the different facility types
    for fac in dataframe.facility_type:
        if type(fac) == str:
            fac = fac.lower()
            if fac not in dataframe.columns:
                dataframe[fac] = [0]*len(dataframe)
    
    # assigning 1 to get which facility type it is
    for fac in dataframe.facility_type:
        if type(fac) == str:
            dataframe.loc[dataframe.facility_type == fac, fac] = 1

    return dataframe


In [422]:
# getting the facility thruth table
ml_data = facility_thruth(ml_data)

In [433]:
ml_data = ml_data.dropna(axis='columns')


In [435]:
# associate a number to each value of the risk factor
risk_mapper = {'Risk 1 (High)': 3,'Risk 2 (Medium)': 2,'Risk 3 (Low)': 1}

# associate a number to each value of the results
result_mapper = {'Pass': 1,'Pass w/ Conditions': 0,'Fail': -1}

# replace them in the dataframe
ml_data['risk'] = ml_data['risk'].replace(risk_mapper).astype(int)
ml_data['results'] = ml_data['results'].replace(result_mapper).astype(int)

# Hot encoding of the 'zip' and 'inspection_type' features
ml_data = pd.get_dummies(ml_data, columns = ['inspection_type', 'zip'])
ml_data.head()

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,...,inspection_type_Suspected Food Poisoning Re-inspection,inspection_type_Canvass,inspection_type_Canvass Re-Inspection,inspection_type_Complaint,inspection_type_Complaint Re-Inspection,inspection_type_Short Form Complaint,inspection_type_Suspected Food Poisoning,inspection_type_Suspected Food Poisoning Re-inspection.1,zip_60616,zip_60660
604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,1,0
652,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,1,0
653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,1,0
665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,1,0,0,0,0,0,1,0
740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,1,0


In [436]:
# Make the features and target dataframes and display the features dataframe
target_ml = ml_data.e_coli
features_ml = ml_data.drop(['violations', 'zip_60616', 'zip_60660', 'e_coli', 'inspection_id', 'license', 'facility_type'], axis = 1)
features_ml.head()

KeyError: "['violations' 'zip_60616' 'zip_60660' 'e_coli' 'inspection_id' 'license'\n 'facility_type'] not found in axis"

In [383]:
#x_train, x_test, y_train, y_test = train_test_split(features_ml, target_ml, test_size=0.20, random_state=0)

In [384]:
logisticRegr = LogisticRegression(solver='lbfgs')
#x_train.shape
#y_train.shape

In [385]:
logisticRegr.fit(features_ml, target_ml)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
precision = cross_val_score(logisticRegr, features_ml, target_ml, cv=10, scoring="precision")
recall = cross_val_score(logisticRegr, features_ml, target_ml, cv=10, scoring="recall")

print(precision)
print(recall)

In [None]:
logisticRegr.predict_proba([[0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
#accuracy_per_sample = logisticRegr.predict_proba(features_ml)
mean_accuracy = logisticRegr.score(features_ml, target_ml)

In [None]:
mean_accuracy