In [16]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import re

from scipy import stats
from sklearn.model_selection import train_test_split, validation_curve, cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

from violation_preprocessing import violation_separator, violations_dataframe

> First, we load the dataset.

In [17]:
# Load the cleaned inspections dataset and discard the leftover 'Unnamed: 0' column
data = (
    pd.read_csv('data/clean_dataset.csv', delimiter=',')
    .drop(columns=['Unnamed: 0'])
)


In [18]:
# Preview the first three rows of the loaded dataset
display(data.head(n=3))

Unnamed: 0,inspection_id,dba_name,aka_name,license,facility_type,risk,address,zip,inspection_date,inspection_type,results,violations,latitude,longitude,location
0,2346127,CREPE HOUSE CAFE,CREPE HOUSE CAFE,2637127.0,Restaurant,Risk 1 (High),5033 N ELSTON AVE,60630.0,2019-11-22,Canvass Re-Inspection,Pass,,41.972349,-87.746825,"{'latitude': '-87.74682508578468', 'longitude'..."
1,2346126,LEANS NUTRITION CLUB,LEANS NUTRITION CLUB,2432147.0,Restaurant,Risk 2 (Medium),5302 S PULASKI RD,60632.0,2019-11-22,Canvass Re-Inspection,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.796699,-87.723408,"{'latitude': '-87.72340756196249', 'longitude'..."
2,2346149,PINKS CHILD CARE ACADEMY II,PINK'S CHILD CARE ACADEMY II,2215652.0,Daycare Combo 1586,Risk 1 (High),8000 S KEDZIE AVE,60652.0,2019-11-22,Canvass,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.747934,-87.702489,"{'latitude': '-87.70248876124471', 'longitude'..."


# Violation characterization 

> Next, we split the violations from the comments in the 'violations' column, and represent each violation with a binary value.

**Note**: The two functions used below come from the *violation_preprocessing* module that we created.

In [19]:
# Apply violation_separator to every entry of the 'violations' column;
# entries for which the result is missing are replaced by 0
violation_separated = data['violations'].apply(violation_separator).fillna(0)

In [22]:
# Build the two violation dataframes:
#   - violations_dataframe:   critical / serious / minor counts per inspection
#   - violations_dataframe02: one binary column per violation number
# NOTE(review): `Violations_Dataframe` is neither defined nor imported in this notebook;
# the builder imported at the top is `violations_dataframe` -- confirm it is the intended one.
# It is called through its module so that this cell can still be re-run after the name
# `violations_dataframe` has been rebound to the resulting dataframe.
import violation_preprocessing

violations_dataframe, violations_dataframe02 = violation_preprocessing.violations_dataframe(
    violation_separated, data)

# A single print: wrapping print() in print() also printed the inner call's return value (None)
print('\n\033[1m Violations per inspection (binary value representation) \033[0m')
display(violations_dataframe02.head(3))

print('\n\033[1m Number of critical ,serious and minor count per inspection \033[0m')
display(violations_dataframe.head(3))



[1m Violations per inspection (binary value representation) [0m
None


Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,...,#42,#43,#44,#70,inspection_id,license,zip,risk,results,inspection_type
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2346127,2637127.0,60630.0,Risk 1 (High),Pass,Canvass Re-Inspection
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2346126,2432147.0,60632.0,Risk 2 (Medium),Pass w/ Conditions,Canvass Re-Inspection
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,2346149,2215652.0,60652.0,Risk 1 (High),Fail,Canvass



[1m Number of critical ,serious and minor count per inspection [0m
None


Unnamed: 0,critical_count,serious_count,minor_count,inspection_id,license,zip,risk,results,inspection_type,violations_count
0,0.0,0.0,0.0,2346127,2637127.0,60630.0,Risk 1 (High),Pass,Canvass Re-Inspection,0.0
1,1.0,0.0,0.0,2346126,2432147.0,60632.0,Risk 2 (Medium),Pass w/ Conditions,Canvass Re-Inspection,2.0
2,0.666667,0.0,0.333333,2346149,2215652.0,60652.0,Risk 1 (High),Fail,Canvass,3.0


# Applied Machine Learning 

> We decided to go for a classification model, since the variables that we want to predict (type of inspection and result) take limited discrete values (e.g. Fail, Pass, Pass w/ Conditions for the variable 'results').

> We tested different classification models in order to obtain the best accuracy. First, we need to optimize the Hyper-Parameter of each model, before computing the performance of the tuned model on the test set. Since these algorithms are going to be used several times, we created functions for the respective optimization and performance testing of each model.

### Ridge Classification 

In [23]:
def Ridge_classification_optimization(X_train, y_train):
    '''
    Optimize the regularization parameter 𝛼 of the Ridge classification model.

    The candidate values of 𝛼 are compared with 'validation_curve', using the mean
    accuracy over the cross-validation folds as the selection criterion.

    Parameters
    -----------
    X_train, y_train: pandas.DataFrame
        Training features and training target

    Returns
    --------
    best_param_val_r: float
        The 𝛼 with the highest mean cross-validation accuracy
    ridge_pipe: sklearn.pipeline.Pipeline
        The pipeline (StandardScaler followed by RidgeClassifier) that was tuned,
        to be handed to 'Ridge_classification'
    '''

    seed = 0

    # Ridge classifier pipeline: standardize the features, then classify
    ridge_pipe = Pipeline([
        ('standardize', StandardScaler()),
        ('ridge_clf', RidgeClassifier()),
    ])

    # We use the accuracy obtained on the cross validation folds as a metric
    score = 'accuracy'
    n_splits = 3

    # Cross-validation generator for 'validation_curve'.
    # shuffle=True is required for random_state to have any effect; recent
    # scikit-learn versions raise a ValueError when a seed is given without it.
    cv_schema = KFold(n_splits = n_splits, shuffle = True, random_state = seed)

    # Tune the model against a single hyper parameter
    tuning_param = 'ridge_clf__alpha'
    tuning_param_range = np.logspace(-4, 5, 10)

    # Compute the cross-validation scores for every candidate value
    # (the training scores are returned as well but are not needed here)
    _, cv_scores_val = validation_curve(
        ridge_pipe, X_train, y_train, param_name = tuning_param, param_range = tuning_param_range,
        cv = cv_schema, scoring = score, n_jobs = -1)

    # Keep the candidate with the highest mean accuracy across the folds
    mean_cv_scores = np.mean(cv_scores_val, axis=1)
    best_param_val_r = tuning_param_range[np.argmax(mean_cv_scores)]

    return best_param_val_r , ridge_pipe

In [24]:
def Ridge_classification(best_param_val_r, X_train, y_train, X_test, y_test,ridge_pipe):
    '''
    Fit the ridge pipeline with the selected 𝛼 and print its accuracy on the test set.

    Parameters
    -----------
    best_param_val_r: float
        The optimal 𝛼

    X_train, y_train, X_test, y_test: pandas.DataFrame
        Training and test sets

    ridge_pipe: sklearn.pipeline.Pipeline
        Pipeline containing a step named 'ridge_clf'; it is modified and fitted in place

    Returns
    -------
    None (the accuracy is only printed)
    '''
    # Plug the tuned alpha into the pipeline, then train it on the training set
    ridge_pipe.set_params(**{'ridge_clf__alpha': best_param_val_r})
    ridge_pipe.fit(X_train, y_train)

    # Compare the predictions on the held-out set with the true labels
    test_accuracy = accuracy_score(y_test, ridge_pipe.predict(X_test))
    print ('Ridge classifer accuracy = %2.4f' %test_accuracy)

### Random Forest

In [37]:
def Rf_optimization(X_train, y_train):
    '''
    Optimize the number of trees (n_estimators) of the Random Forest classification model.

    The candidate values are compared with 'validation_curve', using the mean
    accuracy over the cross-validation folds as the selection criterion.

    Parameters
    -----------
    X_train, y_train: pandas.DataFrame
        Training features and training target

    Returns
    --------
    best_param_val_rf: int
        The n_estimators value with the highest mean cross-validation accuracy
    pipeline: sklearn.pipeline.Pipeline
        The single-step pipeline ('rf_clf') that was tuned,
        to be handed to 'Rf_classification'
    '''
    seed = 1

    # Random forest pipeline; the forest is seeded so that repeated runs
    # give the same trees and therefore the same scores
    pipeline = Pipeline([
        ('rf_clf', RandomForestClassifier(random_state = seed)),
    ])

    # Fixed parameters
    score = 'accuracy'
    tuning_param_range = list(range(10, 101, 10))   # 10, 20, ..., 100 trees
    tuning_param = 'rf_clf__n_estimators'

    # shuffle=True is required for random_state to have any effect; recent
    # scikit-learn versions raise a ValueError when a seed is given without it.
    cv_schema = KFold(n_splits = 5, shuffle = True, random_state = seed)

    # Tune hyper parameter using validation curve
    # (the training scores are returned as well but are not needed here)
    _, cv_scores_val = validation_curve(
        pipeline, X_train, y_train, param_name = tuning_param, param_range = tuning_param_range,
        cv = cv_schema, scoring = score, n_jobs = -1)

    # Keep the candidate with the highest mean accuracy across the folds
    mean_cv_scores = np.mean(cv_scores_val, axis=1)
    best_param_val_rf = tuning_param_range[np.argmax(mean_cv_scores)]

    return best_param_val_rf,pipeline

In [38]:
def Rf_classification(best_param_val_rf, X_train, y_train, X_test, y_test,pipeline):
    '''
    Fit the Random Forest pipeline with the selected n_estimators and print
    its accuracy on the test set.

    Parameters
    -----------
    best_param_val_rf: int
        The best parameter value

    X_train, y_train, X_test, y_test: pandas.DataFrame
        Training and test sets

    pipeline: sklearn.pipeline.Pipeline
        Pipeline containing a step named 'rf_clf'; it is modified and fitted in place

    Returns
    -------
    None (the accuracy is only printed)
    '''
    # Plug the tuned number of trees into the pipeline, then train it
    pipeline.set_params(**{'rf_clf__n_estimators': best_param_val_rf})
    pipeline.fit(X_train, y_train)

    # Compare the predictions on the held-out set with the true labels
    test_accuracy = accuracy_score(y_test, pipeline.predict(X_test))
    print ('rf classifer accuracy = %2.4f' %test_accuracy)

### Gradient Boosted Trees

In [42]:
def GBT_optimization(X_train, y_train):
    '''
    Optimize the number of boosting rounds (n_estimators) of the Gradient Boosted
    Trees classification model.

    The candidate values are compared with 'validation_curve', using the mean
    accuracy over the cross-validation folds as the selection criterion.

    Parameters
    -----------
    X_train, y_train: pandas.DataFrame
        Training features and training target

    Returns
    --------
    best_param_val_GBT: int
        The n_estimators value with the highest mean cross-validation accuracy
    '''

    gb_pipe = Pipeline([
        ('XGB_clf', XGBClassifier()),
    ])

    # Fixed parameters
    score = 'accuracy'
    n_splits = 3
    seed = 0

    # CV schema.
    # shuffle=True is required for random_state to have any effect; recent
    # scikit-learn versions raise a ValueError when a seed is given without it.
    cv_schema = KFold(n_splits = n_splits, shuffle = True, random_state = seed)

    # Tune model against a single hyper parameter
    tuning_param = 'XGB_clf__n_estimators'
    tuning_param_range = list(range(10, 31, 5))   # 10, 15, 20, 25, 30

    # Re-encode the labels as 0..k-1: recent xgboost releases reject other label
    # sets (e.g. {-1, 0, 1} or 1..7). Accuracy is unaffected by the relabeling.
    _, y_encoded = np.unique(y_train, return_inverse = True)

    # Tune hyper parameter using validation curve
    # (the training scores are returned as well but are not needed here)
    _, cv_scores_val = validation_curve(
        gb_pipe, X_train, y_encoded, param_name = tuning_param, param_range = tuning_param_range,
        cv = cv_schema, scoring = score, n_jobs = -1)

    # Keep the candidate with the highest mean accuracy across the folds
    mean_cv_scores = np.mean(cv_scores_val, axis=1)
    best_param_val_GBT = tuning_param_range[np.argmax(mean_cv_scores)]

    return best_param_val_GBT 

In [43]:
def GBT_classification(best_param_val_GBT, X_train, y_train, X_test, y_test):
    '''
    Create the model for Gradient Boosted Trees classification with the best parameter value.
    Compute and print the performance of the tuned model on the test set.

    Parameters
    -----------
    best_param_val_GBT: int
        The best parameter value (n_estimators)

    X_train, y_train, X_test, y_test: pandas.DataFrame
        Training and test sets

    Returns
    -------
    None (the accuracy is only printed)
    '''
    # Re-encode the training labels as 0..k-1: recent xgboost releases reject other
    # label sets (e.g. {-1, 0, 1} or 1..7). 'classes' keeps the original labels so
    # that the predictions can be mapped back before scoring.
    classes, y_train_encoded = np.unique(y_train, return_inverse = True)

    optimal_xgb =  XGBClassifier(n_estimators=best_param_val_GBT)
    optimal_xgb.fit(X_train, y_train_encoded)

    # Decode the predicted class indices back to the original label values
    predicted_index = np.asarray(optimal_xgb.predict(X_test)).astype(int)
    xgb_pred = classes[predicted_index]
    acc_xgb = accuracy_score(y_test, xgb_pred)

    print("Gradient Boosted trees model score : " + str(acc_xgb))

## 1) Predicting the inspection result

> In this section, we provide a proof of concept for a machine learning model that can predict the inspection outcome (result) for a given food facility.

### Preprocessing

> The features that we decided to use are the following; we can safely discard the rest of the columns.

In [29]:
# Columns kept for this task: the target ('results') followed by the features
restaurant_data = [
    'results',
    'inspection_type',
    'violations_count',
    'zip',
    'risk',
    'critical_count',
    'serious_count',
    'minor_count',
]

> We start by creating a dataframe containing the columns of interest.

In [30]:
# Work on an explicit copy: df_01 is modified in the following cells, and assigning
# into a bare selection of another dataframe triggers pandas' SettingWithCopyWarning
df_01 = violations_dataframe[restaurant_data].copy()
df_01.head()

Unnamed: 0,results,inspection_type,violations_count,zip,risk,critical_count,serious_count,minor_count
0,Pass,Canvass Re-Inspection,0.0,60630.0,Risk 1 (High),0.0,0.0,0.0
1,Pass w/ Conditions,Canvass Re-Inspection,2.0,60632.0,Risk 2 (Medium),1.0,0.0,0.0
2,Fail,Canvass,3.0,60652.0,Risk 1 (High),0.666667,0.0,0.333333
3,Pass w/ Conditions,Canvass,0.0,60623.0,Risk 1 (High),0.0,0.0,0.0
4,Pass w/ Conditions,Canvass,1.0,60625.0,Risk 1 (High),1.0,0.0,0.0


> Since our final goal will be to train a linear machine learning model, all of the features we use need to be numerical. Some features like the 'inspection_type' and 'risk' are non-numeric and non-atomic. We transform these features into numerical features. As the risk feature is ordinal (from low to high), we choose to encode it accordingly instead of using one hot encoding (get_dummies). We do the same for the inspection result.

In [31]:
# Ordinal code for each risk level (higher number = higher risk)
risk_mapper = {'Risk 1 (High)': 3, 'Risk 2 (Medium)': 2, 'Risk 3 (Low)': 1}

# Ordinal code for each inspection result
result_mapper = {'Pass': 1, 'Pass w/ Conditions': 0, 'Fail': -1}

# Substitute the codes in the dataframe and store them as integers
for column, mapper in (('risk', risk_mapper), ('results', result_mapper)):
    df_01[column] = df_01[column].replace(mapper).astype(int)

df_01.head()

Unnamed: 0,results,inspection_type,violations_count,zip,risk,critical_count,serious_count,minor_count
0,1,Canvass Re-Inspection,0.0,60630.0,3,0.0,0.0,0.0
1,0,Canvass Re-Inspection,2.0,60632.0,2,1.0,0.0,0.0
2,-1,Canvass,3.0,60652.0,3,0.666667,0.0,0.333333
3,0,Canvass,0.0,60623.0,3,0.0,0.0,0.0
4,0,Canvass,1.0,60625.0,3,1.0,0.0,0.0


> We encode the remaining categorical features using one-hot encoding (`get_dummies`).

In [32]:
# One-hot encode the categorical 'inspection_type' and 'zip' features
df_01 = pd.get_dummies(
    df_01,
    columns=['inspection_type', 'zip'],
)
df_01.head()

Unnamed: 0,results,violations_count,risk,critical_count,serious_count,minor_count,inspection_type_Canvass,inspection_type_Canvass Re-Inspection,inspection_type_Complaint,inspection_type_Complaint Re-Inspection,...,zip_60654.0,zip_60655.0,zip_60656.0,zip_60657.0,zip_60659.0,zip_60660.0,zip_60661.0,zip_60666.0,zip_60707.0,zip_60827.0
0,1,0.0,3,0.0,0.0,0.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2.0,2,1.0,0.0,0.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1,3.0,3,0.666667,0.0,0.333333,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0.0,3,0.0,0.0,0.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1.0,3,1.0,0.0,0.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


> Then, we build both the target dataframe (variable to predict) and the features dataframe.

In [33]:
# Separate the target ('results') from the features and preview the features
target_df_01 = df_01['results']
features_df_01 = df_01.drop(columns=['results'])
features_df_01.head()

Unnamed: 0,violations_count,risk,critical_count,serious_count,minor_count,inspection_type_Canvass,inspection_type_Canvass Re-Inspection,inspection_type_Complaint,inspection_type_Complaint Re-Inspection,inspection_type_Short Form Complaint,...,zip_60654.0,zip_60655.0,zip_60656.0,zip_60657.0,zip_60659.0,zip_60660.0,zip_60661.0,zip_60666.0,zip_60707.0,zip_60827.0
0,0.0,3,0.0,0.0,0.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,2,1.0,0.0,0.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.0,3,0.666667,0.0,0.333333,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,3,0.0,0.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,3,1.0,0.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


> Now, we split the data randomly into a train (80%) and a test set (20%), using 1 as the random seed. 

In [34]:
# 80% train / 20% test, with a fixed seed
X_train_01, X_test_01, y_train_01, y_test_01 = train_test_split(
    features_df_01,
    target_df_01,
    test_size=0.2,
    random_state=1,
)

> Next, we optimize the different models according to their corresponding Hyper-parameters, and compute their accuracy on the test set.

In [35]:
# Ridge classification

# Tune alpha by cross-validation on the training set
best_param_val_r_01, ridgepipe = Ridge_classification_optimization(
    X_train=X_train_01, y_train=y_train_01)

# Refit with the best alpha and report the accuracy on the test set
Ridge_classification(
    best_param_val_r=best_param_val_r_01,
    X_train=X_train_01, y_train=y_train_01,
    X_test=X_test_01, y_test=y_test_01,
    ridge_pipe=ridgepipe,
)

Ridge classifer accuracy = 0.8198


In [40]:
# Random Forest classification

# Tune the number of trees by cross-validation on the training set
best_param_val_rf_01, pipeline_rf = Rf_optimization(
    X_train=X_train_01, y_train=y_train_01)

# Refit with the best value and report the accuracy on the test set
Rf_classification(
    best_param_val_rf=best_param_val_rf_01,
    X_train=X_train_01, y_train=y_train_01,
    X_test=X_test_01, y_test=y_test_01,
    pipeline=pipeline_rf,
)

rf classifer accuracy = 0.8824


In [44]:
# Gradient Boosted Trees classification

# Tune the number of estimators by cross-validation on the training set
best_param_val_GBT_01 = GBT_optimization(X_train=X_train_01, y_train=y_train_01)

# Refit with the best value and report the accuracy on the test set
GBT_classification(
    best_param_val_GBT=best_param_val_GBT_01,
    X_train=X_train_01, y_train=y_train_01,
    X_test=X_test_01, y_test=y_test_01,
)

Gradient Boosted trees model score : 0.8868036937742032


**Conclusions**: The model that gave the highest accuracy (~0.89) for the set of features used is the Gradient Boosted Trees. However, this accuracy is suspiciously high: the violation-related features are only known once the inspection has taken place, so they are "future" variables that leak information about the outcome and probably dominate the other features. 

## 2) Predicting the inspection type from the violations

> Here, we demonstrate the feasibility of a machine learning model that can predict the inspection type (e.g. suspected food poisoning) for a given food facility, given the result of the inspection and, in particular, the violations committed.

### Preprocessing


> For each inspection shown below, we retrieved and encoded with a binary value the different violations committed during that inspection.

In [45]:
# Preview the per-violation binary representation (first five rows)
violations_dataframe02.head(n=5)

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,...,#42,#43,#44,#70,inspection_id,license,zip,risk,results,inspection_type
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2346127,2637127.0,60630.0,Risk 1 (High),Pass,Canvass Re-Inspection
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2346126,2432147.0,60632.0,Risk 2 (Medium),Pass w/ Conditions,Canvass Re-Inspection
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,2346149,2215652.0,60652.0,Risk 1 (High),Fail,Canvass
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2346103,2215467.0,60623.0,Risk 1 (High),Pass w/ Conditions,Canvass
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2346063,2215787.0,60625.0,Risk 1 (High),Pass w/ Conditions,Canvass


> As usual, we select the features and target variable from the dataframe, and turn categorical data into numerical features. 

In [46]:
# Drop the identifier columns, which are not used as features
df_02 = violations_dataframe02.drop(columns=['inspection_id', 'license'])

# Encode 'risk' and 'results' as ordinal integers
for column, mapper in (('risk', risk_mapper), ('results', result_mapper)):
    df_02[column] = df_02[column].replace(mapper).astype(int)

# One-hot encode the 'zip' feature
df_02 = pd.get_dummies(df_02, columns=['zip'])

df_02.head()

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,...,zip_60654.0,zip_60655.0,zip_60656.0,zip_60657.0,zip_60659.0,zip_60660.0,zip_60661.0,zip_60666.0,zip_60707.0,zip_60827.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


> Next, we need to transform the target categorical variable ('inspection_type') into a numerical variable in order to use it in our model.

In [47]:
# List the distinct inspection types present in the data
df_02.inspection_type.unique()

array(['Canvass Re-Inspection', 'Canvass', 'Complaint',
       'Short Form Complaint', 'Complaint Re-Inspection',
       'Suspected Food Poisoning',
       'Suspected Food Poisoning Re-inspection'], dtype=object)

In [48]:
# Integer label for each inspection type (the target of this task)
inspection_mapper = {
    'Canvass': 1,
    'Complaint': 2,
    'Canvass Re-Inspection': 3,
    'Short Form Complaint': 4,
    'Suspected Food Poisoning': 5,
    'Complaint Re-Inspection': 6,
    'Suspected Food Poisoning Re-inspection': 7,
}

# Substitute the labels in the dataframe and store them as integers
df_02 = df_02.assign(
    inspection_type=df_02['inspection_type'].replace(inspection_mapper).astype(int)
)

df_02.head()

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,...,zip_60654.0,zip_60655.0,zip_60656.0,zip_60657.0,zip_60659.0,zip_60660.0,zip_60661.0,zip_60666.0,zip_60707.0,zip_60827.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Verify that every inspection type has been converted to its integer label
df_02.inspection_type.unique()

array([3, 1, 2, 4, 6, 5, 7])

> Then, we build the features and target dataframes.

In [50]:
# Separate the target ('inspection_type') from the features and preview the features
target_df_02 = df_02['inspection_type']
features_df_02 = df_02.drop(columns=['inspection_type'])
features_df_02.head()

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,...,zip_60654.0,zip_60655.0,zip_60656.0,zip_60657.0,zip_60659.0,zip_60660.0,zip_60661.0,zip_60666.0,zip_60707.0,zip_60827.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


> Now, we split the data randomly into a train (80%) and a test set (20%), using 1 as the random seed. Next, we optimize the different models according to their corresponding Hyper-parameters, and compute their accuracy on the test set.

In [51]:
# 80% train / 20% test, with a fixed seed
X_train_02, X_test_02, y_train_02, y_test_02 = train_test_split(
    features_df_02,
    target_df_02,
    test_size=0.2,
    random_state=1,
)

In [53]:
# Ridge classification

# Tune alpha by cross-validation on the training set
best_param_val_r_02, ridgepipe_02 = Ridge_classification_optimization(
    X_train=X_train_02, y_train=y_train_02)

# Refit with the best alpha and report the accuracy on the test set
Ridge_classification(
    best_param_val_r=best_param_val_r_02,
    X_train=X_train_02, y_train=y_train_02,
    X_test=X_test_02, y_test=y_test_02,
    ridge_pipe=ridgepipe_02,
)

Ridge classifer accuracy = 0.6157


In [54]:
# Random Forest classification

# Tune the number of trees by cross-validation on the training set
# (despite its name, 'ridgepipe_rf_02' holds the random forest pipeline)
best_param_val_rf_02, ridgepipe_rf_02 = Rf_optimization(
    X_train=X_train_02, y_train=y_train_02)

# Refit with the best value and report the accuracy on the test set
Rf_classification(
    best_param_val_rf=best_param_val_rf_02,
    X_train=X_train_02, y_train=y_train_02,
    X_test=X_test_02, y_test=y_test_02,
    pipeline=ridgepipe_rf_02,
)

rf classifer accuracy = 0.6114


In [55]:
# Gradient Boosted Trees classification

# Tune the number of estimators by cross-validation on the training set
best_param_val_GBT_02 = GBT_optimization(X_train=X_train_02, y_train=y_train_02)

# Refit with the best value and report the accuracy on the test set
GBT_classification(
    best_param_val_GBT=best_param_val_GBT_02,
    X_train=X_train_02, y_train=y_train_02,
    X_test=X_test_02, y_test=y_test_02,
)

Gradient Boosted trees model score : 0.6159517426273459


**Conclusions**: The classification accuracy is quite low overall. This means that our models will predict the correct inspection type (e.g. the poisoning cases) with an accuracy of ~61%. We want to do better for the future and we believe that this could be achieved by a better preprocessing (e.g. better feature selection) of the data.

## 3) Predicting inspection due to food poisoning

> In this part, we focus only on predicting the inspections due to suspected food poisoning. If this works, we can then predict whether a food facility is unsafe based on the other inspection types.

### Preprocessing


> The only difference from the task above is that we will encode the inspection type as follows: 
- 1 for inspections due to suspected food poisoning
- 0 for the others (canvass, complaints and re-inspections)

In [95]:
# Start again from the binary violation representation, without the identifier columns
df_03 = violations_dataframe02.drop(columns=['inspection_id', 'license'])
df_03.head()

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,...,#40,#41,#42,#43,#44,#70,zip,risk,results,inspection_type
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,60630.0,Risk 1 (High),Pass,Canvass Re-Inspection
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,60632.0,Risk 2 (Medium),Pass w/ Conditions,Canvass Re-Inspection
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,60652.0,Risk 1 (High),Fail,Canvass
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,60623.0,Risk 1 (High),Pass w/ Conditions,Canvass
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,60625.0,Risk 1 (High),Pass w/ Conditions,Canvass


In [96]:
# Associate a number to each value of the risk factor
risk_mapper = {'Risk 1 (High)':3,'Risk 2 (Medium)':2,'Risk 3 (Low)':1}

# Associate a number to each value of the results
result_mapper = {'Pass':1,'Pass w/ Conditions':0,'Fail':-1}

# Binary target: 1 for 'Suspected Food Poisoning', 0 for every other inspection type.
# A dedicated name is used so that the multi-class 'inspection_mapper' defined for the
# previous task is not overwritten (re-running that task afterwards would otherwise
# silently use the binary encoding).
poisoning_mapper = {'Canvass Re-Inspection':0,'Canvass':0,'Complaint':0,'Complaint Re-Inspection':0,'Suspected Food Poisoning Re-inspection':0,'Short Form Complaint':0,'Suspected Food Poisoning':1}

# Replace them in the dataframe
df_03['risk'] = df_03['risk'].replace(risk_mapper).astype(int)
df_03['results'] = df_03['results'].replace(result_mapper).astype(int)
df_03['inspection_type'] = df_03['inspection_type'].replace(poisoning_mapper).astype(int)

df_03.head()

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,...,#40,#41,#42,#43,#44,#70,zip,risk,results,inspection_type
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,60630.0,3,1,0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,60632.0,2,0,0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,60652.0,3,-1,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,60623.0,3,0,0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,60625.0,3,0,0


In [97]:
# Verify that the target now only takes the values 0 and 1
df_03.inspection_type.unique()

array([0, 1])

In [99]:
# One-hot encode the 'zip' feature
df_03 = pd.get_dummies(
    df_03,
    columns=['zip'],
)

In [100]:
# Separate the binary target ('inspection_type') from the features and preview the features
target_df_03 = df_03['inspection_type']
features_df_03 = df_03.drop(columns=['inspection_type'])
features_df_03.head()

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,...,zip_60654.0,zip_60655.0,zip_60656.0,zip_60657.0,zip_60659.0,zip_60660.0,zip_60661.0,zip_60666.0,zip_60707.0,zip_60827.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
# 80% train / 20% test, with a fixed seed
X_train_03, X_test_03, y_train_03, y_test_03 = train_test_split(
    features_df_03,
    target_df_03,
    test_size=0.2,
    random_state=1,
)

In [102]:
# Ridge classification

# Tune alpha by cross-validation on the training set
best_param_val_r_03, ridgepipe_03 = Ridge_classification_optimization(
    X_train=X_train_03, y_train=y_train_03)

# Refit with the best alpha and report the accuracy on the test set
Ridge_classification(
    best_param_val_r=best_param_val_r_03,
    X_train=X_train_03, y_train=y_train_03,
    X_test=X_test_03, y_test=y_test_03,
    ridge_pipe=ridgepipe_03,
)

Ridge classifer accuracy = 0.9933


In [105]:
# Random Forest classification

# Tune the number of trees by cross-validation on the training set
# (despite its name, 'ridgepipe_rf_03' holds the random forest pipeline)
best_param_val_rf_03, ridgepipe_rf_03 = Rf_optimization(
    X_train=X_train_03, y_train=y_train_03)

# Refit with the best value and report the accuracy on the test set
Rf_classification(
    best_param_val_rf=best_param_val_rf_03,
    X_train=X_train_03, y_train=y_train_03,
    X_test=X_test_03, y_test=y_test_03,
    pipeline=ridgepipe_rf_03,
)

rf classifer accuracy = 0.9926


In [106]:
# Gradient Boosted Trees classification

# Tune the number of estimators by cross-validation on the training set
best_param_val_GBT_03 = GBT_optimization(X_train=X_train_03, y_train=y_train_03)

# Refit with the best value and report the accuracy on the test set
GBT_classification(
    best_param_val_GBT=best_param_val_GBT_03,
    X_train=X_train_03, y_train=y_train_03,
    X_test=X_test_03, y_test=y_test_03,
)

Gradient Boosted trees model score : 0.9932975871313673


**Conclusions:** By changing the way we encoded the *inspection_type*, we could predict the inspections due to suspected food poisoning more accurately. However, this accuracy seems too high to be real. This is probably due to overfitting, but it may also simply reflect class imbalance: if suspected-food-poisoning inspections are rare, a model that always predicts 0 would reach a similar accuracy. We need to investigate this further. 

**Future Work :** 
- Investigate further the accuracy value, which seems to be very high (overfitting?). Try to overcome this issue by improving our model.
- Find statistical coefficients for the features used to predict the inspections due to food poisoning.
- Create a score using weights reflecting the statistical coefficients. The aim of such a score is to evaluate food safety within a facility based on the inspections made by staff from the Chicago Department of Public Health’s Food Protection Program using a standardized procedure. Using this score, we can try to prevent food poisoning based on the outcome of other inspections such as canvass, complaint, ... 
- Finally, we can test the workability of our score against real poisoning complaints from [I was poisoned](https://iwaspoisoned.com/location/united-states/illinois/chicago#botpress).