In [145]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import re

from scipy import stats
from sklearn.model_selection import train_test_split, validation_curve, cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

> First, we load the dataset.

In [2]:
# Read the cleaned dataset and drop its 'Unnamed: 0' column in one step
data = pd.read_csv('clean_dataset.csv', delimiter=',').drop(columns=['Unnamed: 0'])


In [3]:
# Preview the first three rows of the loaded dataframe
display(data.head(3))

Unnamed: 0,inspection_id,dba_name,aka_name,license,facility_type,risk,address,zip,inspection_date,inspection_type,results,violations,latitude,longitude,location
0,2345959,DUNKIN DONUTS,DUNKIN DONUTS,1803815.0,Restaurant,Risk 2 (Medium),4453 S ARCHER AVE,60632.0,2019-11-20,Canvass,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.812307,-87.706919,"{'longitude': '41.81230749019629', 'latitude':..."
1,2345980,A & J KRAZY KITCHEN,A & J KRAZY KITCHEN,2570118.0,Restaurant,Risk 1 (High),7547 W IRVING PARK RD,60634.0,2019-11-20,Complaint,Fail,25. CONSUMER ADVISORY PROVIDED FOR RAW/UNDERCO...,41.952437,-87.816496,"{'longitude': '41.95243739681394', 'latitude':..."
2,2345921,"NEW BANPOJUNG, INC.",Ban Po Chung,1847417.0,Restaurant,Risk 1 (High),3450 W FOSTER AVE,60625.0,2019-11-19,Canvass,Pass,49. NON-FOOD/FOOD CONTACT SURFACES CLEAN - Com...,41.975778,-87.715618,"{'longitude': '41.97577826268285', 'latitude':..."


# Violation characterization 

> Next, we split the violations from the comments in the 'violations' column, and represent each violation with a binary value.

In [4]:
# these functions will be taken from class later. 

#function that split the violations number from the comments
def violation_separator(violations):
    '''
    Turn one cell of the 'violations' column into binary violation flags.

    Parameters
    -----------
    violations: str or any
        Raw cell content. Individual violations are separated by ' | ' and each
        one starts with its number followed by a dot (e.g. '3. MANAGEMENT ...').
        Any non-string value (such as NaN for a missing entry) is treated as
        "no violation".

    Returns
    --------
    pandas.Series
        Float series indexed by '#<number>' holding 1 for every violation
        listed in the cell; empty when there is nothing to parse.
    '''
    # Explicit dtype: an empty Series without one is deprecated and its default
    # dtype differs between pandas versions.
    if not isinstance(violations, str):
        return pd.Series(dtype='float64')

    flags = {}
    for violation in violations.split(' | '):
        # Everything before the first '.' is the violation number
        flags['#' + violation.split('.')[0].strip()] = 1.0

    return pd.Series(flags, dtype='float64')

In [55]:
# function that takes a data with  violations by number and categorize them by category, before doing the count 
def Violations_Dataframe (separated_data, basic_data):
    '''
    Build the per-inspection violation summary and the per-violation flag table.

    Parameters
    -----------
    separated_data: pandas.DataFrame
        One row per inspection and one binary column per violation ('#1', '#2', ...).

    basic_data: pandas.DataFrame
        Inspection data providing the 'inspection_id', 'license', 'zip', 'risk',
        'results' and 'inspection_type' columns; rows are matched on the index.

    Returns
    --------
    violation_counts: pandas.DataFrame
        Share of critical / serious / minor violations per inspection (columns
        'critical_count', 'serious_count', 'minor_count'), the total number of
        violations ('violations_count') and the inspection metadata.

    violations_data: pandas.DataFrame
        The binary violation flags together with the inspection metadata.
    '''
    # Violation numbers belonging to each category
    critical = ["#" + str(num) for num in range(1, 15)]
    serious = ["#" + str(num) for num in range(15, 30)]
    minor = ["#" + str(num) for num in range(30, 45)] + ["#70"]
    columns = critical + serious + minor

    # Keep only the categorised violations. Numbers outside these lists (e.g. '#49')
    # are discarded, and a listed violation that never occurs gets a 0 flag
    # instead of a NaN column.
    violations_data = pd.DataFrame(separated_data).reindex(columns=columns, fill_value=0.0)

    violation_counts = pd.DataFrame({
        "critical_count": violations_data[critical].sum(axis=1),
        "serious_count": violations_data[serious].sum(axis=1),
        "minor_count": violations_data[minor].sum(axis=1),
    })

    # Attach the inspection metadata to both tables
    for column in ['inspection_id', 'license', 'zip', 'risk', 'results', 'inspection_type']:
        violation_counts[column] = basic_data[column]
        violations_data[column] = basic_data[column]

    # Total number of violations per inspection
    proportion_columns = ['critical_count', 'serious_count', 'minor_count']
    violation_counts['violations_count'] = violation_counts[proportion_columns].sum(axis=1)

    # Normalise each category count into a proportion of the total
    for column in proportion_columns:
        violation_counts[column] = violation_counts[column].divide(violation_counts['violations_count'])

    # Inspections without any violation give 0/0 = NaN; report them as 0.
    # NOTE(review): as before, this also replaces missing metadata values with 0.
    violation_counts = violation_counts.fillna(0)

    return violation_counts, violations_data

In [57]:
# Parse every 'violations' cell into binary flags (one column per violation number);
# violations that an inspection does not list come back as NaN and are set to 0.
violation_separated= data.violations.apply(violation_separator).fillna(0)

In [58]:
# Build both tables at once: the per-category proportions (violations_dataframe)
# and the per-violation binary flags (violations_dataframe02)
violations_dataframe, violations_dataframe02 = Violations_Dataframe(violation_separated,data)

violations_dataframe.head()

Unnamed: 0,critical_count,serious_count,minor_count,inspection_id,license,zip,risk,results,inspection_type,violations_count
0,0.333333,0.333333,0.333333,2345959,1803815.0,60632.0,Risk 2 (Medium),Pass w/ Conditions,Canvass,3.0
1,0.0,1.0,0.0,2345980,2570118.0,60634.0,Risk 1 (High),Fail,Complaint,1.0
2,0.0,0.0,0.0,2345921,1847417.0,60625.0,Risk 1 (High),Pass,Canvass,0.0
3,0.0,0.0,0.0,2345906,2192977.0,60666.0,Risk 1 (High),Pass,Canvass,0.0
4,0.0,0.0,0.0,2345928,2215697.0,60629.0,Risk 1 (High),Pass,Canvass,0.0


# Applied Machine Learning 

> We decided to go for a classification model, since the variables that we want to predict (type of inspection and result) take limited discrete values (e.g. Fail, Pass, Pass w/ Conditions for the variable 'results').

> We tested different classification models in order to obtain the best accuracy. First, we need to optimize the Hyper-Parameter of each model, before computing the performance of the tuned model on the test set. Since these algorithms are going to be used several times, we created functions for the respective optimization and performance testing of each model.

### Ridge Classification 

In [146]:
def Ridge_classification_optimization(X_train, y_train):
    '''
    Optimize the regularization parameter 𝛼 of the Ridge classification model.

    The features are standardized before the classifier is fitted, and each
    candidate 𝛼 is scored by its mean accuracy over a 3-fold cross-validation.

    Parameters
    -----------
    X_train, y_train: pandas.DataFrame
        Training features and target

    Returns
    --------
    best_param_val_r: float
        The 𝛼 with the highest mean cross-validated accuracy
    '''
    seed = 0

    # Standardize the features, then fit the ridge classifier
    ridge_pipe = Pipeline([
        ('standardize', StandardScaler()),
        ('ridge_clf', RidgeClassifier()),
    ])

    # random_state only has an effect together with shuffle=True; recent
    # scikit-learn versions raise an error when it is set without shuffling.
    cv_schema = KFold(n_splits = 3, shuffle = True, random_state = seed)

    # Candidate values: 10 points spaced logarithmically from 1e-4 to 1e5
    tuning_param = 'ridge_clf__alpha'
    tuning_param_range = np.logspace(-4, 5, 10)

    # Training and validation accuracy for every candidate value
    _, cv_scores_val = validation_curve(
        ridge_pipe, X_train, y_train, param_name = tuning_param, param_range = tuning_param_range,
        cv = cv_schema, scoring = 'accuracy', n_jobs = -1)

    # Keep the value with the highest mean accuracy across the folds
    best_param_val_r = tuning_param_range[np.argmax(np.mean(cv_scores_val, axis=1))]

    return best_param_val_r

In [147]:
def Ridge_classification(best_param_val_r, X_train, y_train, X_test, y_test):
    '''
    Create the model for ridge classification with the best parameter 𝛼.
    Compute the performance of the tuned model on test set.

    Parameters
    -----------
    best_param_val_r: float
        The optimal 𝛼

    X_train, y_train, X_test, y_test: pandas.DataFrame
        Training and test sets

    Returns
    -------
    None
        The test accuracy is printed.
    '''
    # The pipeline is built here: the one created during the optimization is
    # local to that function and not available in this scope.
    ridge_pipe = Pipeline([
        ('standardize', StandardScaler()),
        ('ridge_clf', RidgeClassifier(alpha = best_param_val_r)),
    ])
    ridge_pipe.fit(X_train, y_train)

    y_pred = ridge_pipe.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print ('Ridge classifer accuracy = %2.4f' %score)

### Random Forest

In [148]:
def Rf_optimization(X_train, y_train):
    '''
    Optimize the number of trees (n_estimators) of the Random Forest classification model.

    Each candidate value is scored by its mean accuracy over a 5-fold cross-validation.

    Parameters
    -----------
    X_train, y_train: pandas.DataFrame
        Training features and target

    Returns
    --------
    best_param_val_rf: int
        The n_estimators value with the highest mean cross-validated accuracy
    '''
    seed = 1

    # Seed the forest so that the selected value is reproducible between runs
    rf_clf = RandomForestClassifier(random_state = seed)

    # Candidate values: 10, 20, ..., 100 trees
    tuning_param = 'n_estimators'
    tuning_param_range = [int(i) for i in np.linspace(10, 100, 10)]

    # random_state only has an effect together with shuffle=True; recent
    # scikit-learn versions raise an error when it is set without shuffling.
    cv_schema = KFold(n_splits = 5, shuffle = True, random_state = seed)

    # Training and validation accuracy for every candidate value
    _, cv_scores_val = validation_curve(
        rf_clf, X_train, y_train, param_name = tuning_param, param_range = tuning_param_range,
        cv = cv_schema, scoring = 'accuracy', n_jobs = -1)

    # Keep the value with the highest mean accuracy across the folds
    best_param_val_rf = tuning_param_range[np.argmax(np.mean(cv_scores_val, axis=1))]

    return best_param_val_rf

In [149]:
def Rf_classification(best_param_val_rf, X_train, y_train, X_test, y_test, seed = 1):
    '''
    Create the model for Random Forest classification with the best parameter value.
    Compute the performance of the tuned model on the test set.

    Parameters
    -----------
    best_param_val_rf: int
        The best parameter value (number of trees)

    X_train, y_train, X_test, y_test: pandas.DataFrame
        Training and test sets

    seed: int, optional
        Random state of the forest, fixed so that the reported accuracy is reproducible

    Returns
    -------
    None
        The test accuracy is printed.
    '''
    # The model is built here: the pipeline created during the optimization is
    # local to that function and not available in this scope.
    rf_clf = RandomForestClassifier(n_estimators = best_param_val_rf, random_state = seed)
    rf_clf.fit(X_train, y_train)

    y_pred = rf_clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print ('rf classifer accuracy = %2.4f' %score)

### Gradient Boosted Trees

In [150]:
def GBT_optimization(X_train, y_train):
    '''
    Optimize the number of boosting rounds (n_estimators) of the Gradient Boosted
    Trees classification model.

    Each candidate value is scored by its mean accuracy over a 3-fold cross-validation.

    Parameters
    -----------
    X_train, y_train: pandas.DataFrame
        Training features and target

    Returns
    --------
    best_param_val_GBT: int
        The n_estimators value with the highest mean cross-validated accuracy
    '''
    # Defined locally: the function previously relied on a global 'seed'
    seed = 0

    # Recent XGBoost releases only accept class labels 0..n_classes-1.
    # Relabelling the classes does not change the accuracy.
    _, y_encoded = np.unique(np.asarray(y_train), return_inverse = True)

    xgb_clf = XGBClassifier()

    # random_state only has an effect together with shuffle=True; recent
    # scikit-learn versions raise an error when it is set without shuffling.
    cv_schema = KFold(n_splits = 3, shuffle = True, random_state = seed)

    # Candidate values: 10, 15, 20, 25 and 30 boosting rounds
    tuning_param = 'n_estimators'
    tuning_param_range = [int(i) for i in np.linspace(10.0, 30.0, 5)]

    # Training and validation accuracy for every candidate value
    _, cv_scores_val = validation_curve(
        xgb_clf, X_train, y_encoded, param_name = tuning_param, param_range = tuning_param_range,
        cv = cv_schema, scoring = 'accuracy', n_jobs = -1)

    # Keep the value with the highest mean accuracy across the folds
    best_param_val_GBT = tuning_param_range[np.argmax(np.mean(cv_scores_val, axis=1))]

    return best_param_val_GBT

In [151]:
def GBT_classification(best_param_val_GBT, X_train, y_train, X_test, y_test):
    '''
    Create the model for Gradient Boosted Trees classification with the best parameter value.
    Compute the performance of the tuned model on the test set.

    Parameters
    -----------
    best_param_val_GBT: int
        The best parameter value (number of boosting rounds)

    X_train, y_train, X_test, y_test: pandas.DataFrame
        Training and test sets

    Returns
    -------
    None
        The test accuracy is printed.
    '''
    # Recent XGBoost releases only accept class labels 0..n_classes-1, so the
    # labels are encoded for training and decoded again after prediction.
    classes, y_train_encoded = np.unique(np.asarray(y_train), return_inverse = True)

    optimal_xgb = XGBClassifier(n_estimators = best_param_val_GBT)
    optimal_xgb.fit(X_train, y_train_encoded)

    # Map the predicted class indices back to the original label values
    xgb_pred = classes[np.asarray(optimal_xgb.predict(X_test)).astype(int)]
    acc_xgb = accuracy_score(y_test, xgb_pred)

    print("Gradient Boosted trees model score : " + str(acc_xgb))

## 1) Predicting the inspection result

> In this section, we provide a proof of concept for a machine learning model that can predict the inspection outcome (result) for a given food facility.

### Preprocessing

> The features that we decided to use are the following; we can safely discard the rest of the columns.

In [14]:
# Target variable ('results') followed by the feature columns
restaurant_data = [
    'results',
    'inspection_type',
    'violations_count',
    'zip',
    'risk',
    'critical_count',
    'serious_count',
    'minor_count',
]

> We start by creating a dataframe containing the columns of interest.

In [85]:
# Take an independent copy: columns of df_01 are reassigned later on, which
# should not be done on a selection of violations_dataframe.
df_01 = violations_dataframe[restaurant_data].copy()
df_01.head()

Unnamed: 0,results,inspection_type,violations_count,zip,risk,critical_count,serious_count,minor_count
0,Pass w/ Conditions,Canvass,3.0,60632.0,Risk 2 (Medium),0.333333,0.333333,0.333333
1,Fail,Complaint,1.0,60634.0,Risk 1 (High),0.0,1.0,0.0
2,Pass,Canvass,0.0,60625.0,Risk 1 (High),0.0,0.0,0.0
3,Pass,Canvass,0.0,60666.0,Risk 1 (High),0.0,0.0,0.0
4,Pass,Canvass,0.0,60629.0,Risk 1 (High),0.0,0.0,0.0


> Since our final goal will be to train a linear machine learning model, all of the features we use need to be numerical. Some features like the 'inspection_type' and 'risk' are non-numeric and non-atomic. We transform these features into numerical features. As the risk feature is ordinal (from low to high), we choose to encode it accordingly instead of using one hot encoding (get_dummies). We do the same for the inspection result.

In [86]:
# Ordinal code for each risk level: the higher the risk, the larger the number
risk_mapper = {
    'Risk 1 (High)': 3,
    'Risk 2 (Medium)': 2,
    'Risk 3 (Low)': 1,
}

# Ordinal code for each inspection outcome
result_mapper = {
    'Pass': 1,
    'Pass w/ Conditions': 0,
    'Fail': -1,
}

# Swap the labels for their integer codes
df_01 = df_01.assign(
    risk=df_01['risk'].replace(risk_mapper).astype(int),
    results=df_01['results'].replace(result_mapper).astype(int),
)

df_01.head()

Unnamed: 0,results,inspection_type,violations_count,zip,risk,critical_count,serious_count,minor_count
0,0,Canvass,3.0,60632.0,2,0.333333,0.333333,0.333333
1,-1,Complaint,1.0,60634.0,3,0.0,1.0,0.0
2,1,Canvass,0.0,60625.0,3,0.0,0.0,0.0
3,1,Canvass,0.0,60666.0,3,0.0,0.0,0.0
4,1,Canvass,0.0,60629.0,3,0.0,0.0,0.0


> We encode the remaining categorical features ('inspection_type' and 'zip') with one-hot encoding, using `pd.get_dummies`.

In [87]:
# One-hot encoding of the 'zip' and 'inspection_type' features:
# each distinct value becomes its own indicator column
df_01 = pd.get_dummies(df_01, columns = ['inspection_type', 'zip'])
df_01.head()

Unnamed: 0,results,violations_count,risk,critical_count,serious_count,minor_count,inspection_type_Canvass,inspection_type_Canvass Re-Inspection,inspection_type_Complaint,inspection_type_Complaint Re-Inspection,...,zip_60654.0,zip_60655.0,zip_60656.0,zip_60657.0,zip_60659.0,zip_60660.0,zip_60661.0,zip_60666.0,zip_60707.0,zip_60827.0
0,0,3.0,2,0.333333,0.333333,0.333333,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1,1.0,3,0.0,1.0,0.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0.0,3,0.0,0.0,0.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,3,0.0,0.0,0.0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,0.0,3,0.0,0.0,0.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


> Then, we build both the target dataframe (variable to predict) and the features dataframe.

In [88]:
# Separate the variable to predict ('results') from the explanatory features,
# then preview the features
target_df_01 = df_01['results']
features_df_01 = df_01.drop(columns=['results'])
features_df_01.head()

Unnamed: 0,violations_count,risk,critical_count,serious_count,minor_count,inspection_type_Canvass,inspection_type_Canvass Re-Inspection,inspection_type_Complaint,inspection_type_Complaint Re-Inspection,inspection_type_Short Form Complaint,...,zip_60654.0,zip_60655.0,zip_60656.0,zip_60657.0,zip_60659.0,zip_60660.0,zip_60661.0,zip_60666.0,zip_60707.0,zip_60827.0
0,3.0,2,0.333333,0.333333,0.333333,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,3,0.0,1.0,0.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,3,0.0,0.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,3,0.0,0.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.0,3,0.0,0.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# Feature set without the violation-derived columns.
# Built from features_df_01: no 'features_df' variable is defined in this notebook.
features_without_v = features_df_01.drop(columns=['violations_count', 'critical_count', 'serious_count', 'minor_count'])
features_without_v.shape

(134216, 68)

> Now, we split the data randomly into a train (80%) and a test set (20%), using 1 as the random seed. 

In [98]:
# Hold out 20% of the rows as the test set; random_state makes the split reproducible
X_train_01, X_test_01, y_train_01, y_test_01 = train_test_split(features_df_01, target_df_01, test_size = 0.2, random_state = 1)

> Next, we optimize the different models according to their corresponding Hyper-parameters, and compute their accuracy on the test set.

In [109]:
# Ridge classification (target: inspection result)

# Select alpha by cross-validation on the training set only
best_param_val_r_01 = Ridge_classification_optimization(X_train_01, y_train_01)

# Refit with the selected alpha and print the accuracy on the held-out test set
Ridge_classification(best_param_val_r_01, X_train_01, y_train_01, X_test_01, y_test_01)

Ridge classifer accuracy = 0.8232


In [143]:
# Random Forest classification (target: inspection result)

# Select n_estimators by cross-validation on the training set only
best_param_val_rf_01 = Rf_optimization(X_train_01, y_train_01)

# Refit with the selected n_estimators and print the accuracy on the held-out test set
Rf_classification(best_param_val_rf_01, X_train_01, y_train_01, X_test_01, y_test_01)

rf classifer accuracy = 0.8817


In [127]:
# Gradient Boosted Trees classification (target: inspection result)

# Select n_estimators by cross-validation on the training set only
best_param_val_GBT_01 = GBT_optimization(X_train_01, y_train_01)

# Refit with the selected n_estimators and print the accuracy on the held-out test set
GBT_classification(best_param_val_GBT_01, X_train_01, y_train_01, X_test_01, y_test_01)

Gradient Boosted trees model score : 0.8881686782893756


**Conclusions**: The model with the highest accuracy (~0.89) for this set of features is the Gradient Boosted Trees. However, this accuracy is suspiciously high: the violation-related features are only known once the inspection has taken place, so they leak information about the outcome and are likely to dominate the other features.

## 2) Predicting the inspection type from the violations

> Here, we demonstrate the feasibility of a machine learning model that can predict the inspection type (e.g. suspected food poisoning) for a given food facility, given the result of the inspection and, in particular, the violations committed.

### Preprocessing


> For each inspection shown below, we retrieved and encoded with a binary value the different violations committed during that inspection.

In [133]:
# Preview the per-violation binary flags returned by Violations_Dataframe
violations_dataframe02.head()

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,...,#42,#43,#44,#70,inspection_id,license,zip,risk,results,inspection_type
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2345959,1803815.0,60632.0,Risk 2 (Medium),Pass w/ Conditions,Canvass
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2345980,2570118.0,60634.0,Risk 1 (High),Fail,Complaint
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2345921,1847417.0,60625.0,Risk 1 (High),Pass,Canvass
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2345906,2192977.0,60666.0,Risk 1 (High),Pass,Canvass
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2345928,2215697.0,60629.0,Risk 1 (High),Pass,Canvass


> As usual, we select the features and target variable from the dataframe, and turn categorical data into numerical features. 

In [134]:
# Remove the identifier columns, which are not used as features
df_02 = violations_dataframe02.drop(columns=['inspection_id', 'license'])

# Encode 'risk' and 'results' as ordinal integers
df_02 = df_02.assign(
    risk=df_02['risk'].replace(risk_mapper).astype(int),
    results=df_02['results'].replace(result_mapper).astype(int),
)

# One-hot encode the 'zip' feature
df_02 = pd.get_dummies(df_02, columns=['zip'])

df_02.head()

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,...,zip_60654.0,zip_60655.0,zip_60656.0,zip_60657.0,zip_60659.0,zip_60660.0,zip_60661.0,zip_60666.0,zip_60707.0,zip_60827.0
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


> Next, we need to transform the target categorical variable ('inspection_type') into a numerical variable in order to use it in our model.

In [135]:
# List the distinct inspection types; they are the keys needed for the numeric mapping
df_02['inspection_type'].unique()

array(['Canvass', 'Complaint', 'Canvass Re-Inspection',
       'Short Form Complaint', 'Suspected Food Poisoning',
       'Complaint Re-Inspection',
       'Suspected Food Poisoning Re-inspection'], dtype=object)

In [136]:
# Integer label for every inspection type (the target classes)
inspection_mapper = {
    'Canvass': 1,
    'Complaint': 2,
    'Canvass Re-Inspection': 3,
    'Short Form Complaint': 4,
    'Suspected Food Poisoning': 5,
    'Complaint Re-Inspection': 6,
    'Suspected Food Poisoning Re-inspection': 7,
}

# Swap the labels for their integer codes
df_02 = df_02.assign(
    inspection_type=df_02['inspection_type'].replace(inspection_mapper).astype(int)
)

df_02.head()

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,...,zip_60654.0,zip_60655.0,zip_60656.0,zip_60657.0,zip_60659.0,zip_60660.0,zip_60661.0,zip_60666.0,zip_60707.0,zip_60827.0
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [137]:
# Check the conversion: only the integer codes from inspection_mapper should remain
df_02['inspection_type'].unique()

array([1, 2, 3, 4, 5, 6, 7])

> Then, we build the features and target dataframes.

In [138]:
# Separate the variable to predict ('inspection_type') from the explanatory
# features, then preview the features
target_df_02 = df_02['inspection_type']
features_df_02 = df_02.drop(columns=['inspection_type'])
features_df_02.head()

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,...,zip_60654.0,zip_60655.0,zip_60656.0,zip_60657.0,zip_60659.0,zip_60660.0,zip_60661.0,zip_60666.0,zip_60707.0,zip_60827.0
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


> Now, we split the data randomly into a train (80%) and a test set (20%), using 1 as the random seed. Next, we optimize the different models according to their corresponding Hyper-parameters, and compute their accuracy on the test set.

In [139]:
# Hold out 20% of the rows as the test set; random_state makes the split reproducible
X_train_02, X_test_02, y_train_02, y_test_02 = train_test_split(features_df_02, target_df_02, test_size = 0.2, random_state = 1)

In [140]:
# Ridge classification (target: inspection type)

# Select alpha by cross-validation on the training set only
best_param_val_r_02 = Ridge_classification_optimization(X_train_02, y_train_02)

# Refit with the selected alpha and print the accuracy on the held-out test set
Ridge_classification(best_param_val_r_02, X_train_02, y_train_02, X_test_02, y_test_02)

Ridge classifer accuracy = 0.6149


In [141]:
# Random Forest classification (target: inspection type)

# Select n_estimators by cross-validation on the training set only
best_param_val_rf_02 = Rf_optimization(X_train_02, y_train_02)

# Refit with the selected n_estimators and print the accuracy on the held-out test set
Rf_classification(best_param_val_rf_02, X_train_02, y_train_02, X_test_02, y_test_02)

rf classifer accuracy = 0.6089


In [142]:
# Gradient Boosted Trees classification (target: inspection type)

# Select n_estimators by cross-validation on the training set only
best_param_val_GBT_02 = GBT_optimization(X_train_02, y_train_02)

# Refit with the selected n_estimators and print the accuracy on the held-out test set
GBT_classification(best_param_val_GBT_02, X_train_02, y_train_02, X_test_02, y_test_02)

Gradient Boosted trees model score : 0.6146997466845477


**Conclusions**: The classification accuracy is quite low overall. This means that our models will predict the correct inspection type (e.g. the poisoning cases) with an accuracy of ~61%. We want to do better for the future and we believe that this could be achieved by a better preprocessing (e.g. better feature selection) of the data.