# The problem and the dataset for this notebook are taken from the "Detroit Blight Ticket Compliance" competition on Kaggle, which is also available on Coursera. (More details can be found in the PDF file in the same folder as this Jupyter notebook.)

# Importing the necessary packages/libraries

In [1]:
import warnings
warnings.filterwarnings(action='once')
import pandas as pd
import numpy as np
import datetime
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Defining functions for converting a date string to a Python `datetime.date` object, finding the day of the year for the ticket issue date, and finding the gap between the ticket issue date and the hearing date. Functions for looking up the latitude and longitude from a ticket id are also defined.

In [2]:
def split_string(x):
    """Return the first whitespace-separated token of a string.

    Values that are not ``str`` instances are handed back unchanged.
    """
    if isinstance(x, str):
        tokens = x.split()
        return tokens[0]
    return x
def date_time_object(x):
    """Build a ``datetime.date`` from a dash-separated date string.

    The first three '-'-separated fields are read as year, month and day,
    in that order. Values that are not ``str`` instances are handed back
    unchanged.
    """
    if isinstance(x, str):
        fields = x.split('-')
        return datetime.date(int(fields[0]), int(fields[1]), int(fields[2]))
    return x
def find_day(x):
    """Return the 1-based day of the year for the date ``x`` (1 January -> 1)."""
    year_start = datetime.date(x.year, 1, 1)
    elapsed = x - year_start
    return elapsed.days + 1
def find_gap(x):
    """Return the number of days from ticket issue to hearing for one record.

    ``x`` is indexed by the keys 'hearing_date' and 'ticket_issued_date'.
    When the hearing date is not a ``datetime.date`` instance, NaN is returned.
    """
    hearing = x['hearing_date']
    if isinstance(hearing, datetime.date):
        return (hearing - x['ticket_issued_date']).days
    return np.nan
def find_lat(x):
    """Return element 0 (the latitude) of the coordinate pair for ticket id ``x``.

    The ticket id is first mapped to an address through the notebook-level
    ``addresses_dict``; that address is then looked up in ``latlons_dict``.
    """
    address = addresses_dict[x]
    coordinates = latlons_dict[address]
    return coordinates[0]
def find_lon(x):
    """Return element 1 (the longitude) of the coordinate pair for ticket id ``x``.

    The ticket id is first mapped to an address through the notebook-level
    ``addresses_dict``; that address is then looked up in ``latlons_dict``.
    """
    address = addresses_dict[x]
    coordinates = latlons_dict[address]
    return coordinates[1]

  and should_run_async(code)


# Choosing the relevant columns from provided dataset, dropping any rows containing infinity or NA and partitioning dataset into feature vectors and labels.

In [3]:
# Load the raw training data (the file is read with the cp1252 encoding)
train_df = pd.read_csv('train.csv', encoding='cp1252')
# Only certain columns of the available training dataset are retained: those that provide information
# likely to generalize. Columns that are missing in the test data must be discarded in the training data too.
retained_columns = [
    'ticket_id', 'agency_name', 'city', 'state', 'country', 'ticket_issued_date', 'hearing_date',
    'violation_code', 'disposition', 'fine_amount', 'admin_fee', 'state_fee', 'late_fee', 'discount_amount',
    'clean_up_cost', 'judgment_amount', 'compliance',
]
# Select the retained columns and drop every row that contains an NA value
train_df = train_df[retained_columns].dropna()
# Keep only the rows whose label is exactly 0 or 1
has_binary_label = train_df['compliance'].eq(0) | train_df['compliance'].eq(1)
train_df = train_df[has_binary_label]
# Partitioning the data into feature vectors and labels
X_train = train_df.drop(columns='compliance')
y_train = train_df['compliance']

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Obtaining new features using the provided features in dataset and obtaining one-hot vectors for columns of category type.

In [4]:
# Collapse the listed spelling variants of Detroit into the single form 'Detroit'
detroit_aliases = ['detroit', 'det', 'Det', 'DETROIT', 'DEt', 'Det.']
X_train['city'] = X_train['city'].replace(dict.fromkeys(detroit_aliases, 'Detroit'))

# Keep the first whitespace-separated token of each issue timestamp and convert it to a datetime.date
X_train['ticket_issued_date'] = X_train['ticket_issued_date'].apply(
    lambda stamp: date_time_object(stamp.split()[0])
)
# Day of the year on which the ticket was issued
X_train['ticket_issued_day'] = X_train['ticket_issued_date'].apply(find_day)
# Same conversion for the hearing date; split_string/date_time_object pass non-string values through unchanged
X_train['hearing_date'] = X_train['hearing_date'].apply(
    lambda stamp: date_time_object(split_string(stamp))
)
# Number of days between the ticket issue date and the hearing date (computed row by row)
X_train['gap_days'] = X_train.apply(find_gap, axis=1)

# Indicator (1.0 / 0.0): is the city in the mailing address the same as the location of the violation site?
X_train['city_flag'] = X_train['city'].eq('Detroit').astype(float)
# Indicator (1.0 / 0.0): is the state in the mailing address the same as the location of the violation site?
X_train['state_flag'] = X_train['state'].eq('MI').astype(float)
# Indicator (1.0 / 0.0): is the country in the mailing address the same as the location of the violation site?
X_train['country_flag'] = X_train['country'].eq('USA').astype(float)

# Build the lookup tables ticket_id -> address and address -> (lat, lon)
addresses_df = pd.read_csv('addresses.csv')
latlons_df = pd.read_csv('latlons.csv')
latlons_df['latlon'] = latlons_df.apply(lambda row: (row['lat'], row['lon']), axis=1)
addresses_dict = dict(zip(addresses_df['ticket_id'], addresses_df['address']))
latlons_dict = dict(zip(latlons_df['address'], latlons_df['latlon']))
# Latitude and longitude of the violation site, looked up through the ticket id
X_train['lat'] = X_train['ticket_id'].apply(find_lat)
X_train['lon'] = X_train['ticket_id'].apply(find_lon)

# Violation codes whose relative frequency is below 1% are replaced by 'Other'
code_share = X_train['violation_code'].map(X_train['violation_code'].value_counts(normalize=True))
X_train['violation_code'] = X_train['violation_code'].mask(code_share < 0.01, 'Other')

# Restrict the frame to the columns that are used as model inputs
model_columns = [
    'agency_name', 'violation_code', 'disposition', 'fine_amount', 'late_fee', 'discount_amount',
    'judgment_amount', 'ticket_issued_day', 'gap_days', 'city_flag', 'state_flag', 'country_flag',
    'lat', 'lon',
]
X_train = X_train[model_columns]
# One-hot encode the categorical columns
X_train = pd.get_dummies(X_train, columns=['agency_name', 'violation_code', 'disposition'])

# Re-attach the labels (inner join on the index) so features and labels are filtered together
X_y_train = pd.concat([X_train, y_train], axis=1, join='inner')
# Turn infinite values into NA, then drop every row that contains an NA
X_y_train = X_y_train.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
y_train = X_y_train['compliance']
X_train = X_y_train.drop(columns=['compliance'])

  and should_run_async(code)


# Training a random forest classifier and optimizing hyper-parameters using grid search. Area under ROC curve is used as evaluation metric.

In [5]:
# Hyper-parameter grid explored by the search: number of trees and maximum tree depth
parameters = {'n_estimators': [100, 150], 'max_depth': [4, 5]}
# A fixed random_state makes the fitted forests, and therefore the reported score, reproducible
# from one "Restart & Run All" to the next (an unseeded forest gives a slightly different AUC every run).
clf_dummy = RandomForestClassifier(random_state=0)
# Performing a grid search to find optimum parameters for random forest classifier. Area under ROC curve is used as metric.
clf = GridSearchCV(clf_dummy, parameters, scoring='roc_auc')

# The first n_cv_rows rows are used for the cross-validated grid search; the remaining rows are held
# out and only used for the final score.
# NOTE(review): the row count is hard-coded; consider deriving it from len(X_train).
n_cv_rows = 143610
# .iloc makes the positional (row-number based) slicing explicit for both the frame and the label series
X_cv = X_train.iloc[:n_cv_rows]
X_test = X_train.iloc[n_cv_rows:]
y_cv = y_train.iloc[:n_cv_rows]
y_test = y_train.iloc[n_cv_rows:]
# Fail fast, before the expensive fit, if the hard-coded split leaves nothing to evaluate on
assert len(X_test) > 0, "hold-out set is empty: X_train has no rows beyond n_cv_rows"

clf.fit(X_cv, y_cv)
# predict_proba(...)[:, 1] is the predicted probability of the positive class (compliance == 1)
print("Area under ROC curve is", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

  and should_run_async(code)


Area under ROC curve is 0.8113132510880079
