In [1]:
import pandas as pd
import numpy as np
import datetime as datetime



In [2]:
# Load train and test data
train_data = pd.read_csv('train.csv', encoding = 'ISO-8859-1')
test_data = pd.read_csv('test.csv')

In [9]:
# Filter NULL values from compliance rows
train_data = train_data[(train_data['compliance'] == 0) | (train_data['compliance'] == 1)]
address = pd.read_csv('addresses.csv')

In [10]:
# Load addresses and location information
latlons = pd.read_csv('latlons.csv')
address = address.set_index('address').join(latlons.set_index('address'), how='left')


In [11]:
# Join address and location to train and test data
train_data = train_data.set_index('ticket_id').join(address.set_index('ticket_id'))
test_data = test_data.set_index('ticket_id').join(address.set_index('ticket_id'))

In [12]:
# Filter null valued hearing date rows
train_data = train_data[~train_data['hearing_date'].isnull()]


In [13]:
# Remove Non Existing Features In Test Data
train_remove_list = [
        'balance_due',
        'collection_status',
        'compliance_detail',
        'payment_amount',
        'payment_date',
        'payment_status'
    ]
train_data.drop(train_remove_list, axis=1, inplace=True)

In [15]:
# Remove String Data
string_remove_list = ['violator_name', 'zip_code', 'country', 'city',
        'inspector_name', 'violation_street_number', 'violation_street_name',
        'violation_zip_code', 'violation_description',
        'mailing_address_str_number', 'mailing_address_str_name',
        'non_us_str_code', 'agency_name', 'state', 'disposition',
        'ticket_issued_date', 'hearing_date', 'grafitti_status', 'violation_code'
    ]

train_data.drop(string_remove_list, axis=1, inplace=True)
test_data.drop(string_remove_list, axis=1, inplace=True)

In [16]:
train_data.head()

Unnamed: 0_level_0,fine_amount,admin_fee,state_fee,late_fee,discount_amount,clean_up_cost,judgment_amount,compliance,lat,lon
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
22056,250.0,20.0,10.0,25.0,0.0,0.0,305.0,0.0,42.390729,-83.124268
27586,750.0,20.0,10.0,75.0,0.0,0.0,855.0,1.0,42.326937,-83.135118
22046,250.0,20.0,10.0,25.0,0.0,0.0,305.0,0.0,42.145257,-83.208233
18738,750.0,20.0,10.0,75.0,0.0,0.0,855.0,0.0,42.433466,-83.023493
18735,100.0,20.0,10.0,10.0,0.0,0.0,140.0,0.0,42.388641,-83.037858


In [17]:
# Fill NA Lat Lon Values
train_data.lat.fillna(method='pad', inplace=True)
train_data.lon.fillna(method='pad', inplace=True)
test_data.lat.fillna(method='pad', inplace=True)
test_data.lon.fillna(method='pad', inplace=True)


In [18]:
# Select target value as y train and remove it from x train
y_train = train_data.compliance
X_train = train_data.drop('compliance', axis=1)

In [19]:
# Do nothing with test data and select as x test, we don't have y_test
X_test = test_data

In [22]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
# Scale Features To Reduce Computation Time
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
    
# Build And Train Classifier Model
clf = MLPClassifier(hidden_layer_sizes = [10, 10],
                        alpha=0.01,
                        random_state = 0, 
                        solver='lbfgs', 
                        verbose=0)
clf.fit(X_train_scaled, y_train)
test_proba = clf.predict_proba(X_test_scaled)
result = test_proba[:,1]


In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Grid Search For Finding Optimal Values

grid_values = {'alpha': [0.001], 'hidden_layer_sizes': [[100, 10], [150, 10]]}
grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc')
grid_clf_auc.fit(X_train_scaled, y_train)
print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)
print('Grid best score (AUC): ', grid_clf_auc.best_score_)

Grid best parameter (max. AUC):  {'alpha': 0.001, 'hidden_layer_sizes': [150, 10]}
Grid best score (AUC):  0.7398522513099535


In [24]:
from sklearn.model_selection import cross_val_score
# accuracy is the default scoring metric
print('Cross-validation (accuracy)', cross_val_score(clf, X_train, y_train, cv=5))

Cross-validation (accuracy) [0.70740033 0.8345495  0.79020388 0.70663952 0.72483558]
