## Assignment 4 - Understanding and Predicting Property Maintenance Fines

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier

def blight_model():
    
    train_df = pd.read_csv('train.csv', encoding = 'ISO-8859-1')
    test_df = pd.read_csv('test.csv')
    address_df =  pd.read_csv('addresses.csv')
    latlons_df = pd.read_csv('latlons.csv')
    add_lalo_df = address_df.set_index('address').join(latlons_df.set_index('address'), how='left').dropna().reset_index(drop=False)
    
    # merge train and test df (index = ticked_id)
    train_df = pd.merge(train_df, add_lalo_df, on='ticket_id').set_index('ticket_id')
    test_df = pd.merge(test_df, add_lalo_df, on='ticket_id',how='left').set_index('ticket_id')
    test_df['lat'].fillna(method='pad', inplace=True)
    test_df['lon'].fillna(method='pad', inplace=True)
    
    # where compliance 0 or 1
    train_df = train_df[(train_df['compliance'] == 0) | (train_df['compliance'] == 1)]
    train_df['compliance'] = train_df['compliance'].astype(int)
   
    # Calculate time difference
    from datetime import datetime
    def time_gap(hearing_date_str, ticket_issued_date_str):
        if not hearing_date_str or type(hearing_date_str)!=str: return 73
        hearing_date = datetime.strptime(hearing_date_str, "%Y-%m-%d %H:%M:%S")
        ticket_issued_date = datetime.strptime(ticket_issued_date_str, "%Y-%m-%d %H:%M:%S")
        gap = hearing_date - ticket_issued_date
        return gap.days
    train_df['time_gap'] = train_df.apply(lambda row: time_gap(row['hearing_date'], row['ticket_issued_date']), axis=1)
    test_df['time_gap'] = test_df.apply(lambda row: time_gap(row['hearing_date'], row['ticket_issued_date']), axis=1)
   
    # Reomove train only columns
    trainOnly_columns = [
        'payment_amount', 
        'payment_date', 
        'payment_status', 
        'balance_due',
        'collection_status', 
        'compliance_detail'
    ]
    train_df.drop(trainOnly_columns, axis=1, inplace=True)
   
    # Feature to train
    feature_columns = [
        'agency_name',
        'violation_street_name',
        'state',
        'violation_code',
        'late_fee',
        'fine_amount',
        'discount_amount',
        'judgment_amount',
        'lat',
        'lon',
        'time_gap'
    ]

    convert_columns = {
        'agency_name':'category',
        'violation_street_name':'category',
        'state':'category',
        'violation_code':'category',
        'disposition':'category'
    }
   
    # convert category to cast a pandas object to a specified dtype
    for df in [train_df, test_df]:
        for col, col_type in convert_columns.items():
            if col in df:
                if col_type == 'category':
                    df[col] = df[col].astype(col_type)
                   
    # Convert cetegory to integers
    target_columns = train_df.select_dtypes(['category']).columns
    for df in [train_df, test_df]:
        df[target_columns] = df[target_columns].apply(lambda x: x.cat.codes)

    # Spilt training and testing data
    X = train_df[feature_columns].copy()
    y = train_df['compliance']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
   

    test = test_df[feature_columns].copy()
   
    # Train the model
    GBC = GradientBoostingClassifier(learning_rate = 0.1, max_depth = 5, random_state = 0).fit(X_train, y_train)
   
    y_score_GBC = GBC.decision_function(X_test)
    fpr_GBC, tpr_GBC, _ = roc_curve(y_test, y_score_GBC)
    roc_auc_GBC  = auc(fpr_GBC, tpr_GBC)
    accuracy_GBC = GBC.score(X_test, y_test)
    print("accuracy = {:.4f}   AUC = {:.4f}".format(accuracy_GBC,  roc_auc_GBC))
   
    y_proba = GBC.predict_proba(test)[:,1]
    test['compliance'] = y_proba
   
    return test['compliance']

In [6]:
blight_model()

  if self.run_code(code, result):


accuracy = 0.9345   AUC = 0.7943


ticket_id
284932    0.085164
285362    0.017509
285361    0.071615
285338    0.081694
285346    0.093148
285345    0.084259
285347    0.094187
285342    0.358078
285530    0.031653
284989    0.028976
285344    0.101199
285343    0.026523
285340    0.027697
285341    0.105321
285349    0.088618
285348    0.080667
284991    0.029739
285532    0.030230
285406    0.025680
285001    0.066820
285006    0.049086
285405    0.019442
285337    0.027127
285496    0.073535
285497    0.061857
285378    0.018378
285589    0.025918
285585    0.059788
285501    0.074994
285581    0.018948
            ...   
376367    0.033359
376366    0.045163
376362    0.048706
376363    0.066754
376365    0.033359
376364    0.045163
376228    0.069807
376265    0.037796
376286    0.164842
376320    0.049111
376314    0.044557
376327    0.369626
376385    0.336621
376435    0.362578
376370    0.361890
376434    0.060415
376459    0.065290
376478    0.014239
376473    0.051495
376484    0.052418
376482    0.017099
37