In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler


def blight_model():
    """Train an MLP classifier on the blight tickets and score the test set.

    Reads ``train.csv``, ``test.csv``, ``addresses.csv`` and ``latlons.csv``
    from ``readonly/``, attaches ``lat``/``lon`` to every ticket through its
    address, drops the columns listed below, min-max scales what is left and
    fits an ``MLPClassifier`` on it.

    Returns:
        pandas.Series: named ``compliance`` and indexed by ``ticket_id``;
        each value is ``predict_proba(...)[:, 1]`` for that test ticket,
        i.e. the probability of the ``compliance == 1`` class.
    """
    # Both ticket files are decoded the same way; the original read test.csv
    # once without an encoding and once with ISO-8859-1.
    csv_encoding = 'ISO-8859-1'
    train_df = pd.read_csv("readonly/train.csv", encoding=csv_encoding)
    test_df = pd.read_csv("readonly/test.csv", encoding=csv_encoding)

    # Keep only tickets whose label is 0 or 1 (rows with a missing label drop out).
    train_df = train_df[train_df['compliance'].isin([0, 1])]

    # ticket_id -> (lat, lon), built by joining addresses with latlons on 'address'.
    address_df = pd.read_csv("readonly/addresses.csv")
    latlons_df = pd.read_csv("readonly/latlons.csv")
    geo_df = (address_df.set_index('address')
              .join(latlons_df.set_index('address'), how='left')
              .set_index('ticket_id'))
    train_df = train_df.set_index('ticket_id').join(geo_df)
    test_df = test_df.set_index('ticket_id').join(geo_df)

    # Training rows without a hearing date are discarded.
    train_df = train_df[train_df['hearing_date'].notnull()]

    # Columns that exist only in the training file.
    train_only_cols = [
        'balance_due',
        'collection_status',
        'compliance_detail',
        'payment_amount',
        'payment_date',
        'payment_status',
    ]
    # String-valued columns, removed from both frames.
    string_cols = [
        'violator_name', 'zip_code', 'country', 'city',
        'inspector_name', 'violation_street_number', 'violation_street_name',
        'violation_zip_code', 'violation_description',
        'mailing_address_str_number', 'mailing_address_str_name',
        'non_us_str_code', 'agency_name', 'state', 'disposition',
        'ticket_issued_date', 'hearing_date', 'grafitti_status', 'violation_code',
    ]
    # Non-inplace drops: the frames above are filtered selections, and
    # mutating those in place triggers pandas' chained-assignment warnings.
    train_df = train_df.drop(train_only_cols + string_cols, axis=1)
    test_df = test_df.drop(string_cols, axis=1)

    # Fill missing coordinates from the previous row. Assigning the result
    # back replaces the chained ``df.lat.fillna(method='pad', inplace=True)``
    # form, which is deprecated and does not update the frame under
    # copy-on-write. The extra bfill covers leading rows that have no
    # previous value, which would otherwise reach the classifier as NaN.
    train_df[['lat', 'lon']] = train_df[['lat', 'lon']].ffill().bfill()
    test_df[['lat', 'lon']] = test_df[['lat', 'lon']].ffill().bfill()

    # Split the label from the features.
    y_train = train_df['compliance']
    X_train = train_df.drop('compliance', axis=1)
    # Select test columns by name so both matrices share one column order.
    X_test = test_df[X_train.columns]

    # Scale features to [0, 1]; the scaler is fitted on the training data only.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Build and train the classifier.
    clf = MLPClassifier(hidden_layer_sizes=[100, 10],
                        alpha=0.001,
                        random_state=0,
                        solver='lbfgs',
                        verbose=0)
    clf.fit(X_train_scaled, y_train)

    # Second predict_proba column = probability of the larger label (1).
    y_proba = clf.predict_proba(X_test_scaled)[:, 1]

    # Attach predictions through the ticket_id index instead of re-reading
    # test.csv and relying on row position.
    return pd.Series(y_proba, index=X_test.index, name='compliance')

In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler


def _add_total_amount(frame):
    """Return *frame* with the fee columns collapsed into ``Total_amount_to_pay``."""
    fee_cols = ['fine_amount', 'admin_fee', 'state_fee', 'late_fee', 'discount_amount']
    # Plain ``+``/``-`` on purpose: a missing component makes the total NaN.
    total = (frame['fine_amount'] + frame['admin_fee'] + frame['state_fee']
             + frame['late_fee'] - frame['discount_amount'])
    return frame.drop(fee_cols, axis=1).assign(Total_amount_to_pay=total)


def _add_time_gap(frame, default_gap=73):
    """Return *frame* with both date columns replaced by ``time_gap`` in days."""
    # Missing dates are padded from the previous row before subtracting.
    hearing = pd.to_datetime(frame['hearing_date']).ffill()
    issued = pd.to_datetime(frame['ticket_issued_date']).ffill()
    # default_gap is the original author's fallback for gaps that are still
    # undefined (leading rows with no previous date); presumably a typical
    # gap in days -- TODO confirm where 73 comes from.
    gap_days = (hearing - issued).dt.days.fillna(default_gap)
    return (frame.drop(['hearing_date', 'ticket_issued_date'], axis=1)
            .assign(time_gap=gap_days))


def blight_model():
    """Fit a random-forest regressor on the blight tickets and score the test set.

    Reads ``train.csv``, ``test.csv``, ``addresses.csv`` and ``latlons.csv``
    from ``readonly/``, attaches ``lat``/``lon`` to every ticket, merges the
    fee columns into ``Total_amount_to_pay``, derives ``time_gap`` (days
    between ticket issue and hearing) and fits a ``RandomForestRegressor`` on
    the numeric columns shared by both frames.

    Returns:
        pandas.Series: named ``compliance`` and indexed by ``ticket_id``,
        holding the regressor's prediction for every test ticket.
    """
    from sklearn.ensemble import RandomForestRegressor

    # Both ticket files are decoded the same way; the original read test.csv
    # once without an encoding and once with ISO-8859-1.
    csv_encoding = 'ISO-8859-1'
    train_df = pd.read_csv("readonly/train.csv", encoding=csv_encoding)
    test_df = pd.read_csv("readonly/test.csv", encoding=csv_encoding)

    # Keep only tickets whose label is 0 or 1, then discard columns and rows
    # that contain no data at all.
    train_df = train_df[train_df['compliance'].isin([0, 1])]
    train_df = train_df.dropna(axis=1, how='all').dropna(axis=0, how='all')

    # ticket_id -> (lat, lon), built by joining addresses with latlons on 'address'.
    address_df = pd.read_csv("readonly/addresses.csv")
    latlons_df = pd.read_csv("readonly/latlons.csv")
    geo_df = (address_df.set_index('address')
              .join(latlons_df.set_index('address'), how='left')
              .set_index('ticket_id'))
    train_df = train_df.set_index('ticket_id').join(geo_df)
    test_df = test_df.set_index('ticket_id').join(geo_df)

    # Columns that exist only in the training file.
    train_only_cols = [
        'balance_due',
        'collection_status',
        'compliance_detail',
        'payment_amount',
        'payment_date',
        'payment_status',
    ]
    # Columns removed from both frames before modelling.
    unused_cols = [
        'violator_name',
        'violation_street_number', 'violation_street_name',
        'mailing_address_str_number', 'mailing_address_str_name', 'city',
        'state', 'zip_code', 'country', 'non_us_str_code', 'violation_description',
        'inspector_name', 'clean_up_cost', 'violation_code',
        'violation_zip_code',
    ]
    # errors='ignore': the all-NaN column drop above may already have removed
    # some of these from the training frame.
    train_df = train_df.drop(train_only_cols + unused_cols, axis=1, errors='ignore')
    test_df = test_df.drop(unused_cols, axis=1, errors='ignore')

    # Combine the fee columns into a single amount.
    train_df = _add_total_amount(train_df)
    test_df = _add_total_amount(test_df)

    # Training rows with missing coordinates or amount are dropped; test rows
    # must all be scored, so their coordinates fall back to the test mean.
    train_df = train_df.dropna(subset=['lat', 'lon', 'Total_amount_to_pay'])
    test_df = test_df.assign(
        lat=test_df['lat'].fillna(test_df['lat'].mean()),
        lon=test_df['lon'].fillna(test_df['lon'].mean()),
    )

    # Replace the two date columns by the issue-to-hearing gap in days.
    train_df = _add_time_gap(train_df)
    test_df = _add_time_gap(test_df)

    # Use the numeric columns present in both frames, in training-column
    # order. The original went through a set, which made the column order
    # (and therefore the fitted forest) vary between runs, and it let
    # non-numeric columns through to the regressor.
    numeric_test_cols = set(test_df.select_dtypes(include='number').columns)
    features = [col for col in train_df.select_dtypes(include='number').columns
                if col != 'compliance' and col in numeric_test_cols]

    y_train = train_df['compliance']
    X_train = train_df[features]
    X_test = test_df[features]

    reg = RandomForestRegressor(max_depth=6, random_state=0).fit(X_train, y_train)
    ypred = reg.predict(X_test)

    # Attach predictions through the ticket_id index instead of re-reading
    # test.csv and relying on row position.
    return pd.Series(ypred, index=X_test.index, name='compliance')

In [None]:
 # Scratch cell: stand-alone copy of the "time gap" step used in blight_model().
# It expects `train_df` and `test_df` to already exist in the session with
# 'hearing_date' and 'ticket_issued_date' columns, and modifies both in place.
# The statements sit at a single indentation level so the cell parses.
for frame in (train_df, test_df):
    # Parse both date columns and pad missing dates from the previous row.
    # Assigning the result back replaces the chained
    # `fillna(method='pad', inplace=True)` form, which is deprecated and does
    # not update the frame under copy-on-write.
    for date_col in ('hearing_date', 'ticket_issued_date'):
        frame[date_col] = pd.to_datetime(frame[date_col]).ffill()

    # Whole days between ticket issue and hearing; gaps that are still
    # undefined fall back to 73 (the original author's constant).
    gap = frame['hearing_date'].subtract(frame['ticket_issued_date'])
    frame['time_gap'] = gap.dt.days.fillna(73)

    # The raw date columns are no longer needed once the gap is stored.
    frame.drop(['hearing_date', 'ticket_issued_date'], axis=1, inplace=True)