In [85]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

def blight_model():
    
    train_df=pd.read_csv('/content/drive/MyDrive/Dataset/Blight/train.csv',  encoding='cp1252')
    test_df=pd.read_csv('/content/drive/MyDrive/Dataset/Blight/test.csv',  encoding='cp1252')
    df_a=pd.read_csv('/content/drive/MyDrive/Dataset/Blight/addresses.csv',  encoding='cp1252')
    df_g=pd.read_csv('/content/drive/MyDrive/Dataset/Blight/latlons.csv',  encoding='cp1252')

    train_df = train_df[(train_df['compliance'] == 1) | (train_df['compliance'] == 0)]
  
    d = train_df.groupby(['inspector_name', 'compliance']).size()
    data = d[:,1] / (d[:,0] + d[:,1])
    inspector = pd.DataFrame(data = data)
    inspector.fillna(0, inplace=True)
    inspector.rename(columns = {0:'inspector_effciency'}, inplace = True)
    train_df = train_df.set_index('inspector_name').join(inspector)
    test_df = test_df.set_index('inspector_name').join(inspector) 

    train_df = train_df.reset_index()
    test_df = test_df.reset_index()

    d = train_df.groupby(['agency_name', 'compliance']).size()
    data = d[:,1] / (d[:,0] + d[:,1])
    agency = pd.DataFrame(data = data)
    agency.fillna(0, inplace=True)
    agency.rename(columns = {0:'agency_effciency'}, inplace = True)
    train_df = train_df.set_index('agency_name').join(agency)
    test_df = test_df.set_index('agency_name').join(agency)

    train_df = train_df.reset_index()
    test_df = test_df.reset_index()

    address= df_a.set_index('address').join(df_g.set_index('address'), how='left')
    train_df = train_df.set_index('ticket_id').join(address.set_index('ticket_id'))
    test_df = test_df.set_index('ticket_id').join(address.set_index('ticket_id'))


    columns_to_drop_train = ['violator_name','inspector_name','agency_name',
       'violation_street_number', 'violation_street_name',
       'violation_zip_code', 'mailing_address_str_number',
       'mailing_address_str_name', 'city', 'state','non_us_str_code','country', 'ticket_issued_date',
       'violation_description','payment_amount','violation_code','balance_due','grafitti_status',
       'payment_date', 'payment_status', 'collection_status', 'compliance_detail','fine_amount', 'admin_fee', 'state_fee', 'late_fee',
       'discount_amount', 'clean_up_cost', 'disposition','hearing_date','zip_code',
       ]

    columns_to_drop_test = ['violator_name','inspector_name','agency_name',
       'violation_street_number', 'violation_street_name',
       'violation_zip_code', 'mailing_address_str_number','violation_code','grafitti_status',
       'mailing_address_str_name', 'city', 'state','non_us_str_code','country', 'ticket_issued_date', 
       'violation_description', 'fine_amount', 'admin_fee', 'state_fee', 'late_fee','zip_code',
       'discount_amount', 'clean_up_cost','disposition', 'hearing_date']

    
    train_data =  train_df.drop(columns_to_drop_train, axis= 1)
    test_data =  test_df.drop(columns_to_drop_test, axis= 1)

    test_data.lat.fillna(method='pad', inplace=True)
    test_data.lon.fillna(method='pad', inplace=True)
    test_data.inspector_effciency.fillna(0, inplace=True)

    train_data.lat.fillna(method='pad', inplace=True)
    train_data.lon.fillna(method='pad', inplace=True)
 
    y = train_data['compliance']
    train_data.drop('compliance', axis =1, inplace=True)

    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(train_data)
    X_eval_scaled = scaler.transform(test_data)
    
    X_train, X_test, y_train, y_test = train_test_split(X_train_scaled, y, test_size = 0.25, random_state = 42)
    
    params= {'learning_rate': [0.3, 1, 3], 'n_estimators':[50], 'max_depth':[3, 5, 8]}
    clf = GradientBoostingClassifier(random_state=0)
    gscv = GridSearchCV(estimator=clf, param_grid=params, scoring='roc_auc', cv=3, n_jobs=-1)
    gscv.fit(X_train, y_train)


    y_eval = gscv.predict_proba(X_eval_scaled)[:,1]
    predictions = pd.Series(y_eval, index = test_df.index)

    return predictions

In [86]:
blight_model()

  """Entry point for launching an IPython kernel.


ticket_id
285001    0.038109
285006    0.024533
286295    0.070909
286290    0.072708
286291    0.039111
            ...   
291280    0.826937
291279    0.826937
291277    0.826937
293244    0.090316
294214    0.116263
Length: 61001, dtype: float64