In [5]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn import preprocessing

def blight_model():
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    
    #reading train dataset
    df_train = pd.read_csv("train.csv", encoding="ISO-8859-1", low_memory=False)
    df_test = pd.read_csv("test.csv", encoding="ISO-8859-1", low_memory=False)
    df_addresses = pd.read_csv("addresses.csv", encoding="ISO-8859-1", low_memory=False)
    df_latlons = pd.read_csv("latlons.csv", encoding="ISO-8859-1", low_memory=False)
    
    #getting the common features into a list
    commons_test = list(set(df_train.columns).intersection(df_test.columns))
    commons_train = commons_test + ['compliance']

    df_train = df_train[commons_train]
    df_test = df_test[commons_test]
        
    #join 'addresses' and 'latlons' on 'address' column
    df_joined = df_addresses.set_index('address').join(df_latlons.set_index('address'),
                                                       how='outer')
    
    #join 'joined' on 'train' and 'test', on 'ticket_id' as index
    df_train = df_train.set_index('ticket_id').join(df_joined.set_index('ticket_id'),
                                                   how='inner')
    df_test = df_test.set_index('ticket_id').join(df_joined.set_index('ticket_id'),
                                                   how='inner')
    
    #cleaning datasets  
    df_train.dropna(subset=['compliance'], how='any', inplace=True)
    
    df_train.dropna(subset=['compliance'], inplace=True)
    df_train['compliance'] = df_train['compliance'].astype('int32')
    
    df_train[['compliance']] = df_train[['compliance']].astype('float64')

    #check for full nulls column
    #for col in df_test.columns:
    #    print(col,'\t', df_train[col].isnull().sum())
    #    print(col,'\t', df_test[col].isnull().sum())
    
    #transform date columns delta into float, 'hearing_date' and 'payment_date'

    df_train['hearing_date'] = pd.to_datetime(df_train['hearing_date'])
    df_train['ticket_issued_date'] = pd.to_datetime(df_train['ticket_issued_date'])
    df_train['days'] = df_train['hearing_date'] - df_train['ticket_issued_date']
    df_train.days = (df_train.days.dt.days).astype('float64')
    df_train.drop(['ticket_issued_date', 'hearing_date'], axis=1, inplace=True)
    
    #for test data too
    df_test['hearing_date'] = pd.to_datetime(df_test['hearing_date'])
    df_test['ticket_issued_date'] = pd.to_datetime(df_test['ticket_issued_date'])
    df_test['days'] = df_test['hearing_date'] - df_test['ticket_issued_date']
    df_test.days = (df_test.days.dt.days).astype('float64')
    df_test.drop(['ticket_issued_date','hearing_date'], axis=1, inplace=True)
    
    to_drop = ['admin_fee', 'state_fee', 'violation_zip_code',
              'grafitti_status', 'non_us_str_code', 'inspector_name',
              'violator_name', 'country', 'discount_amount', 'violation_street_number',
              'mailing_address_str_number']
    
    df_train.drop(to_drop, axis=1, inplace=True)
    df_test.drop(to_drop, axis=1, inplace=True)
    
    #drop NaN values in train dataset and replace NaNs in test dataset
    df_train.dropna(inplace=True)
    df_test.fillna(value=0, inplace=True)
    
    #getting non numerical columns
    objs = []
    for col in df_test.columns:
        if df_test[col].dtype == 'object':
            objs.append(col)
            
    #for col in df_test.columns:
    #    print(col,'\t', df_train[col].isnull().sum())
    #    print(col,'\t', df_test[col].isnull().sum())
    
    for col in df_train.columns:
        print(df_train[col].dtype)
    
    print(df_train.head())
    
    #convert categorical variable into integers
    
    """
    LE = preprocessing.LabelEncoder()
    
    for obj in objs:
        df_train[obj] = LE.fit_transform(df_train[obj])
        df_test[obj] = LE.fit_transform(df_test[obj])
    
    #print('Train skew:', '\n', df_train.skew(), '\n')
    #print('Train columns:', '\n', df_train.columns, '\n')
    #print('Test skew:', '\n', df_test.skew())
    #print('Test columns:', '\n', df_test.columns, '\n')
    
    
    for col in df_train.columns:
        df_train[col] = df_train[col].astype('float64')
    
    for col in df_test.columns:
        df_test[col] = df_test[col].astype('float64')
    """    
    #reorganizing train cols
    train_cols = ['city', 
                  'clean_up_cost', 
                  'violation_description',
                  'judgment_amount',
                  'mailing_address_str_name',
                  'zip_code', 
                  'fine_amount', 
                  'agency_name',
                  'disposition',
                  'violation_code', 
                  'late_fee', 
                  'state',
                  'violation_street_name', 
                  'lat',
                  'lon', 
                  'days', 
                  'compliance']
    
    df_train = df_train[train_cols]
    #print(df_train.columns)
    
    """
    from scipy import stats
    for col in df_train.columns[0:len(df_train.columns)-1]:
        df_train[col] = stats.boxcox(df_train[col] + 1 - np.min(df_train[col]))[0]
    
    for col in df_test.columns:
        df_test[col] = stats.boxcox(df_test[col] + 1 - np.min(df_test[col]))[0]
        
    #print('De-skewed train: ', df_train.skew())
    #print('De-skewed test: ', df_test.skew())
    
    
    
    #compute feature importance using ExtraTreeClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    
    forest = ExtraTreesClassifier(n_estimators=50)
    feats_cols = df_train.columns[0:len(df_train.columns)-1]
    X_selection = df_train[feats_cols]
    y_selection = df_train['compliance']
    forest.fit(X_selection, y_selection)
    importances = forest.feature_importances_
    new_cols = df_train.columns[np.argsort(importances)[::-1]]   #the order of most important features
    #print(new_cols)
    """
    
    for i in range(len(new_cols) - 1):
        feats = new_cols[:i+1]
        X = df_train[feats]
        #print(X.columns)
        #X = df.loc[:, df.columns != 'compliance']
        #X.set_index('ticket_id', inplace=True)
        y = df_train['compliance']
        #print(X.head())
        cols_test = df_test.columns
        X_testF = df_test[cols_test]
        #X_testF.set_index('ticket_id', inplace=True)
        #print(X_testF.head())
        
        #splitting X, y
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

        #normalizing
        from sklearn.preprocessing import MinMaxScaler

        scaler = MinMaxScaler()
        X_train_scaler = scaler.fit_transform(X_train)
        X_test_scaler = scaler.fit_transform(X_test)
        X_test_scalerF = scaler.fit_transform(X_testF)

        #print(X_train_scaler.shape)
        #print(X_test_scaler.shape)
        #print(X_test_scalerF.shape)

        #set up the GradientBoostingClassifier
        from sklearn.ensemble import GradientBoostingClassifier
        from sklearn.metrics import roc_curve, auc
        from sklearn.linear_model import LogisticRegression
        from sklearn.naive_bayes import GaussianNB


        clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
            max_depth=6, random_state=0).fit(X_train_scaler, y_train)


        #clf = LogisticRegression(C=2.0, 
                                 #class_weight='balanced',
                                 #random_state=0, 
                                 #solver='lbfgs', 
                                 #multi_class='ovr'
                                #)

        #clf = GaussianNB()
    

        """
        #GridSearchCV optimization
        #from sklearn.grid_search import GridSearchCV
        from sklearn.model_selection import GridSearchCV

        for i in range(0,5):
            randomly_sampled = df.ix[np.random.choice(df.index, 1000)]


        gb_grid_params = {'max_depth': [4, 6, 8, 10, 12, 14, 16],
                          'min_samples_split': [200, 400, 500, 700, 1000],
                  }

        gbc_grid = GridSearchCV(gbc,
                                gb_grid_params,
                                cv=5,
                                scoring='roc_auc', 
                                n_jobs=-1)


        #gbc_grid.fit(X_train_scaler, y_train)
        #print('Grid best AUC', gbc_grid.best_params_)
        #print('Best AUC is', gbc_grid.best_score)
        """

    
        y_score_clf = clf.fit(X_train_scaler, y_train).predict(X_test_scaler)
        fpr_clf, tpr_clf, _ = roc_curve(y_test, y_score_clf)
        roc_auc_clf = auc(fpr_clf, tpr_clf)
        print('AUC score is:', roc_auc_clf)
        
    #toreturn = clf.predict_proba(X_test_scalerF)
    #print(toreturn[:10,1])
    #answer = pd.Series(data=toreturn[:,1], index=df_test.index, dtype='float32', name=None, copy=False, fastpath=False)
    #answer.index = answer.index.astype('int32')
    #print(answer.head())
    #print(df_test.index)
    #print(toreturn)
    
    
    return (clf.score(X_train_scaler, y_train), clf.score(X_test_scaler, y_test))
    #return (gbc.score(X_train_scaler, y_train), gbc.score(X_test_scaler, y))
    #return 'none'
    #return answer

In [6]:
blight_model()

float64
object
object
object
object
float64
object
object
float64
object
object
float64
object
float64
float64
float64
float64
           fine_amount state violation_code violation_street_name  \
ticket_id                                                           
22056            250.0    IL      9-1-36(a)                 TYLER   
27586            750.0    MI     61-63.0600               CENTRAL   
22046            250.0    CA      9-1-36(a)            NORTHFIELD   
18738            750.0    MI     61-63.0500             BRENTWOOD   
18735            100.0    MI     61-63.0100            MT ELLIOTT   

                                              agency_name  judgment_amount  \
ticket_id                                                                    
22056      Buildings, Safety Engineering & Env Department            305.0   
27586      Buildings, Safety Engineering & Env Department            855.0   
22046      Buildings, Safety Engineering & Env Department            305.0   

NameError: name 'new_cols' is not defined