In [2]:
# System modules
import sys
import random
import time

# Data Analysis and Modeling modules
import sklearn
import pandas as pd
import numpy as np
import scipy as sp

# modeling algorithms
from sklearn import (svm, 
                     tree, 
                     linear_model, 
                     neighbors, 
                     naive_bayes, 
                     ensemble, 
                     discriminant_analysis, 
                     gaussian_process)

# import xgboost

# helper methods
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics


In [4]:
# Read the pre-processed train and test tables from the local ./data folder.
train_data_raw, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/processed_train_data.csv',
                     './data/processed_test_data.csv')
)

# Keep the raw frame untouched and do all further work on a deep copy
# (DataFrame.copy is deep by default).
train_data = train_data_raw.copy()

# Bundle of both frames; presumably meant for clean-up steps applied to each
# of them -- it is not used in the visible cells.
data_cleaner = [train_data, test_data]

In [3]:
# Summarise the working frame: column names, non-null counts and dtypes.
# DataFrame.info() writes the report itself and returns None, so it is called
# directly -- wrapping it in print() appends a stray "None" line to the output.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 38 columns):
Unnamed: 0        878049 non-null int64
Dates             878049 non-null object
Category          878049 non-null object
Descript          878049 non-null object
DayOfWeek         878049 non-null object
PdDistrict        878049 non-null object
Resolution        878049 non-null object
Address           878049 non-null object
X                 878049 non-null float64
Y                 878049 non-null float64
Datetime_Dates    878049 non-null object
Workhour          878049 non-null int64
isHoliday         878049 non-null int64
Month             878049 non-null int64
isSummer          878049 non-null int64
isWinter          878049 non-null int64
isAutumn          878049 non-null int64
isSpring          878049 non-null int64
BAYVIEW           878049 non-null int64
CENTRAL           878049 non-null int64
INGLESIDE         878049 non-null int64
MISSION           878049 non-null int

In [5]:
# One-hot encode the crime category; the indicator columns serve as the
# multi-column target selected via targets_dummy further down.
category_dummies = pd.get_dummies(train_data['Category'])

# Append only the indicator columns that are not in the frame yet, so running
# this cell a second time does not create duplicate column labels (which would
# make train_data[<category name>] return several columns).
new_dummy_columns = [name for name in category_dummies.columns
                     if name not in train_data.columns]
train_data = pd.concat([train_data, category_dummies[new_dummy_columns]], axis=1)

# Rebind the bundle so it points at the widened training frame.
data_cleaner = [train_data, test_data]

In [59]:
# Integer-encode the crime category and shift the codes by one, so they start
# at 1 rather than at LabelEncoder's 0.
label = LabelEncoder()
train_data['category_Code'] = label.fit_transform(train_data['Category']) + 1

In [63]:
# Sanity check: show the largest and then the smallest category code.
unique_codes = train_data['category_Code'].unique()
for extreme in (np.max, np.min):
    print(extreme(unique_codes))

39
1


In [64]:
# Integer-encode the street address with its own encoder. Refitting the shared
# `label` encoder here would overwrite the Category mapping held in
# label.classes_, so category codes could no longer be decoded back to names.
address_label = LabelEncoder()
train_data['Address_Code'] = address_label.fit_transform(train_data['Address'])

In [65]:
# Names of the model input columns, in this order: police-district indicator
# columns, season flags, calendar/location fields, day-of-week indicators.
season_flags = ['isSummer', 'isAutumn', 'isWinter', 'isSpring']
calendar_and_location = ['Workhour', 'isHoliday', 'Hour', 'Year', 'Month', 'Day', 'X', 'Y', 'Address_Code']

features = train_data['PdDistrict'].unique()
for extra_names in (season_flags, calendar_and_location, train_data['DayOfWeek'].unique()):
    features = np.append(features, extra_names)
print(features)

# Names of the one-hot category columns (one per distinct crime category).
targets_dummy = train_data['Category'].unique()

['NORTHERN' 'PARK' 'INGLESIDE' 'BAYVIEW' 'RICHMOND' 'CENTRAL' 'TARAVAL'
 'TENDERLOIN' 'MISSION' 'SOUTHERN' 'isSummer' 'isAutumn' 'isWinter'
 'isSpring' 'Workhour' 'isHoliday' 'Hour' 'Year' 'Month' 'Day' 'X' 'Y'
 'Address_Code' 'Wednesday' 'Tuesday' 'Monday' 'Sunday' 'Saturday'
 'Friday' 'Thursday']


In [66]:
# Number of input columns, then number of target categories (one per line).
print(len(features), len(targets_dummy), sep='\n')

30
39


In [67]:
# Work on a random 100,000-row subset of the training frame (presumably to
# keep model fitting tractable). random_state pins the draw so the subset, and
# every score derived from it, is the same after a kernel restart; 0 matches
# the seed already used for train_test_split in this notebook.
small_train_data = train_data.sample(100000, random_state=0)

In [68]:
# Split the sampled rows twice with the same seed: once against the integer
# category code and once against the one-hot category columns. The inputs and
# seed are identical, so both calls partition the rows the same way.
sample_X = small_train_data[features]
split_options = {'random_state': 0}

train_X, test_X, train_y, test_y = model_selection.train_test_split(
    sample_X, small_train_data['category_Code'], **split_options)

train_X_dummy, test_X_dummy, train_y_dummy, test_y_dummy = model_selection.train_test_split(
    sample_X, small_train_data[targets_dummy], **split_options)

In [69]:
# Report the size of the sampled frame and of each split, one line per label.
for caption, shapes in (
    ('Total shape:', (small_train_data.shape,)),
    ('Train shape:', (train_X.shape, train_y.shape)),
    ('Test shape:', (test_X.shape, test_y.shape)),
):
    print(caption, *shapes)

Total shape: (100000, 79)
Train shape: (75000, 30) (75000,)
Test shape: (25000, 30) (25000,)


In [13]:
# Peek at the first five training rows (plain-text rendering).
print(train_X.head(n=5))

        NORTHERN  PARK  INGLESIDE  BAYVIEW  RICHMOND  CENTRAL  TARAVAL  \
36984          0     0          0        0         0        0        0   
10221          0     0          0        0         0        1        0   
637200         0     0          1        0         0        0        0   
220119         0     0          0        0         0        0        0   
822156         0     0          1        0         0        0        0   

        TENDERLOIN  MISSION  SOUTHERN    ...              X          Y  \
36984            1        0         0    ...    -122.409524  37.785760   
10221            0        0         0    ...    -122.416793  37.796019   
637200           0        0         0    ...    -122.409328  37.722801   
220119           0        0         1    ...    -122.390136  37.789481   
822156           0        0         0    ...    -122.414583  37.708936   

        Address_Code  Wednesday  Tuesday  Monday  Sunday  Saturday  Friday  \
36984           5967          0 

# Model Training

In [17]:
# Candidate classifiers to compare, all with default hyper-parameters.
# Every estimator that draws random numbers gets random_state=0 (the seed used
# elsewhere in this notebook) so the accuracy table is reproducible from run
# to run. Commented-out entries are estimators that are currently disabled.
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(random_state=0),
    ensemble.BaggingClassifier(random_state=0),
    ensemble.ExtraTreesClassifier(random_state=0),
#     ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(random_state=0),

    #Gaussian Processes
#     gaussian_process.GaussianProcessClassifier(),
    
    #GLM
#     linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(random_state=0),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(random_state=0),
    linear_model.Perceptron(random_state=0),
    
    #Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    #SVM
#     svm.SVC(probability=True),
#     svm.NuSVC(probability=True),
#     svm.LinearSVC(),
    
    #Trees    
    tree.DecisionTreeClassifier(random_state=0),
    tree.ExtraTreeClassifier(random_state=0),
    
    #Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
#     discriminant_analysis.QuadraticDiscriminantAnalysis(),
]

In [70]:
# Empty results table for the model comparison: one row per estimator, filled
# in by the fitting loop that follows.
MLA_columns = [
    'MLA Name',
    'MLA Parameters',
    'MLA Train Accuracy Mean',
    'MLA Test Accuracy Mean',
    # 'MLA Test Accuracy 3*STD' and 'MLA Time' are not collected.
]
MLA_compare = pd.DataFrame(columns=MLA_columns)

In [18]:
# Fit every candidate on the training split and log its accuracy on both
# splits into MLA_compare, one table row per estimator.
separator = "-" * 50
row_index = 0
for alg in MLA:
    # Label the row with the estimator's class name and a string dump of its
    # hyper-parameters before fitting starts.
    MLA_name = type(alg).__name__
    print("ALG:", MLA_name)
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())

    alg.fit(train_X, train_y)
    print("done fitting")

    # Store both scores first, then echo them back from the table.
    score_inputs = (
        ('MLA Train Accuracy Mean', train_X, train_y),
        ('MLA Test Accuracy Mean', test_X, test_y),
    )
    for column, X_part, y_part in score_inputs:
        MLA_compare.loc[row_index, column] = alg.score(X_part, y_part)

    print("train acc:", MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'])
    print("test acc:", MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'])

    row_index += 1
    print(separator)

ALG: AdaBoostClassifier
done fitting
train acc: 0.21114666666666668
test acc: 0.21292
--------------------------------------------------
ALG: BaggingClassifier
done fitting
train acc: 0.9692933333333333
test acc: 0.22768
--------------------------------------------------
ALG: ExtraTreesClassifier
done fitting
train acc: 0.9822533333333333
test acc: 0.1966
--------------------------------------------------
ALG: RandomForestClassifier
done fitting
train acc: 0.9709066666666667
test acc: 0.2186
--------------------------------------------------
ALG: PassiveAggressiveClassifier
done fitting
train acc: 0.10825333333333333
test acc: 0.10924
--------------------------------------------------
ALG: RidgeClassifierCV
done fitting
train acc: 0.2294
test acc: 0.2306
--------------------------------------------------
ALG: SGDClassifier
done fitting
train acc: 0.19312
test acc: 0.19652
--------------------------------------------------
ALG: Perceptron
done fitting
train acc: 0.019413333333333335
tes



done fitting
train acc: 0.23110666666666665
test acc: 0.23284
--------------------------------------------------




In [19]:
# Rank the estimators by held-out accuracy, best first, and show the table.
MLA_compare = MLA_compare.sort_values(by='MLA Test Accuracy Mean', ascending=False)
print(MLA_compare)

                         MLA Name  \
13     LinearDiscriminantAnalysis   
5               RidgeClassifierCV   
1               BaggingClassifier   
8                     BernoulliNB   
3          RandomForestClassifier   
0              AdaBoostClassifier   
2            ExtraTreesClassifier   
6                   SGDClassifier   
10           KNeighborsClassifier   
11         DecisionTreeClassifier   
12            ExtraTreeClassifier   
9                      GaussianNB   
4     PassiveAggressiveClassifier   
7                      Perceptron   
14  QuadraticDiscriminantAnalysis   

                                       MLA Parameters MLA Train Accuracy Mean  \
13  {'n_components': None, 'priors': None, 'shrink...                0.231107   
5   {'alphas': (0.1, 1.0, 10.0), 'class_weight': N...                  0.2294   
1   {'base_estimator': None, 'bootstrap': True, 'b...                0.969293   
8   {'alpha': 1.0, 'binarize': 0.0, 'class_prior':...                0.221773   
3 

In [22]:
# Ten shuffled partitions, each using 60% of the rows for training and 30%
# for testing, with a fixed seed.
# NOTE(review): cv_split is not passed to the GridSearchCV call in the visible
# cells -- confirm whether it was meant to be used there.
cv_split = model_selection.ShuffleSplit(
    n_splits=10,
    test_size=0.3,
    train_size=0.6,
    random_state=0,
)

# Hyper-parameter grid for the tree-based search.
# Disabled candidates kept for reference:
#   'splitter': ['best', 'random']              -- splitting strategy; default is best
#   'min_samples_split': [2, 5, 10, .03, .05]   -- minimum subset size BEFORE a split; default is 2
#   'min_samples_leaf': [1, 5, 10, .03, .05]    -- minimum subset size AFTER a split; default is 1
#   'max_features': [None, 'auto']              -- features considered per split
param_grid = dict(
    criterion=['gini', 'entropy'],       # split-quality measure; default is gini
    max_depth=[2, 4, 6, 8, 10, None],    # maximum tree depth; default is None (unlimited)
    random_state=[0],                    # fixed seed for the estimator's random number generator
)

In [85]:
# Exhaustive search over param_grid for a random forest, using the searcher's
# default cross-validation and scoring, fitted on the training split.
tune_model = model_selection.GridSearchCV(
    estimator=ensemble.RandomForestClassifier(),
    param_grid=param_grid,
)
tune_model.fit(X=train_X, y=train_y)



GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [2, 4, 6, 8, 10, None], 'random_state': [0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [86]:
# Score of the tuned model on the training split, then on the held-out split.
for template, X_part, y_part in (
    ("Training Score: (after) {:.2f}", train_X, train_y),
    ("Testing Score: (after) {:.2f}", test_X, test_y),
):
    print(template.format(tune_model.score(X_part, y_part)))

Training Score: (after) 0.29
Testing Score: (after) 0.25


# Log Loss / score

In [89]:
# Class-probability predictions for every row of the full training frame.
# NOTE(review): train_data includes the sampled rows the model was fitted on,
# so metrics computed from these predictions are not a held-out estimate.
full_train_features = train_data[features]
predictions = tune_model.predict_proba(full_train_features)


In [90]:
# Multi-class log loss of the predicted probabilities against the integer
# category codes of the same rows.
full_train_log_loss = metrics.log_loss(y_true=train_data['category_Code'], y_pred=predictions)
print(full_train_log_loss)

2.5078220703707275
