## SF-crime Kaggle competition

#### This is an attempt to build a standard tabular data model using Random Forest on Kaggle data

In [1]:
# Let's start by some standard code
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from IPython.display import display

from sklearn import metrics
from graphviz import *

In [6]:
PATH = "data/sf-crime/"

In [7]:
!ls {PATH}

result_v1.csv        result_version1.csv  sampleSubmission.csv
result_v2.csv        result_version2.csv  test.csv
result_v3.csv        result_version3.csv  train.csv


### 1. Let's load and explore the data

In [8]:
df_raw=pd.read_csv(f'{PATH}train.csv')

In [9]:
df_raw.columns

Index(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y'],
      dtype='object')

In [48]:
df_raw=pd.read_csv(f'{PATH}train.csv',parse_dates=['Dates'])

In [49]:
df_raw.shape

(878049, 9)

In [50]:
display(df_raw.head())

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [51]:
display(df_raw.isnull().sum().sort_index()/len(df_raw))

Address       0.0
Category      0.0
Dates         0.0
DayOfWeek     0.0
Descript      0.0
PdDistrict    0.0
Resolution    0.0
X             0.0
Y             0.0
dtype: float64

### What we see in the data
1. One date field. So the date formatting and feature extraction is required.
2. A few categorical variables. We will have to convert these into numbers.
3. No missing values.

### Let's explore the response variable (y)

In [52]:
df_raw['Category'].unique()

array(['WARRANTS', 'OTHER OFFENSES', 'LARCENY/THEFT', 'VEHICLE THEFT', 'VANDALISM', 'NON-CRIMINAL',
       'ROBBERY', 'ASSAULT', 'WEAPON LAWS', 'BURGLARY', 'SUSPICIOUS OCC', 'DRUNKENNESS',
       'FORGERY/COUNTERFEITING', 'DRUG/NARCOTIC', 'STOLEN PROPERTY', 'SECONDARY CODES', 'TRESPASS',
       'MISSING PERSON', 'FRAUD', 'KIDNAPPING', 'RUNAWAY', 'DRIVING UNDER THE INFLUENCE',
       'SEX OFFENSES FORCIBLE', 'PROSTITUTION', 'DISORDERLY CONDUCT', 'ARSON', 'FAMILY OFFENSES',
       'LIQUOR LAWS', 'BRIBERY', 'EMBEZZLEMENT', 'SUICIDE', 'LOITERING', 'SEX OFFENSES NON FORCIBLE',
       'EXTORTION', 'GAMBLING', 'BAD CHECKS', 'TREA', 'RECOVERED VEHICLE', 'PORNOGRAPHY/OBSCENE MAT'],
      dtype=object)

In [53]:
df_raw.groupby('Category').count().sort_values('Dates')

Unnamed: 0_level_0,Dates,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TREA,6,6,6,6,6,6,6,6
PORNOGRAPHY/OBSCENE MAT,22,22,22,22,22,22,22,22
GAMBLING,146,146,146,146,146,146,146,146
SEX OFFENSES NON FORCIBLE,148,148,148,148,148,148,148,148
EXTORTION,256,256,256,256,256,256,256,256
BRIBERY,289,289,289,289,289,289,289,289
BAD CHECKS,406,406,406,406,406,406,406,406
FAMILY OFFENSES,491,491,491,491,491,491,491,491
SUICIDE,508,508,508,508,508,508,508,508
EMBEZZLEMENT,1166,1166,1166,1166,1166,1166,1166,1166


Some categories seem very rare to occur. 

In [54]:
# The 0.1% of the data would be:
len(df_raw)*0.001

878.049

We could try to build the model by eliminating category count less than 878 for now. Later we can revisit these.

In [55]:
df_raw.groupby('Category').count().sort_values('Dates').head(10)

Unnamed: 0_level_0,Dates,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TREA,6,6,6,6,6,6,6,6
PORNOGRAPHY/OBSCENE MAT,22,22,22,22,22,22,22,22
GAMBLING,146,146,146,146,146,146,146,146
SEX OFFENSES NON FORCIBLE,148,148,148,148,148,148,148,148
EXTORTION,256,256,256,256,256,256,256,256
BRIBERY,289,289,289,289,289,289,289,289
BAD CHECKS,406,406,406,406,406,406,406,406
FAMILY OFFENSES,491,491,491,491,491,491,491,491
SUICIDE,508,508,508,508,508,508,508,508
EMBEZZLEMENT,1166,1166,1166,1166,1166,1166,1166,1166


In [56]:
## Skip this step
#rare_cats = set(['SUICIDE','FAMILY OFFENSES', 'BAD CHECKS', 'BRIBERY', 'EXTORTION',
#       'SEX OFFENSES NON FORCIBLE', 'GAMBLING', 'PORNOGRAPHY/OBSCENE MAT',
#       'TREA'])
#all_cats = set(df_raw['Category'].unique())
#common_cats = all_cats-rare_cats
#df_raw = df_raw[df_raw['Category'].isin(common_cats)]
#train = df_raw.reset_index(drop = True)

In [57]:
# Old len was 878049, let's check the new len without rare_cats
len(df_raw)

878049

In [58]:
df_raw["Category"].unique()

array(['WARRANTS', 'OTHER OFFENSES', 'LARCENY/THEFT', 'VEHICLE THEFT', 'VANDALISM', 'NON-CRIMINAL',
       'ROBBERY', 'ASSAULT', 'WEAPON LAWS', 'BURGLARY', 'SUSPICIOUS OCC', 'DRUNKENNESS',
       'FORGERY/COUNTERFEITING', 'DRUG/NARCOTIC', 'STOLEN PROPERTY', 'SECONDARY CODES', 'TRESPASS',
       'MISSING PERSON', 'FRAUD', 'KIDNAPPING', 'RUNAWAY', 'DRIVING UNDER THE INFLUENCE',
       'SEX OFFENSES FORCIBLE', 'PROSTITUTION', 'DISORDERLY CONDUCT', 'ARSON', 'FAMILY OFFENSES',
       'LIQUOR LAWS', 'BRIBERY', 'EMBEZZLEMENT', 'SUICIDE', 'LOITERING', 'SEX OFFENSES NON FORCIBLE',
       'EXTORTION', 'GAMBLING', 'BAD CHECKS', 'TREA', 'RECOVERED VEHICLE', 'PORNOGRAPHY/OBSCENE MAT'],
      dtype=object)

In [59]:
len(df_raw["Category"].unique())

39

### 2. Format the data

In [60]:
df_raw.dtypes

Dates         datetime64[ns]
Category              object
Descript              object
DayOfWeek             object
PdDistrict            object
Resolution            object
Address               object
X                    float64
Y                    float64
dtype: object

We would not see the codes in the actual dataframe but we can check using the following code.

## Feature Engineering

### Feature engineering on date-time

In [61]:
# You should always consider this feature extraction step when working with date-time.
add_datepart(df_raw,fldname='Dates',time=True)
df_raw.head()

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,DatesYear,DatesMonth,...,DatesIs_month_end,DatesIs_month_start,DatesIs_quarter_end,DatesIs_quarter_start,DatesIs_year_end,DatesIs_year_start,DatesHour,DatesMinute,DatesSecond,DatesElapsed
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,...,False,False,False,False,False,False,23,53,0,1431561180
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,...,False,False,False,False,False,False,23,53,0,1431561180
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015,5,...,False,False,False,False,False,False,23,33,0,1431559980
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,5,...,False,False,False,False,False,False,23,30,0,1431559800
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,5,...,False,False,False,False,False,False,23,30,0,1431559800


### Feature Engineering on X, Y

In [62]:
# Lets' created scaled transform of the X and Y coordinates
xy_scaler = StandardScaler()
xy_scaler.fit(df_raw.loc[:,['X','Y']])
xy_scaled = xy_scaler.transform(df_raw.loc[:,['X','Y']])
type(xy_scaled), np.shape(xy_scaled), xy_scaled

(numpy.ndarray, (878049, 2), array([[-0.1079 ,  0.00783],
        [-0.1079 ,  0.00783],
        [-0.05754,  0.06433],
        ...,
        [ 0.6334 ,  0.02024],
        [ 1.05704,  0.02098],
        [ 0.91227, -0.07181]]))

In [63]:
# PCA on X and Y, select top 2 components
xy_pca = PCA(n_components=2, whiten=True).fit_transform(xy_scaled)
type(xy_pca), np.shape(xy_pca), xy_pca

(numpy.ndarray, (878049, 2), array([[-0.05667,  0.12328],
        [-0.05667,  0.12328],
        [ 0.00385,  0.12982],
        ...,
        [ 0.37013, -0.65315],
        [ 0.61044, -1.10361],
        [ 0.47592, -1.04824]]))

In [64]:
# Rotation
rot45_X = .707* xy_scaled[:,1] + .707* xy_scaled[:,0] 
rot45_Y = .707* xy_scaled[:,1] - .707* xy_scaled[:,0]

rot30_X = (1.732/2)* xy_scaled[:,0] + (1./2)* xy_scaled[:,1]
rot30_Y = (1.732/2)* xy_scaled[:,1] - (1./2)* xy_scaled[:,0]

rot60_X = (1./2)* xy_scaled[:,0] + (1.732/2)* xy_scaled[:,1] 
rot60_Y = (1./2)* xy_scaled[:,1] - (1.732/2)* xy_scaled[:,0]

len(rot45_X),len(rot60_Y)

(878049, 878049)

The first feature that we engineer is obtained by standardizing and pca-ing latitude and longitude

In [65]:
# Polar coordinates
radial_r = np.sqrt( np.power(xy_scaled[:,1],2) + np.power(xy_scaled[:,0],2) )
radial_phi = np.arctan2(xy_scaled[:,1], xy_scaled[:,0])

#### Let's add these to the main df

In [66]:
df_raw_backup=df_raw
xy_scaled_df=pd.DataFrame(xy_scaled)
xy_pca_df=pd.DataFrame(xy_pca)

In [67]:
df_raw['xy_scaled_1']    = xy_scaled_df[0]
df_raw['xy_scaled_2']    = xy_scaled_df[1]
df_raw['xy_pca_1']       = xy_pca_df[0]
df_raw['xy_pca_2']       = xy_pca_df[1]
df_raw['xy_x_rotate_30'] = rot30_X
df_raw['xy_x_rotate_45'] = rot45_X
df_raw['xy_x_rotate_60'] = rot60_X
df_raw['xy_y_rotate_30'] = rot30_Y
df_raw['xy_y_rotate_45'] = rot45_Y
df_raw['xy_y_rotate_60'] = rot60_Y
df_raw['xy_radial_r']    = radial_r
df_raw['xy_radial_phi']  = radial_phi

In [68]:
df_raw.head()

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,DatesYear,DatesMonth,...,xy_pca_1,xy_pca_2,xy_x_rotate_30,xy_x_rotate_45,xy_x_rotate_60,xy_y_rotate_30,xy_y_rotate_45,xy_y_rotate_60,xy_radial_r,xy_radial_phi
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,...,-0.056666,0.12328,-0.089527,-0.07075,-0.047169,0.060733,0.081824,0.097359,0.108186,3.069137
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,...,-0.056666,0.12328,-0.089527,-0.07075,-0.047169,0.060733,0.081824,0.097359,0.108186,3.069137
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015,5,...,0.003847,0.129822,-0.017663,0.004803,0.026943,0.084484,0.086166,0.081998,0.086313,2.300507
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,5,...,-0.044692,0.223266,-0.092262,-0.055799,-0.015549,0.128713,0.148187,0.1576,0.158368,2.716321
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,5,...,-0.300101,0.566956,-0.459373,-0.37469,-0.264569,0.266543,0.376302,0.460513,0.531113,3.139446


### Feature Engineering on address

In [69]:
df_raw['Address'].head(20)

0                OAK ST / LAGUNA ST
1                OAK ST / LAGUNA ST
2         VANNESS AV / GREENWICH ST
3          1500 Block of LOMBARD ST
4         100 Block of BRODERICK ST
5               0 Block of TEDDY AV
6               AVALON AV / PERU AV
7          KIRKWOOD AV / DONAHUE ST
8              600 Block of 47TH AV
9     JEFFERSON ST / LEAVENWORTH ST
10    JEFFERSON ST / LEAVENWORTH ST
11            0 Block of ESCOLTA WY
12               TURK ST / JONES ST
13           FILLMORE ST / GEARY BL
14         200 Block of WILLIAMS AV
15            0 Block of MENDELL ST
16               EDDY ST / JONES ST
17           GODEUS ST / MISSION ST
18           MENDELL ST / HUDSON AV
19            100 Block of JONES ST
Name: Address, dtype: object

In [70]:
# Create a feature for street intersection
df_raw['ad_contains_intersection']=df_raw['Address'].str.contains('/')
df_raw[['ad_contains_intersection','Address']].tail()

Unnamed: 0,ad_contains_intersection,Address
878044,True,FARALLONES ST / CAPITOL AV
878045,False,600 Block of EDNA ST
878046,True,5TH ST / FOLSOM ST
878047,True,TOWNSEND ST / 2ND ST
878048,False,1800 Block of NEWCOMB AV


In [71]:
# Create a feature for block or no-block in the address
add_upper=df_raw['Address'].str.upper()
df_raw['ad_contains_block']=add_upper.str.contains('BLOCK')
df_raw[['ad_contains_block','Address']].head()

Unnamed: 0,ad_contains_block,Address
0,False,OAK ST / LAGUNA ST
1,False,OAK ST / LAGUNA ST
2,False,VANNESS AV / GREENWICH ST
3,True,1500 Block of LOMBARD ST
4,True,100 Block of BRODERICK ST


In [72]:
# Create a feature to capture the data before and after the '/'
add_p1, add_p2 = df_raw['Address'].str.split('/', 1).str

add_p1=add_p1.str.rstrip()
df_raw['add_p1']=add_p1.str.lstrip()

add_p2=add_p2.str.rstrip()
df_raw['add_p2']=add_p2.str.lstrip()

# Check if the split went well
df_raw[['Address','add_p1','add_p2']].head(15)

Unnamed: 0,Address,add_p1,add_p2
0,OAK ST / LAGUNA ST,OAK ST,LAGUNA ST
1,OAK ST / LAGUNA ST,OAK ST,LAGUNA ST
2,VANNESS AV / GREENWICH ST,VANNESS AV,GREENWICH ST
3,1500 Block of LOMBARD ST,1500 Block of LOMBARD ST,
4,100 Block of BRODERICK ST,100 Block of BRODERICK ST,
5,0 Block of TEDDY AV,0 Block of TEDDY AV,
6,AVALON AV / PERU AV,AVALON AV,PERU AV
7,KIRKWOOD AV / DONAHUE ST,KIRKWOOD AV,DONAHUE ST
8,600 Block of 47TH AV,600 Block of 47TH AV,
9,JEFFERSON ST / LEAVENWORTH ST,JEFFERSON ST,LEAVENWORTH ST


In [73]:
# Type of the first part of address (ST, AV BL etc)
df_raw['add_p1_type']=df_raw['add_p1'].str[-2:]
np.unique(df_raw['add_p1_type'])

array(['80', 'AL', 'AR', 'AV', 'AY', 'BL', 'CR', 'CT', 'DR', 'ER', 'EX', 'HY', 'LN', 'MS', 'NO', 'PL', 'PZ',
       'RD', 'RK', 'RW', 'ST', 'TI', 'TR', 'WK', 'WY'], dtype=object)

In [33]:
# Type of the second part of address (ST, AV BL etc)
# Need to do a vectorized version of this.

#add_p2_type = []

#for i in range(1,len(add_p2)-1):
#    print(i)
#    try:
#        val=add_intersection[i]
#    except:
#        continue
#            
#    if val==True:        
#        try:
#            add_p2_type.append(add_p2.str[-2:])
#        except:
#            add_p2_type.append('None')
#    else:
#        add_p2_type.append('None')

### Creating df_train 

In [74]:
# Let's confirm the shapes 
df_raw.shape

(878049, 41)

In [75]:
df_raw.head()

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,DatesYear,DatesMonth,...,xy_y_rotate_30,xy_y_rotate_45,xy_y_rotate_60,xy_radial_r,xy_radial_phi,ad_contains_intersection,ad_contains_block,add_p1,add_p2,add_p1_type
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,...,0.060733,0.081824,0.097359,0.108186,3.069137,True,False,OAK ST,LAGUNA ST,ST
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,...,0.060733,0.081824,0.097359,0.108186,3.069137,True,False,OAK ST,LAGUNA ST,ST
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015,5,...,0.084484,0.086166,0.081998,0.086313,2.300507,True,False,VANNESS AV,GREENWICH ST,AV
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,5,...,0.128713,0.148187,0.1576,0.158368,2.716321,False,True,1500 Block of LOMBARD ST,,ST
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,5,...,0.266543,0.376302,0.460513,0.531113,3.139446,False,True,100 Block of BRODERICK ST,,ST


In [76]:
df_raw.tail()

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,DatesYear,DatesMonth,...,xy_y_rotate_30,xy_y_rotate_45,xy_y_rotate_60,xy_radial_r,xy_radial_phi,ad_contains_intersection,ad_contains_block,add_p1,add_p2,add_p1_type
878044,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056,2003,1,...,0.491903,0.760074,0.976642,1.206208,-3.038045,True,False,FARALLONES ST,CAPITOL AV,ST
878045,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948,2003,1,...,0.333587,0.51595,0.663284,0.819765,-3.037082,False,True,600 Block of EDNA ST,,ST
878046,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.40339,37.780266,2003,1,...,-0.299178,-0.43351,-0.53841,0.633727,0.031936,True,False,5TH ST,FOLSOM ST,ST
878047,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607,2003,1,...,-0.51035,-0.732494,-0.904908,1.057251,0.019848,True,False,TOWNSEND ST,2ND ST,ST
878048,FORGERY/COUNTERFEITING,"CHECKS, FORGERY (FELONY)",Monday,BAYVIEW,NONE,1800 Block of NEWCOMB AV,-122.394926,37.738212,2003,1,...,-0.518322,-0.695745,-0.825931,0.915093,-0.078552,False,True,1800 Block of NEWCOMB AV,,AV


In [77]:
df_raw.columns

Index(['Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution',
       'Address', 'X', 'Y', 'DatesYear', 'DatesMonth', 'DatesWeek', 'DatesDay',
       'DatesDayofweek', 'DatesDayofyear', 'DatesIs_month_end',
       'DatesIs_month_start', 'DatesIs_quarter_end', 'DatesIs_quarter_start',
       'DatesIs_year_end', 'DatesIs_year_start', 'DatesHour', 'DatesMinute',
       'DatesSecond', 'DatesElapsed', 'xy_scaled_1', 'xy_scaled_2', 'xy_pca_1',
       'xy_pca_2', 'xy_x_rotate_30', 'xy_x_rotate_45', 'xy_x_rotate_60',
       'xy_y_rotate_30', 'xy_y_rotate_45', 'xy_y_rotate_60', 'xy_radial_r',
       'xy_radial_phi', 'ad_contains_intersection', 'ad_contains_block',
       'add_p1', 'add_p2', 'add_p1_type'],
      dtype='object')

In [78]:
df_raw[['DayOfWeek','DatesDayofweek']].head()

Unnamed: 0,DayOfWeek,DatesDayofweek
0,Wednesday,2
1,Wednesday,2
2,Wednesday,2
3,Wednesday,2
4,Wednesday,2


Let's delete the duplicate column.

In [79]:
df_raw.drop('DayOfWeek',axis=1,inplace=True)

Also, the "Descript" and "Resolution" columns are part of only training dataset and not the target variable. So let's drop that as well.

In [80]:
df_raw.drop(['Descript','Resolution'],axis=1,inplace=True)

In [81]:
df_raw.dtypes

Category                     object
PdDistrict                   object
Address                      object
X                           float64
Y                           float64
DatesYear                     int64
DatesMonth                    int64
DatesWeek                     int64
DatesDay                      int64
DatesDayofweek                int64
DatesDayofyear                int64
DatesIs_month_end              bool
DatesIs_month_start            bool
DatesIs_quarter_end            bool
DatesIs_quarter_start          bool
DatesIs_year_end               bool
DatesIs_year_start             bool
DatesHour                     int64
DatesMinute                   int64
DatesSecond                   int64
DatesElapsed                  int64
xy_scaled_1                 float64
xy_scaled_2                 float64
xy_pca_1                    float64
xy_pca_2                    float64
xy_x_rotate_30              float64
xy_x_rotate_45              float64
xy_x_rotate_60              

Perhaps no explicit datatype conversions are required into a category type.

In [82]:
#df_raw.PdDistrict.cat

In [83]:
train_cats(df_raw)

In [84]:
df_raw.PdDistrict.cat.codes.head()
# Before calling "train_cats" we got error for this, but now as we can see the codes are ready.

0    4
1    4
2    4
3    4
4    5
dtype: int8

In [85]:
# Now although we have generated codes for Categorical variables we havn't used them in the orginal dataframe
df_raw.head()

Unnamed: 0,Category,PdDistrict,Address,X,Y,DatesYear,DatesMonth,DatesWeek,DatesDay,DatesDayofweek,...,xy_y_rotate_30,xy_y_rotate_45,xy_y_rotate_60,xy_radial_r,xy_radial_phi,ad_contains_intersection,ad_contains_block,add_p1,add_p2,add_p1_type
0,WARRANTS,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,20,13,2,...,0.060733,0.081824,0.097359,0.108186,3.069137,True,False,OAK ST,LAGUNA ST,ST
1,OTHER OFFENSES,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,20,13,2,...,0.060733,0.081824,0.097359,0.108186,3.069137,True,False,OAK ST,LAGUNA ST,ST
2,OTHER OFFENSES,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015,5,20,13,2,...,0.084484,0.086166,0.081998,0.086313,2.300507,True,False,VANNESS AV,GREENWICH ST,AV
3,LARCENY/THEFT,NORTHERN,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,5,20,13,2,...,0.128713,0.148187,0.1576,0.158368,2.716321,False,True,1500 Block of LOMBARD ST,,ST
4,LARCENY/THEFT,PARK,100 Block of BRODERICK ST,-122.438738,37.771541,2015,5,20,13,2,...,0.266543,0.376302,0.460513,0.531113,3.139446,False,True,100 Block of BRODERICK ST,,ST


In [87]:
## Skip this step
rare_cats = set(['SUICIDE','FAMILY OFFENSES', 'BAD CHECKS', 'BRIBERY', 'EXTORTION',
       'SEX OFFENSES NON FORCIBLE', 'GAMBLING', 'PORNOGRAPHY/OBSCENE MAT',
       'TREA'])
all_cats = set(df_raw['Category'].unique())
common_cats = all_cats-rare_cats
#df_raw = df_raw[df_raw['Category'].isin(common_cats)]
#train = df_raw.reset_index(drop = True)

In [91]:
common_cat_idx = df_raw['Category'].isin(common_cats)

In [92]:
# We can add fix_missing as one of the parameter. 
# It works only on numeric, as pandas automatically takes care of categorical variable missing values 
# by setting them to -1.
df_x, y, nas = proc_df(df_raw,'Category')

In [93]:
# Now it should all be numeric or boolean
df_x.head()

Unnamed: 0,PdDistrict,Address,X,Y,DatesYear,DatesMonth,DatesWeek,DatesDay,DatesDayofweek,DatesDayofyear,...,xy_y_rotate_30,xy_y_rotate_45,xy_y_rotate_60,xy_radial_r,xy_radial_phi,ad_contains_intersection,ad_contains_block,add_p1,add_p2,add_p1_type
0,5,19791,-122.425892,37.774599,2015,5,20,13,2,133,...,0.060733,0.081824,0.097359,0.108186,3.069137,True,False,12045,883,21
1,5,19791,-122.425892,37.774599,2015,5,20,13,2,133,...,0.060733,0.081824,0.097359,0.108186,3.069137,True,False,12045,883,21
2,5,22698,-122.424363,37.800414,2015,5,20,13,2,133,...,0.084484,0.086166,0.081998,0.086313,2.300507,True,False,12461,693,4
3,5,4267,-122.426995,37.800873,2015,5,20,13,2,133,...,0.128713,0.148187,0.1576,0.158368,2.716321,False,True,4131,0,21
4,6,1844,-122.438738,37.771541,2015,5,20,13,2,133,...,0.266543,0.376302,0.460513,0.531113,3.139446,False,True,1844,0,21


In [94]:
y[:10]

array([37, 21, 21, 16, 16, 16, 36, 36, 16, 16], dtype=int8)

In [95]:
np.unique(y)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
       25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38], dtype=int8)

In [96]:
# Let's check the encoding details
len(df_x),len(df_raw)

(878049, 878049)

In [103]:
# Remove less common categories
train = df_x[common_cat_idx]
y_new = y[common_cat_idx]

In [104]:
len(train),len(y_new)

(875777, 875777)

In [108]:
np.unique(y_new)

array([ 0,  1,  4,  5,  6,  7,  8,  9, 12, 13, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 30, 32,
       34, 35, 36, 37, 38], dtype=int8)

In [105]:
df_x.columns,df_raw.columns

(Index(['PdDistrict', 'Address', 'X', 'Y', 'DatesYear', 'DatesMonth',
        'DatesWeek', 'DatesDay', 'DatesDayofweek', 'DatesDayofyear',
        'DatesIs_month_end', 'DatesIs_month_start', 'DatesIs_quarter_end',
        'DatesIs_quarter_start', 'DatesIs_year_end', 'DatesIs_year_start',
        'DatesHour', 'DatesMinute', 'DatesSecond', 'DatesElapsed',
        'xy_scaled_1', 'xy_scaled_2', 'xy_pca_1', 'xy_pca_2', 'xy_x_rotate_30',
        'xy_x_rotate_45', 'xy_x_rotate_60', 'xy_y_rotate_30', 'xy_y_rotate_45',
        'xy_y_rotate_60', 'xy_radial_r', 'xy_radial_phi',
        'ad_contains_intersection', 'ad_contains_block', 'add_p1', 'add_p2',
        'add_p1_type'],
       dtype='object'),
 Index(['Category', 'PdDistrict', 'Address', 'X', 'Y', 'DatesYear',
        'DatesMonth', 'DatesWeek', 'DatesDay', 'DatesDayofweek',
        'DatesDayofyear', 'DatesIs_month_end', 'DatesIs_month_start',
        'DatesIs_quarter_end', 'DatesIs_quarter_start', 'DatesIs_year_end',
        'DatesIs_year_

In [52]:
#df_x.drop(['xy_pca_1_na','xy_pca_2_na'],axis=1,inplace=True)

### 3. Build the model

In [106]:
# n_jobs=-1 helps us to run the jobs in parallel (one per CPU core)
m = RandomForestClassifier(n_jobs=-1)
# Build a forest of trees from the training set (X, y).
m.fit(df_x, y)
# The score() Returns the mean accuracy on the given test data and labels.
m.score(df_x,y)

0.847852454703553

In [107]:
# Let's try with the data with no rare categories
m = RandomForestClassifier(n_jobs=-1)
# Build a forest of trees from the training set (X, y).
m.fit(train, y_new)
# The score() Returns the mean accuracy on the given test data and labels.
m.score(train,y_new)

0.8485950190516536

This model we have is purely on training data, so the high score is not necessarily a good sign.

### Let's load the test data and process it.

In [109]:
df_raw_test = pd.read_csv(f'{PATH}test.csv',date_parser='Dates')

In [110]:
df_raw_test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [111]:
add_datepart(df=df_raw_test,fldname='Dates',time=True)

#### Let's add the X, Y features

In [112]:
# Lets' created scaled transform of the X and Y coordinates
xy_scaler = StandardScaler()
xy_scaler.fit(df_raw_test.loc[:,['X','Y']])
xy_scaled = xy_scaler.transform(df_raw_test.loc[:,['X','Y']])

# PCA on X and Y, select top 2 components
xy_pca = PCA(n_components=2, whiten=True).fit_transform(xy_scaled)
type(xy_pca), np.shape(xy_pca), xy_pca

# Rotation
rot45_X = .707* xy_scaled[:,1] + .707* xy_scaled[:,0] 
rot45_Y = .707* xy_scaled[:,1] - .707* xy_scaled[:,0]

rot30_X = (1.732/2)* xy_scaled[:,0] + (1./2)* xy_scaled[:,1]
rot30_Y = (1.732/2)* xy_scaled[:,1] - (1./2)* xy_scaled[:,0]

rot60_X = (1./2)* xy_scaled[:,0] + (1.732/2)* xy_scaled[:,1] 
rot60_Y = (1./2)* xy_scaled[:,1] - (1.732/2)* xy_scaled[:,0]

# Polar coordinates
radial_r = np.sqrt( np.power(xy_scaled[:,1],2) + np.power(xy_scaled[:,0],2) )
radial_phi = np.arctan2(xy_scaled[:,1], xy_scaled[:,0])

y_scaled_df=pd.DataFrame(xy_scaled)
xy_pca_df=pd.DataFrame(xy_pca)

In [113]:
df_raw_test['xy_scaled_1']    = xy_scaled_df[0]
df_raw_test['xy_scaled_2']    = xy_scaled_df[1]
df_raw_test['xy_pca_1']       = xy_pca_df[0]
df_raw_test['xy_pca_2']       = xy_pca_df[1]
df_raw_test['xy_x_rotate_30'] = rot30_X
df_raw_test['xy_x_rotate_45'] = rot45_X
df_raw_test['xy_x_rotate_60'] = rot60_X
df_raw_test['xy_y_rotate_30'] = rot30_Y
df_raw_test['xy_y_rotate_45'] = rot45_Y
df_raw_test['xy_y_rotate_60'] = rot60_Y
df_raw_test['xy_radial_r']    = radial_r
df_raw_test['xy_radial_phi']  = radial_phi

#### Let's add the address features

In [114]:
# Create a feature for street intersection
df_raw_test['ad_contains_intersection']=df_raw_test['Address'].str.contains('/')

# Create a feature for block or no-block in the address
add_upper=df_raw_test['Address'].str.upper()
df_raw_test['ad_contains_block']=add_upper.str.contains('BLOCK')

# Create a feature to capture the data before and after the '/'
add_p1, add_p2 = df_raw_test['Address'].str.split('/', 1).str

add_p1=add_p1.str.rstrip()
df_raw_test['add_p1']=add_p1.str.lstrip()
add_p2=add_p2.str.rstrip()
df_raw_test['add_p2']=add_p2.str.lstrip()

# Check if the split went well
df_raw_test[['Address','add_p1','add_p2']].head(15)

Unnamed: 0,Address,add_p1,add_p2
0,2000 Block of THOMAS AV,2000 Block of THOMAS AV,
1,3RD ST / REVERE AV,3RD ST,REVERE AV
2,2000 Block of GOUGH ST,2000 Block of GOUGH ST,
3,4700 Block of MISSION ST,4700 Block of MISSION ST,
4,4700 Block of MISSION ST,4700 Block of MISSION ST,
5,BROAD ST / CAPITOL AV,BROAD ST,CAPITOL AV
6,100 Block of CHENERY ST,100 Block of CHENERY ST,
7,200 Block of BANKS ST,200 Block of BANKS ST,
8,2900 Block of 16TH ST,2900 Block of 16TH ST,
9,TAYLOR ST / GREEN ST,TAYLOR ST,GREEN ST


In [115]:
# Type of the first part of address (ST, AV BL etc)
df_raw_test['add_p1_type']=df_raw_test['add_p1'].str[-2:]
np.unique(df_raw_test['add_p1_type'])

array(['80', 'AL', 'AR', 'AV', 'AY', 'BL', 'CR', 'CT', 'DR', 'ER', 'EX', 'HY', 'LN', 'MS', 'PL', 'PZ', 'RD',
       'RK', 'RW', 'ST', 'TR', 'WK', 'WY'], dtype=object)

In [116]:
df_raw_test.columns

Index(['Id', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y', 'DatesYear',
       'DatesMonth', 'DatesWeek', 'DatesDay', 'DatesDayofweek',
       'DatesDayofyear', 'DatesIs_month_end', 'DatesIs_month_start',
       'DatesIs_quarter_end', 'DatesIs_quarter_start', 'DatesIs_year_end',
       'DatesIs_year_start', 'DatesHour', 'DatesMinute', 'DatesSecond',
       'DatesElapsed', 'xy_scaled_1', 'xy_scaled_2', 'xy_pca_1', 'xy_pca_2',
       'xy_x_rotate_30', 'xy_x_rotate_45', 'xy_x_rotate_60', 'xy_y_rotate_30',
       'xy_y_rotate_45', 'xy_y_rotate_60', 'xy_radial_r', 'xy_radial_phi',
       'ad_contains_intersection', 'ad_contains_block', 'add_p1', 'add_p2',
       'add_p1_type'],
      dtype='object')

In [117]:
# training dataset column
df_raw.columns

Index(['Category', 'PdDistrict', 'Address', 'X', 'Y', 'DatesYear',
       'DatesMonth', 'DatesWeek', 'DatesDay', 'DatesDayofweek',
       'DatesDayofyear', 'DatesIs_month_end', 'DatesIs_month_start',
       'DatesIs_quarter_end', 'DatesIs_quarter_start', 'DatesIs_year_end',
       'DatesIs_year_start', 'DatesHour', 'DatesMinute', 'DatesSecond',
       'DatesElapsed', 'xy_scaled_1', 'xy_scaled_2', 'xy_pca_1', 'xy_pca_2',
       'xy_x_rotate_30', 'xy_x_rotate_45', 'xy_x_rotate_60', 'xy_y_rotate_30',
       'xy_y_rotate_45', 'xy_y_rotate_60', 'xy_radial_r', 'xy_radial_phi',
       'ad_contains_intersection', 'ad_contains_block', 'add_p1', 'add_p2',
       'add_p1_type'],
      dtype='object')

In [118]:
# Let's drop the duplicate column - DayOfWeek and the extra 'Id' column.
df_raw_test.drop(['DayOfWeek','Id'],axis=1,inplace=True)

In [119]:
df_raw_test.head()

Unnamed: 0,PdDistrict,Address,X,Y,DatesYear,DatesMonth,DatesWeek,DatesDay,DatesDayofweek,DatesDayofyear,...,xy_y_rotate_30,xy_y_rotate_45,xy_y_rotate_60,xy_radial_r,xy_radial_phi,ad_contains_intersection,ad_contains_block,add_p1,add_p2,add_p1_type
0,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051,2015,5,19,10,6,130,...,-0.437913,-0.580327,-0.683341,0.749475,-0.100412,False,True,2000 Block of THOMAS AV,,AV
1,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432,2015,5,19,10,6,130,...,-0.572731,-0.768164,-0.911446,1.0092,-0.079882,True,False,3RD ST,REVERE AV,ST
2,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212,2015,5,19,10,6,130,...,0.090433,0.105737,0.113863,0.115033,2.760624,False,True,2000 Block of GOUGH ST,,ST
3,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,2015,5,19,10,6,130,...,0.1478,0.262429,0.359242,0.485556,-2.927289,False,True,4700 Block of MISSION ST,,ST
4,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,2015,5,19,10,6,130,...,0.1478,0.262429,0.359242,0.485556,-2.927289,False,True,4700 Block of MISSION ST,,ST


In [120]:
# Now, convert text into numbers
# Changes any columns of strings in df into categorical variables using trn as a template for the category codes.
apply_cats(df=df_raw_test,trn=df_raw)

In [121]:
df_test_x = proc_df(df_raw_test)

In [122]:
df_test_x[0].head()

Unnamed: 0,PdDistrict,Address,X,Y,DatesYear,DatesMonth,DatesWeek,DatesDay,DatesDayofweek,DatesDayofyear,...,xy_y_rotate_60,xy_radial_r,xy_radial_phi,ad_contains_intersection,ad_contains_block,add_p1,add_p2,add_p1_type,xy_scaled_1_na,xy_scaled_2_na
0,1,6421,-122.399588,37.735051,2015,5,19,10,6,130,...,-0.683341,0.749475,-0.100412,False,True,5894,0,4,False,False
1,1,9761,-122.391523,37.732432,2015,5,19,10,6,130,...,-0.911446,1.0092,-0.079882,True,False,8307,1293,21,False,False
2,5,6350,-122.426002,37.792212,2015,5,19,10,6,130,...,0.113863,0.115033,2.760624,False,True,5823,0,21,False,False
3,3,10657,-122.437394,37.721412,2015,5,19,10,6,130,...,0.359242,0.485556,-2.927289,False,True,9034,0,21,False,False
4,3,10657,-122.437394,37.721412,2015,5,19,10,6,130,...,0.359242,0.485556,-2.927289,False,True,9034,0,21,False,False


In [123]:
# pd.concat is similar to rbind/cbind in R
pd.concat([df_raw[['PdDistrict']].head(10), df_x[['PdDistrict']].head(10)],axis=1)

Unnamed: 0,PdDistrict,PdDistrict.1
0,NORTHERN,5
1,NORTHERN,5
2,NORTHERN,5
3,NORTHERN,5
4,PARK,6
5,INGLESIDE,3
6,INGLESIDE,3
7,BAYVIEW,1
8,RICHMOND,7
9,CENTRAL,2


In [124]:
# pd.concat is similar to rbind/cbind in R
pd.concat([df_raw_test.loc[:,'PdDistrict'].head(10),df_test_x[0].loc[:,'PdDistrict'].head(10)],axis=1)

Unnamed: 0,PdDistrict,PdDistrict.1
0,BAYVIEW,1
1,BAYVIEW,1
2,NORTHERN,5
3,INGLESIDE,3
4,INGLESIDE,3
5,TARAVAL,9
6,INGLESIDE,3
7,INGLESIDE,3
8,MISSION,4
9,CENTRAL,2


In [125]:
type(df_test_x[0])

pandas.core.frame.DataFrame

Now let's try the model on this test data

In [126]:
df_test_x[0].shape

(884262, 39)

In [127]:
df_x.columns

Index(['PdDistrict', 'Address', 'X', 'Y', 'DatesYear', 'DatesMonth',
       'DatesWeek', 'DatesDay', 'DatesDayofweek', 'DatesDayofyear',
       'DatesIs_month_end', 'DatesIs_month_start', 'DatesIs_quarter_end',
       'DatesIs_quarter_start', 'DatesIs_year_end', 'DatesIs_year_start',
       'DatesHour', 'DatesMinute', 'DatesSecond', 'DatesElapsed',
       'xy_scaled_1', 'xy_scaled_2', 'xy_pca_1', 'xy_pca_2', 'xy_x_rotate_30',
       'xy_x_rotate_45', 'xy_x_rotate_60', 'xy_y_rotate_30', 'xy_y_rotate_45',
       'xy_y_rotate_60', 'xy_radial_r', 'xy_radial_phi',
       'ad_contains_intersection', 'ad_contains_block', 'add_p1', 'add_p2',
       'add_p1_type'],
      dtype='object')

In [128]:
df_test_x[0].columns

Index(['PdDistrict', 'Address', 'X', 'Y', 'DatesYear', 'DatesMonth',
       'DatesWeek', 'DatesDay', 'DatesDayofweek', 'DatesDayofyear',
       'DatesIs_month_end', 'DatesIs_month_start', 'DatesIs_quarter_end',
       'DatesIs_quarter_start', 'DatesIs_year_end', 'DatesIs_year_start',
       'DatesHour', 'DatesMinute', 'DatesSecond', 'DatesElapsed',
       'xy_scaled_1', 'xy_scaled_2', 'xy_pca_1', 'xy_pca_2', 'xy_x_rotate_30',
       'xy_x_rotate_45', 'xy_x_rotate_60', 'xy_y_rotate_30', 'xy_y_rotate_45',
       'xy_y_rotate_60', 'xy_radial_r', 'xy_radial_phi',
       'ad_contains_intersection', 'ad_contains_block', 'add_p1', 'add_p2',
       'add_p1_type', 'xy_scaled_1_na', 'xy_scaled_2_na'],
      dtype='object')

In [129]:
df_test = pd.DataFrame(df_test_x[0])
df_test.drop(['xy_scaled_1_na', 'xy_scaled_2_na'],axis=1,inplace=True)

In [130]:
df_test.columns

Index(['PdDistrict', 'Address', 'X', 'Y', 'DatesYear', 'DatesMonth',
       'DatesWeek', 'DatesDay', 'DatesDayofweek', 'DatesDayofyear',
       'DatesIs_month_end', 'DatesIs_month_start', 'DatesIs_quarter_end',
       'DatesIs_quarter_start', 'DatesIs_year_end', 'DatesIs_year_start',
       'DatesHour', 'DatesMinute', 'DatesSecond', 'DatesElapsed',
       'xy_scaled_1', 'xy_scaled_2', 'xy_pca_1', 'xy_pca_2', 'xy_x_rotate_30',
       'xy_x_rotate_45', 'xy_x_rotate_60', 'xy_y_rotate_30', 'xy_y_rotate_45',
       'xy_y_rotate_60', 'xy_radial_r', 'xy_radial_phi',
       'ad_contains_intersection', 'ad_contains_block', 'add_p1', 'add_p2',
       'add_p1_type'],
      dtype='object')

In [152]:
m = RandomForestClassifier(n_estimators=30, n_jobs=-1)
m.fit(train,y_new)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [153]:
results = m.predict(df_test)

In [154]:
results.shape

(884262,)

In [155]:
results[1:50]

array([21, 20, 20, 20, 21, 16, 36, 16, 16, 16,  1, 19, 16, 16, 21, 36, 35, 16, 16, 16, 36, 16,  1, 16, 36,
       16, 16, 16, 16, 16, 36, 16, 16, 16, 16, 16, 16, 36, 20, 16, 16,  1, 36, 36, 16, 16, 16, 16, 36],
      dtype=int8)

In [156]:
np.unique(results)

array([ 0,  1,  4,  5,  6,  7,  8,  9, 12, 13, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 30, 32,
       34, 35, 36, 37, 38], dtype=int8)

In [157]:
len(np.unique(results))

30

We have the predictions for each of the test set example.

### Pulse-check: Submit the existing predictions to Kaggle and check the accuracy

In [158]:
len(results)

884262

In [159]:
submission_file_accumulator=list()

for i in range(0,len(results)):
    temp=np.zeros(39,dtype=int)
    temp[results[i]]=1
    line=np.insert(temp,0,i)
    submission_file_accumulator.append(line)

In [160]:
len(submission_file_accumulator)

884262

In [161]:
tt = pd.DataFrame(submission_file_accumulator)

In [162]:
tt.shape

(884262, 40)

In [163]:
tt.to_csv('data/sf-crime/result_v5.csv',index=False)

In [105]:
df_raw.Category.cat.categories

Index(['ARSON', 'ASSAULT', 'BURGLARY', 'DISORDERLY CONDUCT',
       'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS',
       'EMBEZZLEMENT', 'FORGERY/COUNTERFEITING', 'FRAUD', 'KIDNAPPING',
       'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
       'NON-CRIMINAL', 'OTHER OFFENSES', 'PROSTITUTION', 'RECOVERED VEHICLE',
       'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
       'STOLEN PROPERTY', 'SUSPICIOUS OCC', 'TRESPASS', 'VANDALISM',
       'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS'],
      dtype='object')

In [106]:
#y.cat.categories

In [149]:
results[21:40]+1

array([21, 21,  2, 17, 37, 21, 17, 17, 17, 17, 37, 17, 17, 17, 17, 17, 17, 17, 17], dtype=int8)

In [86]:
np.unique(results)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
       25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38], dtype=int8)

In [159]:
tt.iloc[0:4,0:6]

Unnamed: 0,0,1,2,3,4,5
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,2,0,0,0,0,0
3,3,0,0,0,0,0


In [157]:
tt.columns.values[0:5]

array([0, 1, 2, 3, 4])

### Fine-tuning 

Let's try to improve the basic RF model. <br><br>
There are number of ways we could try,
1. Increase n_estimators (number of trees)
2. Adding OOB score
3. Subsampling
4. Tree building parameters<br>
    4.1 Max depth<br>
    4.2 Min sample leaf<br>
    4.3 Max features

In [52]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [61]:
import random
# Let's select 80,0000 random numbers between 0 to 800,000.
val_idx = random.sample(range(800000), 80000)

In [121]:
# Creating a good validation set is really really important. 
test_idx=random.sample(range(800000), 80000)
# Create train dataset
X_valid=df_x.iloc[test_idx]
y_valid=y[test_idx]
# Create test dataset
X_train=df_x.drop(df_x.index[test_idx])
y_train=np.delete(y,test_idx,axis=0)
# Check dimensions
df_x.shape,X_train.shape,X_valid.shape,len(y),len(y_train),len(y_valid)

((878049, 20), (798049, 20), (80000, 20), 878049, 798049, 80000)

In [122]:
m = RandomForestClassifier(n_estimators=5, n_jobs=-1)
m.fit(X_train,y_train)
#print_score(m)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.8225560084656456, 0.267175)

In [123]:
m = RandomForestClassifier(n_estimators=15, n_jobs=-1)
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.8644318832552889, 0.3022625)

In [124]:
m = RandomForestClassifier(n_estimators=40, n_jobs=-1)
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.8667337469253141, 0.3206375)

In [125]:
m = RandomForestClassifier(n_estimators=80, n_jobs=-1)
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.8667938936080366, 0.324725)

As you can see adding more number of trees is not helping anymore.<br><br>Let's try the sub-sampling approach.

In [126]:
set_rf_samples(40000)

In [127]:
m = RandomForestClassifier(n_estimators=80, n_jobs=-1)
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.4005994619378008, 0.299075)

In [128]:
# Let's increase the size of bootstrap
set_rf_samples(80000)
m = RandomForestClassifier(n_estimators=30, n_jobs=-1)
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.4864425617975839, 0.2975625)

In [129]:
# Let's increase the size of bootstrap
set_rf_samples(120000)
m = RandomForestClassifier(n_estimators=50, n_jobs=-1)
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.617082409726721, 0.315325)

In [130]:
# Let's increase the size of bootstrap
set_rf_samples(30000)
m = RandomForestClassifier(n_estimators=120, n_jobs=-1)
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.3738993470325757, 0.2999875)

Random subsampling is not really helping. Let's try some other strategies.

In [131]:
# Revert to full boorstrap sample.
reset_rf_samples()

In [133]:
m = RandomForestClassifier(n_estimators=20, n_jobs=-1)
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.8659205136526704, 0.3097125)

Let's play around with tree parameters.

In [134]:
m = RandomForestClassifier(n_estimators=20, n_jobs=-1,min_samples_leaf=3)
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.756560060848394, 0.3237)

In [135]:
m = RandomForestClassifier(n_estimators=30, n_jobs=-1,min_samples_leaf=3)
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.7696507357317658, 0.3303125)

In [136]:
m = RandomForestClassifier(n_estimators=50, n_jobs=-1,min_samples_leaf=3)
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.7808292473269185, 0.335425)

In [137]:
m = RandomForestClassifier(n_estimators=50, n_jobs=-1,min_samples_leaf=2)
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.8470357083336988, 0.3301125)

In [138]:
# Let's add max features attributes
m = RandomForestClassifier(n_estimators=30, n_jobs=-1,min_samples_leaf=3,max_features=0.6)
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.7971540594625142, 0.3322125)

In [139]:
# Let's add max features = sqrt
m = RandomForestClassifier(n_estimators=30, n_jobs=-1,min_samples_leaf=3,max_features='sqrt')
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.7684866468099076, 0.3310125)

In [140]:
# Let's add max features = sqrt, n_estimators = 120
m = RandomForestClassifier(n_estimators=120, n_jobs=-1,min_samples_leaf=3,max_features='sqrt')
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.7897121605314962, 0.338625)

In [141]:
# Let's add max features = log, n_estimators = 200
m = RandomForestClassifier(n_estimators=200, n_jobs=-1,min_samples_leaf=3,max_features='log2')
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.7924175081981182, 0.33905)

In [142]:
# Let's add max features = log, n_estimators = 200
m = RandomForestClassifier(n_estimators=200, n_jobs=-1,min_samples_leaf=3,max_features='sqrt')
m.fit(X_train,y_train)
m.score(X_train,y_train),m.score(X_valid, y_valid)

(0.7922796720502124, 0.3399875)

In [143]:
results = m.predict(df_test)

In [144]:
results.shape

(884262,)

In [145]:
results[1:50]

array([21, 16, 20, 20, 21, 36, 16,  1, 16, 16, 16, 16, 16, 16,  7, 21, 36, 16, 16, 16, 16, 16,  1, 16, 16,
       16, 16, 16, 16, 16, 36, 16, 16, 16, 16, 21, 16, 16, 16, 16, 16, 20, 36, 36, 16, 16, 16, 16, 36],
      dtype=int8)

In [146]:
np.unique(results)

array([ 0,  1,  4,  5,  7,  8, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 31, 32, 34,
       35, 36, 37, 38], dtype=int8)

In [147]:
submission_file_accumulator=list()

for i in range(0,len(results)):
    temp=np.zeros(39,dtype=int)
    temp[results[i]]=1
    line=np.insert(temp,0,i)
    submission_file_accumulator.append(line)

In [148]:
len(submission_file_accumulator)

884262

In [149]:
tt = pd.DataFrame(submission_file_accumulator)
tt.shape

(884262, 40)

In [150]:
tt.to_csv('data/Kaggle/competitions/sf-crime/result_v2.csv',index=False)