In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import sklearn.metrics as skm
import time
import operator
import re
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import grid_search

%matplotlib inline

<b> FUNCTIONS

<b> GET DATA AND DEFINE TEST/TRAIN

In [2]:
train = pd.read_csv(".\\train.csv",parse_dates=[0])
train["key"] = 'Train'
test = pd.read_csv(".\\test.csv",parse_dates=[1])
test["key"] = 'Test'

In [3]:
df = pd.concat([train,test])
df.reset_index(inplace=True)
df.drop('index',axis=1,inplace=True)

<b> FEATURE ENGINEERING

In [4]:
dat = pd.DataFrame(df["Dates"])
dat = dat.drop_duplicates()
dat.reset_index(inplace=True)
del dat["index"]
dat["linear_time"] = dat.index
df = df.merge(dat,how='left', left_on='Dates',right_on='Dates')

In [5]:
df["New_Cat"] = "NON TOP FOUR"
df["New_Cat"].ix[df["Category"] =="OTHER OFFENSES"] = "OTHER OFFENSES"
df["New_Cat"].ix[df["Category"] =="LARCENY/THEFT"] = "LARCENY/THEFT"
df["New_Cat"].ix[df["Category"] =="NON-CRIMINAL"] = "NON-CRIMINAL"
df["New_Cat"].ix[df["Category"] =="ASSAULT"] = "ASSAULT"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [6]:
start = time.time()
df["hour"] = df.Dates.apply(lambda x: x.hour)
df["month"] = df.Dates.apply(lambda x: x.month)
df["week"] = df.Dates.apply(lambda x: x.week)
df["year"] = df.Dates.apply(lambda x: x.year)
df["day"] = df.Dates.apply(lambda x: x.day)
end = time.time()
print(end - start)

68.50351977348328


In [7]:
df["weekday"] = df.Dates.apply(lambda x: x.weekday)
df["weekofyear"] = df.Dates.apply(lambda x: x.weekofyear)
df["dayofyear"] = df.Dates.apply(lambda x: x.dayofyear)

In [8]:
dow = {
'Monday':0,
'Tuesday':1,
'Wednesday':2,
'Thursday':3,
'Friday':4,
'Saturday':5,
'Sunday':6
}
df["DOW"] = df.DayOfWeek.map(dow)

In [9]:
df["X_shift_small"] = df["X"].apply(lambda x: x * 10).astype(int)
df["Y_shift_small"] = df["Y"].apply(lambda x: x * 10).astype(int)

df["X_shift"] = df["X"].apply(lambda x: x * 100).astype(int)
df["Y_shift"] = df["Y"].apply(lambda x: x * 100).astype(int)

df["X_shift_big"] = df["X"].apply(lambda x: (x * 1000)/7).astype(int)
df["Y_shift_big"] = df["Y"].apply(lambda x: (x * 1000)/7).astype(int)

In [10]:
def TOD(x):
    if x in range(8,21):
        return "Day"
    else:
        return "Night"
df["TOD"] = df.hour.map(TOD)

In [11]:
xy_scaler=preprocessing.StandardScaler()
xy_scaler.fit(df[["X","Y"]])
df["X_fit"] = df["X"]
df["Y_fit"] = df["Y"]
df[["X_fit","Y_fit"]]=xy_scaler.transform(df[["X","Y"]])

In [12]:
df["rot45_X"] = .707* df["Y_fit"] + .707* df["X_fit"] 
df["rot45_Y"] = .707* df["Y_fit"] - .707* df["X_fit"]

df["rot30_X"] = (1.732/2)* df["X_fit"] + (1./2)* df["Y_fit"] 
df["rot30_Y"] = (1.732/2)* df["Y_fit"] - (1./2)* df["X_fit"]

df["rot60_X"] = (1./2)* df["X_fit"] + (1.732/2)* df["Y_fit"] 
df["rot60_Y"] = (1./2)* df["Y_fit"] - (1.732/2)* df["X_fit"]

df["radial_r"] = np.sqrt( np.power(df["Y_fit"],2) + np.power(df["X_fit"],2) )

In [13]:
df["X_shift_big_rot"] = df["rot30_X"].apply(lambda x: (x * 10)).astype(int)
df["Y_shift_big_rot"] = df["rot30_Y"].apply(lambda x: (x * 10)).astype(int)
df["rad"] = df["radial_r"].apply(lambda x: x * 10).astype(int)

In [14]:
def address(x):
    if re.search("/",x) == None:
        return 0
    else:
        return 1

In [15]:
df["Corner"] = df["Address"].map(address)

In [16]:
def street_one(x):
    return x.split()[-2]
    

In [17]:
def street_two(x):
    return x.split()[0]

In [18]:
def add_number(x):
    return x.split()[0]

In [19]:
df["Street_One"] = 0
df["Street_Two"] = 0
df["Address_Number"] = 100000

In [20]:
df["Street_One"] = df["Address"].map(street_one)
df["Street_Two"].loc[(df["Corner"]==1)] = df["Address"].map(street_two)
df["Address_Number"].loc[(df["Corner"]==0)]= df["Address"].map(add_number)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [21]:
df["PD_factor"] = pd.factorize(df["PdDistrict"])[0]
df["Address_factor"] = pd.factorize(df["Address"])[0]
df["Street_One"] = pd.factorize(df["Street_One"])[0]
df["Street_Two"] = pd.factorize(df["Street_Two"])[0]
df["Address_Number"] = pd.factorize(df["Address_Number"])[0]

<b> TESTING FEATURES

In [22]:
origtrainmask = df['key'] == 'Train'
origtestmask = df['key'] == 'Test'
origtraindf = df[origtrainmask]
origtestdf = df[origtestmask]
#origtestdf.reset_index(inplace=True,drop=True)

In [127]:
mod = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,warm_start=True)
mod.fit(origtraindf[["Address_Number","Street_One","Street_Two","X_fit","Y_fit","rad","DOW","year",\
"hour","PD_factor"]].as_matrix(),origtraindf[["Category"]].as_matrix().ravel())
x = pd.DataFrame()
origtestdf.reset_index(inplace=True,drop=True)
rounds = np.ceil(((origtestdf.index.max()+1)/25000))
for i in range(int(rounds)):
    minn = i * 25000
    maxx = 25000 + minn
    a = pd.DataFrame(mod.predict_proba(origtestdf[["Address_Number","Street_One","Street_Two","X_fit","Y_fit","rad","DOW","year",\
"hour","PD_factor"]].loc[(origtestdf.index>=minn) & (origtestdf.index<maxx)]))
    x = x.append(a)

In [128]:
mod2 = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,warm_start=True)
mod2.fit(origtraindf[["X_fit","Y_fit","rad","year",\
"hour","PD_factor"]].as_matrix(),origtraindf[["Category"]].as_matrix().ravel())

x2 = pd.DataFrame()
rounds = np.ceil(((origtestdf.index.max()+1)/25000))
for i in range(int(rounds)):
    minn = i * 25000
    maxx = 25000 + minn
    a = pd.DataFrame(mod2.predict_proba(origtestdf[["X_fit","Y_fit","rad","year",\
"hour","PD_factor"]].loc[(origtestdf.index>=minn) & (origtestdf.index<maxx)]))
    x2 = x2.append(a)

In [129]:
mod3 = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,warm_start=True)
mod3.fit(origtraindf[["Corner","Address_factor","X_fit","Y_fit","year",\
"hour","PD_factor"]].as_matrix(),origtraindf[["Category"]].as_matrix().ravel())
x3 = pd.DataFrame()
rounds = np.ceil(((origtestdf.index.max()+1)/25000))
for i in range(int(rounds)):
    minn = i * 25000
    maxx = 25000 + minn
    a = pd.DataFrame(mod3.predict_proba(origtestdf[["Corner","Address_factor","X_fit","Y_fit","year",\
"hour","PD_factor"]].loc[(origtestdf.index>=minn) & (origtestdf.index<maxx)]))
    x3 = x3.append(a)

In [130]:
wx = x * .6
wx2 = x2 * .2
wx3 = x3 * .2

y = wx + wx2 + wx3

In [131]:
y.reset_index(inplace=True,drop=True)
y.columns = mod.classes_
y = y.merge(origtestdf[["Id"]],how='left',left_index=True,right_index=True)
y = y.fillna(0)
y["Id"] = y["Id"].astype(int)
y["Id"] = y["Id"].astype(str)

In [132]:
for col in y.columns:
    if col != "Id":
        y[col] = y[col].round(decimals=6)

In [135]:
y.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS,Id
0,0.004052,0.132886,0.000123,0.001043,0.037646,0.00186,0.003572,0.023707,0.00187,0.001371,...,0.005565,0.000299,0.054528,1e-06,0.006325,0.079582,0.127304,0.026018,0.019444,0
1,0.000537,0.083574,1.5e-05,0.000184,0.002773,0.002664,0.009085,0.063781,0.00314,0.000298,...,0.001639,0.00033,0.03607,0.0,0.001558,0.016911,0.053405,0.088856,0.027568,1
2,0.005087,0.05942,0.000276,1.7e-05,0.100746,0.000992,0.001073,0.01709,0.003163,0.000261,...,0.006514,0.000189,0.02788,0.0,0.007708,0.060943,0.070925,0.018652,0.002757,2
3,0.000704,0.130627,7.6e-05,0.001466,0.033928,0.003759,0.002234,0.055586,0.008239,0.00011,...,0.013286,0.000207,0.042843,0.0,0.004329,0.07678,0.083437,0.059459,0.019407,3
4,0.000704,0.130627,7.6e-05,0.001466,0.033928,0.003759,0.002234,0.055586,0.008239,0.00011,...,0.013286,0.000207,0.042843,0.0,0.004329,0.07678,0.083437,0.059459,0.019407,4


In [136]:
y.to_csv(".\\Fourteenth_RF_combo_of_three_RF.csv",index=False)

In [43]:
X_train, X_test, Y_train, Y_test = train_test_split(\
origtraindf[["Address_Number","Street_One","Street_Two","X_fit","Y_fit","rad","DOW","year",\
"hour","PD_factor"]],origtraindf[["Category"]],\
        test_size = .2, random_state=25)

mod = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,warm_start=True,oob_score = True)
mod.fit(origtraindf[["Address_Number","Street_One","Street_Two","X_fit","Y_fit","rad","DOW","year",\
"hour","PD_factor"]].as_matrix(),origtraindf[["Category"]].as_matrix().ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=25, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=True)

In [45]:
x = pd.DataFrame()
origtestdf.reset_index(inplace=True,drop=True)
rounds = np.ceil(((origtestdf.index.max()+1)/25000))
for i in range(int(rounds)):
    minn = i * 25000
    maxx = 25000 + minn
    a = pd.DataFrame(mod.predict_proba(origtestdf[["Address_Number","Street_One","Street_Two","X_fit","Y_fit","rad","DOW","year",\
"hour","PD_factor"]].loc[(origtestdf.index>=minn) & (origtestdf.index<maxx)]))
    x = x.append(a)

In [25]:
skm.log_loss(pd.get_dummies(Y_test),x.as_matrix())

2.3292350922796241

In [46]:
X_train, X_test, Y_train, Y_test = train_test_split(\
origtraindf[["X_fit","Y_fit","rad","year",\
"hour","PD_factor"]],origtraindf[["Category"]],\
        test_size = .2, random_state=25)

mod2 = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,warm_start=True,oob_score = True)
mod2.fit(origtraindf[["X_fit","Y_fit","rad","year",\
"hour","PD_factor"]].as_matrix(),origtraindf[["Category"]].as_matrix().ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=25, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=True)

In [47]:
x2 = pd.DataFrame()
X_test.reset_index(inplace=True,drop=True)
rounds = np.ceil(((origtestdf.index.max()+1)/25000))
for i in range(int(rounds)):
    minn = i * 25000
    maxx = 25000 + minn
    a = pd.DataFrame(mod2.predict_proba(origtestdf[["X_fit","Y_fit","rad","year",\
"hour","PD_factor"]].loc[(origtestdf.index>=minn) & (origtestdf.index<maxx)]))
    x2 = x2.append(a)

In [28]:
skm.log_loss(pd.get_dummies(Y_test),x2.as_matrix())

2.3713926496101774

In [48]:
X_train, X_test, Y_train, Y_test = train_test_split(origtraindf[["Corner","Address_factor","X_fit","Y_fit","year",\
"hour","PD_factor"]],origtraindf[["Category"]],test_size = .2, random_state=25)

mod3 = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,warm_start=True,oob_score = True)
mod3.fit(origtraindf[["Corner","Address_factor","X_fit","Y_fit","year",\
"hour","PD_factor"]],origtraindf[["Category"]].as_matrix().ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=25, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=True)

In [49]:
x3 = pd.DataFrame()
X_test.reset_index(inplace=True,drop=True)
rounds = np.ceil(((origtestdf.index.max()+1)/25000))
for i in range(int(rounds)):
    minn = i * 25000
    maxx = 25000 + minn
    a = pd.DataFrame(mod3.predict_proba(origtestdf[["Corner","Address_factor","X_fit","Y_fit","year",\
"hour","PD_factor"]].loc[(origtestdf.index>=minn) & (origtestdf.index<maxx)]))
    x3 = x3.append(a)

In [31]:
skm.log_loss(pd.get_dummies(Y_test),x3.as_matrix())

2.3369653311294658

In [50]:
X_train, X_test, Y_train, Y_test = train_test_split(origtraindf[["Address_factor","X_fit","Y_fit","year",\
"hour"]],origtraindf[["Category"]],test_size = .2, random_state=25)

mod4 = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,warm_start=True,oob_score = True)
mod4.fit(origtraindf[["Address_factor","X_fit","Y_fit","year",\
"hour"]],origtraindf[["Category"]].as_matrix().ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=25, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=True)

In [51]:
x4 = pd.DataFrame()
X_test.reset_index(inplace=True,drop=True)
rounds = np.ceil(((origtestdf.index.max()+1)/25000))
for i in range(int(rounds)):
    minn = i * 25000
    maxx = 25000 + minn
    a = pd.DataFrame(mod4.predict_proba(origtestdf[["Address_factor","X_fit","Y_fit","year",\
"hour"]].loc[(origtestdf.index>=minn) & (origtestdf.index<maxx)]))
    x4 = x4.append(a)

In [34]:
skm.log_loss(pd.get_dummies(Y_test),x4.as_matrix())

2.3628714744073704

In [52]:
X_train, X_test, Y_train, Y_test = train_test_split(origtraindf[["Corner","Address_factor","X_fit","Y_fit","year"]]\
                                                    ,origtraindf[["Category"]],test_size = .2, random_state=25)

mod5 = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,warm_start=True,oob_score = True)
mod5.fit(origtraindf[["Corner","Address_factor","X_fit","Y_fit","year"]],origtraindf[["Category"]].as_matrix().ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=25, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=True)

In [53]:
x5 = pd.DataFrame()
X_test.reset_index(inplace=True,drop=True)
rounds = np.ceil(((origtestdf.index.max()+1)/25000))
for i in range(int(rounds)):
    minn = i * 25000
    maxx = 25000 + minn
    a = pd.DataFrame(mod5.predict_proba(origtestdf[["Corner","Address_factor","X_fit","Y_fit","year"]].loc[(origtestdf.index>=minn) & (origtestdf.index<maxx)]))
    x5 = x5.append(a)

In [54]:
X_train, X_test, Y_train, Y_test = train_test_split(origtraindf[["Street_One","Street_Two","Address_Number","X_fit","Y_fit","year"]]\
                                                    ,origtraindf[["Category"]],test_size = .2, random_state=25)

mod6 = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,warm_start=True,oob_score = True)
mod6.fit(origtraindf[["Street_One","Street_Two","Address_Number","X_fit","Y_fit","year"]],origtraindf[["Category"]].as_matrix().ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=25, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=True)

In [55]:
x6 = pd.DataFrame()
X_test.reset_index(inplace=True,drop=True)
rounds = np.ceil(((origtestdf.index.max()+1)/25000))
for i in range(int(rounds)):
    minn = i * 25000
    maxx = 25000 + minn
    a = pd.DataFrame(mod6.predict_proba(origtestdf[["Street_One","Street_Two","Address_Number","X_fit","Y_fit","year"]].loc[(origtestdf.index>=minn) & (origtestdf.index<maxx)]))
    x6 = x6.append(a)

In [56]:
total_score = mod.oob_score_ + mod2.oob_score_ + mod3.oob_score_ + mod4.oob_score_\
+ mod5.oob_score_ + mod6.oob_score_

wx = x * (mod.oob_score_/total_score)
wx2 = x2 * (mod2.oob_score_/total_score)
wx3 = x3 * (mod3.oob_score_/total_score)
wx4 = x4 * (mod4.oob_score_/total_score)
wx5 = x5 * (mod5.oob_score_/total_score)
wx6 = x6 * (mod6.oob_score_/total_score)

y = wx + wx2 + wx3 + wx4 + wx5 + wx6

#skm.log_loss(pd.get_dummies(Y_test).as_matrix(),y.as_matrix())

In [59]:

y = wx + wx2 + wx3 + wx4 + wx5 + wx6

In [60]:
y.reset_index(inplace=True,drop=True)
y.columns = mod.classes_
y = y.merge(origtestdf[["Id"]],how='left',left_index=True,right_index=True)
y = y.fillna(0)
y["Id"] = y["Id"].astype(int)
y["Id"] = y["Id"].astype(str)

for col in y.columns:
    if col != "Id":
        y[col] = y[col].round(decimals=6)
        
y.to_csv(".\\Fifteenth_RF.csv",index=False)

In [110]:
skm.log_loss(pd.get_dummies(Y_test).as_matrix(),x.as_matrix())

2.3315711801803829

In [111]:
skm.log_loss(pd.get_dummies(Y_test).as_matrix(),x2.as_matrix())

2.3753607677413036

In [None]:
skm.log_loss(pd.get_dummies(Y_test).as_matrix(),x3.as_matrix())

In [None]:
skm.log_loss(pd.get_dummies(Y_test).as_matrix(),x4.as_matrix())

In [None]:
skm.log_loss(pd.get_dummies(Y_test).as_matrix(),x5.as_matrix())

In [None]:
skm.log_loss(pd.get_dummies(Y_test).as_matrix(),x6.as_matrix())

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(\
origtraindf[["X_fit","Y_fit","Corner"]],origtraindf[["Category"]],\
        test_size = .2, random_state=19)
mod4 = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=20\
                             ,n_jobs=-1,verbose=True,warm_start=True)
mod4.fit(X_train,Y_train.as_matrix().ravel())
x4 = pd.DataFrame(mod4.predict_proba(X_test))

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    5.5s finished


In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(\
origtraindf[["X_fit","Y_fit","hour"]],origtraindf[["Category"]],\
        test_size = .2, random_state=19)
mod5 = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=20\
                             ,n_jobs=-1,verbose=True,warm_start=True)
mod5.fit(X_train,Y_train.as_matrix().ravel())
x5 = pd.DataFrame(mod5.predict_proba(X_test))

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   31.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    5.7s finished


In [None]:
x = pd.DataFrame(mod.predict_proba(origtestdf[""])))

In [115]:
skm.log_loss(pd.get_dummies(Y_test).as_matrix(),x3.as_matrix())

2.4011392023905449

In [116]:
skm.log_loss(pd.get_dummies(Y_test).as_matrix(),x4.as_matrix())

2.4046187323462394

In [110]:
X_train, X_test, Y_train, Y_test = train_test_split(\
origtraindf[["X_fit","month","hour","year"]],origtraindf[["Category"]],\
        test_size = .2, random_state=19)
mod6 = RandomForestClassifier(n_estimators=50,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,verbose=True,warm_start=True)
mod6.fit(X_train,Y_train.as_matrix().ravel())
x6 = pd.DataFrame(mod6.predict_proba(X_test))

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   42.5s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.0s finished


In [117]:
skm.log_loss(pd.get_dummies(Y_test).as_matrix(),x5.as_matrix())

2.4239100930434287

In [104]:
skm.log_loss(pd.get_dummies(Y_test).as_matrix(),x6.as_matrix())

2.5326760113306572

In [18]:
mod = RandomForestClassifier(n_estimators=50,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,verbose=True,warm_start=True)
for i in range(4):
    X_train_add,Y_train_add = resample(X_train,Y_train)
    mod.fit(X_train_add,Y_train_add.as_matrix().ravel())
x = mod.predict_proba(X_test)
skm.log_loss(Y_test,x)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   42.5s finished
  warn("Warm-start fitting without increasing n_estimators does not "
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.8s finished


2.4136487723959275

In [20]:
mod = RandomForestClassifier(n_estimators=50,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,verbose=True,warm_start=True)
for i in range(4):
    trees = 50 + (50*i)
    mod.set_params(n_estimators=trees)
    X_train_add,Y_train_add = resample(X_train,Y_train)
    mod.fit(X_train_add,Y_train_add.as_matrix().ravel())
x = mod.predict_proba(X_test)
skm.log_loss(Y_test,x)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   41.9s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   42.5s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   42.2s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   42.2s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:  1.1min finished


2.3868404968595671

In [25]:
mod = RandomForestClassifier(min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,verbose=True,warm_start=True)
for i in range(8):
    trees = 25 + (25*i)
    mod.set_params(n_estimators=trees)
    X_train_add,Y_train_add = StratifiedShuffleSplit(X_train,Y_train,n_samples=350000)
    mod.fit(X_train_add,Y_train_add.as_matrix().ravel())
x = mod.predict_proba(X_test)
skm.log_loss(Y_test,x)

[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    8.3s finished
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    8.3s finished
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    8.4s finished
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    8.5s finished
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    8.4s finished
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    8.3s finished
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    8.3s finished
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    8.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   11.8s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:   14.4s finished


ValueError: operands could not be broadcast together with shapes (175610,39) (175610,38) (175610,39) 

In [30]:
X = StratifiedShuffleSplit(X_train,test_size=.0001,random_state=3)
for _,test in X:
    print(test)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of labels for any class cannot be less than 2.

In [22]:
len(X_train)

702439

In [21]:
mod = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=15\
    ,n_jobs=-1,verbose=True,warm_start=True)
mod.fit(X_train,Y_train.as_matrix().ravel())
x = mod.predict_proba(X_test)
skm.log_loss(Y_test,x)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    4.1s finished


2.3838382669871492

In [None]:
for i in range(4):
    X_train_add,Y_train_add = resample(X_train,Y_train)
    X_train = np.concatenate((X_train,X_train_add))
    Y_train = np.concatenate((Y_train,Y_train_add))
mod = RandomForestClassifier(n_estimators=50,min_samples_leaf=25,max_depth=15,n_jobs=-1)

In [57]:
X_train, X_test, Y_train, Y_test = train_test_split(\
origtraindf[["X_fit","Y_fit","year","month","hour","PD_factor"]],origtraindf[["New_Cat"]],\
        test_size = .2, random_state=19)
mod = GradientBoostingClassifier(n_estimators=40,learning_rate=.7,\
                                 min_samples_leaf=25,max_depth=3,verbose=1)
mod.fit(X_train,Y_train.ravel())
x = mod.predict_proba(X_test)
skm.log_loss(Y_test,x)

      Iter       Train Loss   Remaining Time 
         1      978092.6246            2.98m
         2      964407.4593            2.79m
         3      959784.2409            2.67m
         4      957172.9023            2.56m
         5      954168.3498            2.48m
         6      950979.3228            2.40m
         7      949123.6875            2.33m
         8      947979.6759            2.26m
         9      946750.1123            2.19m
        10      946029.3889            2.11m
        20      938194.8992            1.39m
        30      933529.2112           41.35s
        40      929994.3069            0.00s


1.3262297762529183

In [62]:
X_train, X_test, Y_train, Y_test = train_test_split(\
origtraindf[["X_fit","Y_fit","year","month","hour","PD_factor"]],origtraindf[["Category"]],\
                                                    test_size = .2, random_state=19)
mod = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=15,n_jobs=-1)
mod.fit(X_train,Y_train.ravel())
x = mod.predict_proba(X_test)
skm.log_loss(Y_test,x)

2.3837611035727582

In [33]:
mod.feature_importances_

array([ 0.06981988,  0.12556554,  0.19709628,  0.2474611 ,  0.06906078,
        0.03681681,  0.0844196 ,  0.10704838,  0.06271163])

In [34]:
X_train, X_test, Y_train, Y_test = train_test_split(\
origtraindf[["Corner","Address_factor","X_fit","Y_fit","rad","DOW","year",\
"hour","PD_factor"]],\
origtraindf[["Category"]],test_size = .2, random_state=19)

mod = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,verbose=True)

mod.fit(X_train,Y_train.as_matrix().ravel())
x = mod.predict_proba(X_test)
skm.log_loss(Y_test,x)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    4.8s finished


2.3452532382772509

In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(\
    origtraindf[["X_fit","Y_fit","rad","DOW","year","week","month","hour","PD_factor"]],\
    origtraindf[["Category"]],test_size = .2, random_state=19)
mod = RandomForestClassifier(n_estimators=100,min_samples_leaf=25,max_depth=15\
                             ,n_jobs=-1,class_weight={'LARCENY/THEFT':.75})
mod.fit(X_train,Y_train.as_matrix().ravel())
x = mod.predict_proba(X_test)
skm.log_loss(Y_test,x)

2.3814574630399585

In [19]:
mod.feature_importances_

array([ 0.0725765 ,  0.11487591,  0.16964113,  0.22285616,  0.07273628,
        0.02763808,  0.07120072,  0.03180305,  0.02316373,  0.03414093,
        0.09195614,  0.06741137])

In [48]:
act = pd.DataFrame(Y_test)
act = pd.get_dummies(act[0])
pred = pd.DataFrame(x,columns=act.columns)

In [49]:
dic = {}
lss = 0
for col in act.columns:
    loss = skm.log_loss(act[col],pred[col].as_matrix())
    dic[col] = loss
    lss = lss+loss
    
sorted_dic = sorted(dic.items(), key=operator.itemgetter(1))
sorted_dic

[('ASSAULT', 0.28524562792597624),
 ('NON-CRIMINAL', 0.31867741575960212),
 ('OTHER OFFENSES', 0.39237743958892879),
 ('LARCENY/THEFT', 0.43953792109229567),
 ('NON TOP FOUR', 0.66121315832486383)]

In [59]:
l = "HA/HA"
b = re.search("k",l)

In [None]:
mod = RandomForestClassifier(n_estimators=105,min_samples_leaf=25,max_depth=15,n_jobs=-1)
mod.fit(origtraindf[["Corner","Address_factor","X_fit","Y_fit","rad","DOW","year",\
"hour","PD_factor"]],\
        origtraindf[["Category"]].as_matrix().ravel())
x = mod.predict_proba(origtestdf[["Corner","Address_factor","X_fit","Y_fit","rad","DOW","year",\
"hour","PD_factor"]])

In [29]:
mod = RandomForestClassifier(n_estimators=200,min_samples_leaf=25,max_depth=15,n_jobs=-1)
mod.fit(origtraindf[["Corner","Address_factor","X_fit","Y_fit","rad","DOW","year",\
"hour","PD_factor"]],\
        origtraindf[["Category"]].as_matrix().ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=25, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
start = time.time()
tester = pd.DataFrame()
rounds = np.ceil(((origtestdf.index.max()+1)/25000))
for i in range(int(rounds)):
    minn = i * 25000
    maxx = 25000 + minn
    x = pd.DataFrame(mod.predict_proba(origtestdf[["Corner","Address_factor","X_fit"\
        ,"Y_fit","rad","DOW","year","hour","PD_factor"]].loc[(\
                    origtestdf.index>=minn) & (origtestdf.index<maxx)]))
    tester = tester.append(x)
end = time.time()
print(end - start)

59.44557595252991


In [31]:
tester.reset_index(inplace=True,drop=True)
tester.columns = mod.classes_
tester = tester.merge(origtestdf[["Id"]],how='left',left_index=True,right_index=True)
tester = tester.fillna(0)
tester["Id"] = tester["Id"].astype(int)
tester["Id"] = tester["Id"].astype(str)

In [32]:
for col in tester.columns:
    if col != "Id":
        tester[col] = tester[col].round(decimals=6)

In [33]:
tester.to_csv(".\\Thirteenth_RF.csv",index=False)

In [39]:
del x["NON TOP FOUR"],x["Id"]

In [27]:
prev_data_to_add = pd.read_csv(".\Fourth_RF.csv",index=False)

In [41]:
del prev_data_to_add["LARCENY/THEFT"],prev_data_to_add["ASSAULT"],prev_data_to_add["NON-CRIMINAL"],prev_data_to_add["OTHER OFFENSES"],

In [43]:
prev_data_to_add = prev_data_to_add.merge(x,left_index=True,right_index=True)

In [45]:
prev_data_to_add.to_csv(".\Top_Four_Isolated_First.csv")