In [475]:
# This will eventually be moved out of an IPython notebook and into a .py file, so don't worry about making it pretty.
# Any data exploration done here will be transferred to the data exploration file.

In [476]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBClassifier

# enables inline plots
%matplotlib inline

# Display options: show every column, cap printed rows at 20, 3 decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.precision', 3)

# Directory holding the input CSVs. The default is the original location;
# set the CRIME_DATA_DIR environment variable to run on another machine
# without editing the notebook.
DATA_DIR = os.environ.get('CRIME_DATA_DIR', '/Users/eloiserosen/Downloads')

df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Smaller 10k-row samples for quick iteration. The 'Unnamed: 0' column is
# presumably the index written out when the samples were saved, hence the del.
#df = pd.read_csv(os.path.join(DATA_DIR, 'train_small_10k.csv'))
#df_test = pd.read_csv(os.path.join(DATA_DIR, 'test_small_10k.csv'))
#del df['Unnamed: 0']
#del df_test['Unnamed: 0']

In [477]:
# preview the first rows of the training data as loaded
df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.426,37.775
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.426,37.775
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424,37.8
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.427,37.801
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.439,37.772


In [478]:
# preview the first rows of the test data as loaded
df_test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.4,37.735
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.392,37.732
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426,37.792
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437,37.721
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437,37.721


In [479]:
# Code to generate small versions of the files (the last 10000 rows of each).
# It is disabled by being wrapped in a string literal: running this cell writes
# nothing and only echoes the string.

'''
df_small = df.tail(10000)
df_small.to_csv('train_small_10k.csv')

df_test_small = df_test.tail(10000)
df_test_small.to_csv('test_small_10k.csv')
'''


"\ndf_small = df.tail(10000)\ndf_small.to_csv('train_small_10k.csv')\n\ndf_test_small = df_test.tail(10000)\ndf_test_small.to_csv('test_small_10k.csv')\n"

In [480]:
def _iso_week(timestamps):
    """Return the ISO week number of each timestamp on old and new pandas alike."""
    accessor = timestamps.dt
    if hasattr(accessor, 'isocalendar'):
        # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
        # isocalendar().week is its replacement but yields the nullable UInt32
        # dtype, so convert back to the plain numpy dtype .dt.week returned.
        week = accessor.isocalendar().week
        return week.astype('float64' if week.isna().any() else 'int64')
    return accessor.week


def clean_data(df):
    """Build the base feature columns from a raw crime-report frame.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Dates`` column that ``pd.to_datetime`` can parse and
        an ``Address`` column of strings. ``Descript`` and ``Resolution`` are
        dropped when present (the test file does not have them); every other
        column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        The remaining input columns followed by ``Year``, ``Month``, ``Week``,
        ``Day``, ``Hour``, ``Minute`` and ``StreetCorner``, in that order.

    Raises
    ------
    KeyError
        If ``Dates`` or ``Address`` is missing.
    """
    # Read the two required columns first so a missing one fails loudly.
    date_time = pd.to_datetime(df['Dates'])
    address = df['Address']

    # drop() returns a new frame, so the caller's frame is left untouched.
    cleaned = df.drop(['Descript', 'Resolution', 'Dates', 'Address'],
                      axis=1, errors='ignore')

    # create columns based on timestamp
    cleaned['Year'] = date_time.dt.year
    cleaned['Month'] = date_time.dt.month
    cleaned['Week'] = _iso_week(date_time)
    cleaned['Day'] = date_time.dt.day
    cleaned['Hour'] = date_time.dt.hour
    # Some crimes are logged at a precise time; others, like some thefts, have
    # a rounded time. Subtracting 30 shifts the minute range from 0..59 to -30..29.
    cleaned['Minute'] = date_time.dt.minute - 30

    # 1 when the address is an intersection ("OAK ST / LAGUNA ST"), 0 when it
    # is a block address ("1500 Block of LOMBARD ST"). A missing address counts
    # as 0 instead of raising.
    cleaned['StreetCorner'] = (
        address.str.contains('/', regex=False, na=False).astype('int64')
    )

    return cleaned
    

    

In [481]:
# run the cleaning step on the training frame and preview the result
# NOTE(review): the saved output of this cell lists a `Time` column, while
# clean_data as written produces `Minute` - the output looks stale; re-run to refresh.
df = clean_data(df)
df.head()


Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,Year,Month,Week,Day,Hour,Time,StreetCorner
0,WARRANTS,Wednesday,NORTHERN,-122.426,37.775,2015,5,20,13,23,1433,1
1,OTHER OFFENSES,Wednesday,NORTHERN,-122.426,37.775,2015,5,20,13,23,1433,1
2,OTHER OFFENSES,Wednesday,NORTHERN,-122.424,37.8,2015,5,20,13,23,1413,1
3,LARCENY/THEFT,Wednesday,NORTHERN,-122.427,37.801,2015,5,20,13,23,1410,0
4,LARCENY/THEFT,Wednesday,PARK,-122.439,37.772,2015,5,20,13,23,1410,0


In [482]:
# number of distinct values in the Category column
number_categories = df.Category.nunique()
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3
print(number_categories)

39


In [483]:
# run the same cleaning step on the test frame and preview the result
# NOTE(review): the saved output of this cell lists a `Time` column, while
# clean_data as written produces `Minute` - the output looks stale; re-run to refresh.
df_test = clean_data(df_test)
df_test.head()

Unnamed: 0,Id,DayOfWeek,PdDistrict,X,Y,Year,Month,Week,Day,Hour,Time,StreetCorner
0,0,Sunday,BAYVIEW,-122.4,37.735,2015,5,19,10,23,1439,0
1,1,Sunday,BAYVIEW,-122.392,37.732,2015,5,19,10,23,1431,1
2,2,Sunday,NORTHERN,-122.426,37.792,2015,5,19,10,23,1430,0
3,3,Sunday,INGLESIDE,-122.437,37.721,2015,5,19,10,23,1425,0
4,4,Sunday,INGLESIDE,-122.437,37.721,2015,5,19,10,23,1425,0


In [484]:
# One-hot encode the categorical columns. This is done outside clean_data
# because the test frame has to reuse the dummy columns derived from the
# training frame.

def _one_hot(frame, column, prefix, baseline, columns=None):
    """Replace `column` of `frame` with dummy indicator columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing `column`; it is not modified.
    column : str
        Name of the categorical column to encode.
    prefix : str
        Prefix passed to ``pd.get_dummies`` for the new column names.
    baseline : str
        Name of the dummy column to leave out as the reference level.
    columns : list of str, optional
        Dummy columns the result must have, in order. Missing ones are added
        as all-zero columns and unexpected ones are discarded, so a second
        frame ends up with exactly the same features as the first.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The frame with `column` replaced by the dummies (appended at the end),
        and the names of the dummy columns that were added.
    """
    dummies = pd.get_dummies(frame[column], prefix=prefix)
    # errors='ignore': a sample that happens to lack the baseline level must not crash
    dummies = dummies.drop(baseline, axis=1, errors='ignore')
    if columns is not None:
        dummies = dummies.reindex(columns=columns, fill_value=0)
    return frame.drop(column, axis=1).join(dummies), list(dummies.columns)


# set up dummies; Friday and SOUTHERN are the omitted reference levels
for column, prefix, baseline in [('DayOfWeek', 'Day', 'Day_Friday'),
                                 ('PdDistrict', 'District', 'District_SOUTHERN')]:
    df, train_dummy_columns = _one_hot(df, column, prefix, baseline)
    # the test frame gets exactly the dummy columns seen in training
    df_test = _one_hot(df_test, column, prefix, baseline,
                       columns=train_dummy_columns)[0]

In [485]:
# check the training frame after one-hot encoding
df.head()

Unnamed: 0,Category,X,Y,Year,Month,Week,Day,Hour,Time,StreetCorner,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,District_BAYVIEW,District_CENTRAL,District_INGLESIDE,District_MISSION,District_NORTHERN,District_PARK,District_RICHMOND,District_TARAVAL,District_TENDERLOIN
0,WARRANTS,-122.426,37.775,2015,5,20,13,23,1433,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
1,OTHER OFFENSES,-122.426,37.775,2015,5,20,13,23,1433,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2,OTHER OFFENSES,-122.424,37.8,2015,5,20,13,23,1413,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
3,LARCENY/THEFT,-122.427,37.801,2015,5,20,13,23,1410,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
4,LARCENY/THEFT,-122.439,37.772,2015,5,20,13,23,1410,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [486]:
# check the test frame after one-hot encoding
df_test.head()

Unnamed: 0,Id,X,Y,Year,Month,Week,Day,Hour,Time,StreetCorner,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,District_BAYVIEW,District_CENTRAL,District_INGLESIDE,District_MISSION,District_NORTHERN,District_PARK,District_RICHMOND,District_TARAVAL,District_TENDERLOIN
0,0,-122.4,37.735,2015,5,19,10,23,1439,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
1,1,-122.392,37.732,2015,5,19,10,23,1431,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,2,-122.426,37.792,2015,5,19,10,23,1430,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
3,3,-122.437,37.721,2015,5,19,10,23,1425,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
4,4,-122.437,37.721,2015,5,19,10,23,1425,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [487]:
# As noted in the data exploration file, some longitude (X) and latitude (Y)
# values are obviously incorrect. Impute these with the median.

# Fill incorrect values (X == -120.5, Y == 90) with NaN.
# The result is assigned back rather than using replace(..., inplace=True) on
# df['X']: an in-place call on a column selection is not guaranteed to write
# through to df (it does not under pandas Copy-on-Write).
df['X'] = df['X'].replace(-120.5, np.nan)
df['Y'] = df['Y'].replace(90, np.nan)

# find median for median imputation. Save values so they can be reused for the test file.
medianX = df['X'].median()
medianY = df['Y'].median()

# median imputation
df['X'] = df['X'].fillna(medianX)
df['Y'] = df['Y'].fillna(medianY)

In [488]:
# Median imputation in the test file, using the medians computed on the
# training data (medianX / medianY).

# Fill incorrect values (X == -120.5, Y == 90) with NaN.
# The result is assigned back rather than using replace(..., inplace=True) on
# df_test['X']: an in-place call on a column selection is not guaranteed to
# write through to df_test (it does not under pandas Copy-on-Write).
df_test['X'] = df_test['X'].replace(-120.5, np.nan)
df_test['Y'] = df_test['Y'].replace(90, np.nan)

# median imputation
df_test['X'] = df_test['X'].fillna(medianX)
df_test['Y'] = df_test['Y'].fillna(medianY)

## Target Vector and Feature Matrix

In [489]:
# target vector y: the Category label of each training row
y = df['Category']
y.head()

0          WARRANTS
1    OTHER OFFENSES
2    OTHER OFFENSES
3     LARCENY/THEFT
4     LARCENY/THEFT
Name: Category, dtype: object

In [490]:
# Matrix of X's: every column except the target.
# drop() returns a new frame. The previous `X = df; del X['Category']` only
# aliased df, so it also removed Category from df and made this cell (and the
# target-vector cell) fail when re-run.
X = df.drop('Category', axis=1)
X.head()

Unnamed: 0,X,Y,Year,Month,Week,Day,Hour,Time,StreetCorner,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,District_BAYVIEW,District_CENTRAL,District_INGLESIDE,District_MISSION,District_NORTHERN,District_PARK,District_RICHMOND,District_TARAVAL,District_TENDERLOIN
0,-122.426,37.775,2015,5,20,13,23,1433,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
1,-122.426,37.775,2015,5,20,13,23,1433,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2,-122.424,37.8,2015,5,20,13,23,1413,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
3,-122.427,37.801,2015,5,20,13,23,1410,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
4,-122.439,37.772,2015,5,20,13,23,1410,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [491]:
# scale data with zero mean and unit variance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(copy=True)
# make a df called X_continuous that has just our continuous features
# NOTE(review): 'Month' is not in this list, so it stays unscaled - confirm that is intended.
ContinuousCols = ['X','Y', 'Year', 'Week', 'Day', 'Hour', 'Minute']
X_continuous = X[ContinuousCols]

# fit the scaler on the training data and scale to zero mean and unit variance.
# The scaler returns a bare array, so X's index is re-attached explicitly:
# the concat below aligns on index, and a default 0..n-1 index would misalign
# rows (and introduce NaNs) whenever X is not indexed 0..n-1.
X_continuous = pd.DataFrame(scaler.fit_transform(X_continuous),
                            columns=ContinuousCols, index=X.index)

# delete unscaled cols from original X df
X = X.drop(ContinuousCols, axis=1)

# merge: scaled continuous columns first, then the remaining columns
X = pd.concat([X_continuous, X], axis=1)
X.head()

Unnamed: 0,X,Y,Year,Week,Day,Hour,Time,Month,StreetCorner,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,District_BAYVIEW,District_CENTRAL,District_INGLESIDE,District_MISSION,District_NORTHERN,District_PARK,District_RICHMOND,District_TARAVAL,District_TENDERLOIN
0,-0.124,0.313,1.732,-0.426,-0.293,1.464,1.545,5,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
1,-0.124,0.313,1.732,-0.426,-0.293,1.464,1.545,5,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2,-0.063,1.381,1.732,-0.426,-0.293,1.464,1.494,5,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
3,-0.167,1.4,1.732,-0.426,-0.293,1.464,1.486,5,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
4,-0.632,0.186,1.732,-0.426,-0.293,1.464,1.486,5,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [492]:
# scale test data with zero mean and unit variance as well. Use the same scaler
# object that was fitted on the training data (transform only, no re-fit).

kaggle_X = df_test

# make a df called kaggle_X_continuous that has just our continuous features
kaggle_X_continuous = kaggle_X[ContinuousCols]
# scale to zero mean and unit variance.
# The scaler returns a bare array, so kaggle_X's index is re-attached
# explicitly: the concat below aligns on index, and a default 0..n-1 index
# would misalign rows whenever kaggle_X is not indexed 0..n-1.
kaggle_X_continuous = pd.DataFrame(scaler.transform(kaggle_X_continuous),
                                   columns=ContinuousCols, index=kaggle_X.index)
# delete unscaled cols from original kaggle_X df (drop returns a new frame,
# so df_test itself is left as it was)
kaggle_X = kaggle_X.drop(ContinuousCols, axis=1)

# merge: scaled continuous columns first, then the remaining columns
kaggle_X = pd.concat([kaggle_X_continuous, kaggle_X], axis=1)
kaggle_X.head()

Unnamed: 0,X,Y,Year,Week,Day,Hour,Time,Id,Month,StreetCorner,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,District_BAYVIEW,District_CENTRAL,District_INGLESIDE,District_MISSION,District_NORTHERN,District_PARK,District_RICHMOND,District_TARAVAL,District_TENDERLOIN
0,0.917,-1.324,1.732,-0.493,-0.634,1.464,1.56,0,5,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
1,1.236,-1.432,1.732,-0.493,-0.634,1.464,1.54,1,5,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,-0.128,1.042,1.732,-0.493,-0.634,1.464,1.537,2,5,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
3,-0.579,-1.888,1.732,-0.493,-0.634,1.464,1.525,3,5,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
4,-0.579,-1.888,1.732,-0.493,-0.634,1.464,1.525,4,5,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [493]:
# Take the Id column out of the feature frame so the classifier never sees it;
# it is kept in `ids` and re-attached to the predictions later.
ids = kaggle_X.pop('Id')


# NOTE(review): learning_rate = 1.0 is a large step size for boosting - confirm
# it is intentional.
xgb = XGBClassifier(
    objective='multi:softprob',
    max_depth=6,
    learning_rate=1.0,
    max_delta_step=1,
    seed=0,
)
xgb.fit(X, y)
# one probability column per class, labelled with the class names
predictions = pd.DataFrame(xgb.predict_proba(kaggle_X), columns=xgb.classes_)

# grid search below (disabled: kept inside a string literal)
'''
xgb = XGBClassifier()

from sklearn.grid_search import GridSearchCV
param_grid = {'max_depth': np.arange(3, 12)}
grid = GridSearchCV(xgb, param_grid, n_jobs=4)
grid.fit(X, y)
print grid.grid_scores_
print grid.best_score_
print grid.best_estimator_
print grid.best_params_
'''

"\nxgb = XGBClassifier()\n\nfrom sklearn.grid_search import GridSearchCV\nparam_grid = {'max_depth': np.arange(3, 12)}\ngrid = GridSearchCV(xgb, param_grid, n_jobs=4)\ngrid.fit(X, y)\nprint grid.grid_scores_\nprint grid.best_score_\nprint grid.best_estimator_\nprint grid.best_params_\n"

In [494]:
# preview the per-class probabilities predicted for the first test rows
predictions.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,7.672e-05,0.029,1.233e-06,1.332e-05,0.04882,0.001634,0.0006496,0.003,0.0001062,0.0004015,3.625e-07,1.232e-05,1.749e-05,0.001,1.386e-07,5.026e-05,0.272,3.866e-05,1.967e-07,0.004,0.014,0.142,4.016e-11,1.496e-06,4.133e-07,0.006,6.315e-05,0.001261,0.0003216,2e-05,0.01,3.775e-05,0.017,3.471e-08,0.0004833,0.093,0.343,0.011,0.001
1,0.000212,0.032,5.737e-07,4.942e-06,0.0009729,0.002152,0.003229,0.082,8.825e-05,0.0001645,4.132e-07,4.903e-07,0.0001397,0.005,1.898e-06,2e-05,0.012,0.0003482,7.655e-07,0.002,0.028,0.697,2.966e-11,3.678e-06,2.043e-06,0.021,6.262e-07,0.0003876,0.000148,4.177e-06,0.002,7.073e-05,0.029,1.377e-07,5.213e-05,0.007,0.016,0.047,0.012
2,0.006842,0.258,5.88e-07,3.578e-07,0.0456,0.0003338,0.0002209,0.018,0.002121,6.991e-07,4.992e-07,3.063e-07,4.269e-05,0.007,9.779e-07,0.003115,0.17,5.552e-05,7.904e-08,0.014,0.064,0.076,3.726e-10,8.438e-06,6.821e-08,0.033,2.239e-06,0.01387,0.01419,2.383e-08,0.003,8.365e-06,0.053,2.91e-09,0.005392,0.112,0.032,0.038,0.03
3,0.00678,0.088,7.714e-08,3.492e-06,0.02256,0.003158,0.0007456,0.074,0.02317,3.99e-06,4.009e-06,6.122e-05,0.0003078,0.003,3.632e-06,0.004053,0.119,0.002345,4.006e-07,0.021,0.093,0.069,4.139e-10,4.111e-05,6.914e-06,0.104,0.00024,0.0107,0.005638,1.043e-06,0.003,1.698e-06,0.062,2.5e-07,0.004884,0.086,0.148,0.031,0.014
4,0.00678,0.088,7.714e-08,3.492e-06,0.02256,0.003158,0.0007456,0.074,0.02317,3.99e-06,4.009e-06,6.122e-05,0.0003078,0.003,3.632e-06,0.004053,0.119,0.002345,4.006e-07,0.021,0.093,0.069,4.139e-10,4.111e-05,6.914e-06,0.104,0.00024,0.0107,0.005638,1.043e-06,0.003,1.698e-06,0.062,2.5e-07,0.004884,0.086,0.148,0.031,0.014


In [495]:
# put the id column back
# `predictions` was built from a bare array, so it carries a default 0..n-1
# index and its rows correspond to `ids` by position. Resetting the index of
# `ids` makes the concat pair rows positionally instead of relying on both
# indexes happening to match.
predictions = pd.concat([ids.reset_index(drop=True), predictions], axis=1)
predictions.head()

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,7.672e-05,0.029,1.233e-06,1.332e-05,0.04882,0.001634,0.0006496,0.003,0.0001062,0.0004015,3.625e-07,1.232e-05,1.749e-05,0.001,1.386e-07,5.026e-05,0.272,3.866e-05,1.967e-07,0.004,0.014,0.142,4.016e-11,1.496e-06,4.133e-07,0.006,6.315e-05,0.001261,0.0003216,2e-05,0.01,3.775e-05,0.017,3.471e-08,0.0004833,0.093,0.343,0.011,0.001
1,1,0.000212,0.032,5.737e-07,4.942e-06,0.0009729,0.002152,0.003229,0.082,8.825e-05,0.0001645,4.132e-07,4.903e-07,0.0001397,0.005,1.898e-06,2e-05,0.012,0.0003482,7.655e-07,0.002,0.028,0.697,2.966e-11,3.678e-06,2.043e-06,0.021,6.262e-07,0.0003876,0.000148,4.177e-06,0.002,7.073e-05,0.029,1.377e-07,5.213e-05,0.007,0.016,0.047,0.012
2,2,0.006842,0.258,5.88e-07,3.578e-07,0.0456,0.0003338,0.0002209,0.018,0.002121,6.991e-07,4.992e-07,3.063e-07,4.269e-05,0.007,9.779e-07,0.003115,0.17,5.552e-05,7.904e-08,0.014,0.064,0.076,3.726e-10,8.438e-06,6.821e-08,0.033,2.239e-06,0.01387,0.01419,2.383e-08,0.003,8.365e-06,0.053,2.91e-09,0.005392,0.112,0.032,0.038,0.03
3,3,0.00678,0.088,7.714e-08,3.492e-06,0.02256,0.003158,0.0007456,0.074,0.02317,3.99e-06,4.009e-06,6.122e-05,0.0003078,0.003,3.632e-06,0.004053,0.119,0.002345,4.006e-07,0.021,0.093,0.069,4.139e-10,4.111e-05,6.914e-06,0.104,0.00024,0.0107,0.005638,1.043e-06,0.003,1.698e-06,0.062,2.5e-07,0.004884,0.086,0.148,0.031,0.014
4,4,0.00678,0.088,7.714e-08,3.492e-06,0.02256,0.003158,0.0007456,0.074,0.02317,3.99e-06,4.009e-06,6.122e-05,0.0003078,0.003,3.632e-06,0.004053,0.119,0.002345,4.006e-07,0.021,0.093,0.069,4.139e-10,4.111e-05,6.914e-06,0.104,0.00024,0.0107,0.005638,1.043e-06,0.003,1.698e-06,0.062,2.5e-07,0.004884,0.086,0.148,0.031,0.014


In [496]:
# write the Id column plus one probability column per class to CSV, without the row index
predictions.to_csv('submission11.csv',index=False)