In [457]:
# This will eventually be moved out of an IPython notebook and into a .py file, so don't worry about making it pretty.
# Any data exploration done here will be transferred to the data exploration file.

In [458]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from xgboost.sklearn import XGBClassifier

# enables inline plots
%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.precision', 3)

# Directory holding the train/test CSVs. Set the SF_CRIME_DATA_DIR environment
# variable to point somewhere else instead of editing the notebook; the
# fallback is the location this notebook was originally run from.
DATA_DIR = os.environ.get('SF_CRIME_DATA_DIR', '/Users/eloiserosen/Downloads')

df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

#df = pd.read_csv('/Users/eloiserosen/Downloads/train_small_10k.csv')
#df_test = pd.read_csv('/Users/eloiserosen/Downloads/test_small_10k.csv')
#del df['Unnamed: 0']
#del df_test['Unnamed: 0']

In [459]:
df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.426,37.775
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.426,37.775
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424,37.8
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.427,37.801
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.439,37.772


In [460]:
df_test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.4,37.735
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.392,37.732
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426,37.792
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437,37.721
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437,37.721


In [461]:
# code to generate small versions of files

'''
df_small = df.tail(10000)
df_small.to_csv('train_small_10k.csv')

df_test_small = df_test.tail(10000)
df_test_small.to_csv('test_small_10k.csv')
'''


"\ndf_small = df.tail(10000)\ndf_small.to_csv('train_small_10k.csv')\n\ndf_test_small = df_test.tail(10000)\ndf_test_small.to_csv('test_small_10k.csv')\n"

In [462]:
def clean_data(df):
    """Strip unused columns from a crime frame and add time/address features.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Dates' column that ``pd.to_datetime`` can parse and an
        'Address' column of strings. 'Descript' and 'Resolution' are removed
        when present.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, without 'Descript', 'Resolution', 'Dates' and
        'Address', and with these columns appended in order: 'Year', 'Month',
        'Week' (ISO week number), 'Day', 'Hour', 'Time' (minutes since
        midnight) and 'StreetCorner' (1 when the address contains '/', else 0).
    """
    # Drop columns we don't need; a frame without them is accepted as-is.
    for unused in ('Descript', 'Resolution'):
        if unused in df.columns:
            del df[unused]

    # Create columns based on the timestamp.
    date_time = pd.to_datetime(df['Dates'])
    year = date_time.dt.year
    hour = date_time.dt.hour

    # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0. Use
    # isocalendar() where it exists and fall back to .week on older pandas.
    # isocalendar() yields an unsigned nullable dtype, so cast to the dtype of
    # the other date parts to keep the derived columns consistent.
    if hasattr(date_time.dt, 'isocalendar'):
        week = date_time.dt.isocalendar().week.astype(year.dtype)
    else:
        week = date_time.dt.week

    df['Year'] = year
    df['Month'] = date_time.dt.month
    df['Week'] = week
    df['Day'] = date_time.dt.day
    df['Hour'] = hour
    df['Time'] = hour * 60 + date_time.dt.minute  # minutes since midnight
    del df['Dates']

    # Flag addresses written as "STREET A / STREET B" (an intersection) as
    # opposed to "NNN Block of STREET". '/' is matched literally and a missing
    # address counts as "not a corner" instead of raising during the int cast.
    df['StreetCorner'] = df['Address'].str.contains('/', regex=False, na=False).astype(int)

    # Drop the remaining address info for now.
    del df['Address']

    return df
    

    

In [463]:
df = clean_data(df)
df.head()


Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,Year,Month,Week,Day,Hour,Time,StreetCorner
0,WARRANTS,Wednesday,NORTHERN,-122.426,37.775,2015,5,20,13,23,1433,1
1,OTHER OFFENSES,Wednesday,NORTHERN,-122.426,37.775,2015,5,20,13,23,1433,1
2,OTHER OFFENSES,Wednesday,NORTHERN,-122.424,37.8,2015,5,20,13,23,1413,1
3,LARCENY/THEFT,Wednesday,NORTHERN,-122.427,37.801,2015,5,20,13,23,1410,0
4,LARCENY/THEFT,Wednesday,PARK,-122.439,37.772,2015,5,20,13,23,1410,0


In [464]:
# Number of distinct values in the 'Category' column.
number_categories = df.Category.nunique()
# Call form of print: works on Python 2 and 3 (the statement form is a
# SyntaxError on Python 3).
print(number_categories)

39


In [465]:
df_test = clean_data(df_test)
df_test.head()

Unnamed: 0,Id,DayOfWeek,PdDistrict,X,Y,Year,Month,Week,Day,Hour,Time,StreetCorner
0,0,Sunday,BAYVIEW,-122.4,37.735,2015,5,19,10,23,1439,0
1,1,Sunday,BAYVIEW,-122.392,37.732,2015,5,19,10,23,1431,1
2,2,Sunday,NORTHERN,-122.426,37.792,2015,5,19,10,23,1430,0
3,3,Sunday,INGLESIDE,-122.437,37.721,2015,5,19,10,23,1425,0
4,4,Sunday,INGLESIDE,-122.437,37.721,2015,5,19,10,23,1425,0


In [466]:
# TODO: move this dummy encoding into clean_data; it did not work when I tried it there.

# Set up dummies. One helper is applied to both the training and the test
# frame so the two are always encoded by exactly the same steps.
def add_dummies(frame, column, prefix, baseline):
    """Replace a categorical column with indicator columns, omitting one level.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : str
        Name of the categorical column to encode.
    prefix : str
        Prefix passed to ``pd.get_dummies`` for the indicator column names.
    baseline : str
        Name of the indicator column to leave out (the reference level).
        Raises KeyError if that level does not occur in ``frame[column]``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column`` and with the remaining indicator
        columns joined on the right.
    """
    dummies = pd.get_dummies(frame[column], prefix=prefix)
    # One level is dropped as the baseline (presumably to avoid a redundant,
    # perfectly collinear indicator).
    del dummies[baseline]
    return frame.drop(column, axis=1).join(dummies)


df = add_dummies(df, 'DayOfWeek', 'Day', 'Day_Friday')
df = add_dummies(df, 'PdDistrict', 'District', 'District_SOUTHERN')

df_test = add_dummies(df_test, 'DayOfWeek', 'Day', 'Day_Friday')
df_test = add_dummies(df_test, 'PdDistrict', 'District', 'District_SOUTHERN')

In [467]:
df.head()

Unnamed: 0,Category,X,Y,Year,Month,Week,Day,Hour,Time,StreetCorner,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,District_BAYVIEW,District_CENTRAL,District_INGLESIDE,District_MISSION,District_NORTHERN,District_PARK,District_RICHMOND,District_TARAVAL,District_TENDERLOIN
0,WARRANTS,-122.426,37.775,2015,5,20,13,23,1433,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
1,OTHER OFFENSES,-122.426,37.775,2015,5,20,13,23,1433,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2,OTHER OFFENSES,-122.424,37.8,2015,5,20,13,23,1413,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
3,LARCENY/THEFT,-122.427,37.801,2015,5,20,13,23,1410,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
4,LARCENY/THEFT,-122.439,37.772,2015,5,20,13,23,1410,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [468]:
df_test.head()

Unnamed: 0,Id,X,Y,Year,Month,Week,Day,Hour,Time,StreetCorner,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,District_BAYVIEW,District_CENTRAL,District_INGLESIDE,District_MISSION,District_NORTHERN,District_PARK,District_RICHMOND,District_TARAVAL,District_TENDERLOIN
0,0,-122.4,37.735,2015,5,19,10,23,1439,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
1,1,-122.392,37.732,2015,5,19,10,23,1431,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,2,-122.426,37.792,2015,5,19,10,23,1430,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
3,3,-122.437,37.721,2015,5,19,10,23,1425,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
4,4,-122.437,37.721,2015,5,19,10,23,1425,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [469]:
# As noted in the data exploration file, some longitude (X) and latitude (Y)
# values are obviously incorrect. Mark them as missing, then impute them with
# the median.

# Fill incorrect values with NaN. The result is assigned back to the column:
# df['X'].replace(..., inplace=True) operates on a temporary column object and
# does not update df when pandas Copy-on-Write is active.
df['X'] = df['X'].replace(-120.5, np.nan)
df['Y'] = df['Y'].replace(90, np.nan)

# Find the medians for median imputation. Save the values so they can be
# reused for the test file.
medianX = df['X'].median()
medianY = df['Y'].median()

# Median imputation.
df['X'] = df['X'].fillna(medianX)
df['Y'] = df['Y'].fillna(medianY)

In [470]:
# Median imputation in the test file, using the medians computed on the
# training data.

# Fill incorrect values with NaN. The result is assigned back to the column:
# df_test['X'].replace(..., inplace=True) operates on a temporary column object
# and does not update df_test when pandas Copy-on-Write is active.
df_test['X'] = df_test['X'].replace(-120.5, np.nan)
df_test['Y'] = df_test['Y'].replace(90, np.nan)

# Median imputation.
df_test['X'] = df_test['X'].fillna(medianX)
df_test['Y'] = df_test['Y'].fillna(medianY)

## Target Vector and Feature Matrix

In [471]:
#target vector y
y = df['Category']
y.head()

0          WARRANTS
1    OTHER OFFENSES
2    OTHER OFFENSES
3     LARCENY/THEFT
4     LARCENY/THEFT
Name: Category, dtype: object

In [472]:
# Feature matrix: every column except the target. drop() returns a new frame,
# whereas `X = df; del X['Category']` would also delete the column from df
# (both names refer to the same object) and make the target cell un-rerunnable.
X = df.drop('Category', axis=1)
X.head()

Unnamed: 0,X,Y,Year,Month,Week,Day,Hour,Time,StreetCorner,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,District_BAYVIEW,District_CENTRAL,District_INGLESIDE,District_MISSION,District_NORTHERN,District_PARK,District_RICHMOND,District_TARAVAL,District_TENDERLOIN
0,-122.426,37.775,2015,5,20,13,23,1433,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
1,-122.426,37.775,2015,5,20,13,23,1433,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2,-122.424,37.8,2015,5,20,13,23,1413,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
3,-122.427,37.801,2015,5,20,13,23,1410,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
4,-122.439,37.772,2015,5,20,13,23,1410,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [473]:
# Scale the continuous features to zero mean and unit variance.
scaler = StandardScaler(copy=True)

# Columns treated as continuous.
# NOTE(review): 'Month' is not in this list, so it is left unscaled - confirm
# that this is intended.
ContinuousCols = ['X', 'Y', 'Year', 'Week', 'Day', 'Hour', 'Time']
X_continuous = X[ContinuousCols]

# Fit the scaler on the training data and transform it. The scaler returns a
# bare array, so X's index is re-attached explicitly: concat(axis=1) aligns
# rows by index label, and a default 0..n-1 index would misalign rows (and
# introduce NaNs) whenever X is indexed differently.
X_continuous = pd.DataFrame(
    scaler.fit(X_continuous).transform(X_continuous),
    columns=ContinuousCols,
    index=X.index,
)

# Swap the unscaled columns for the scaled ones (scaled columns come first).
X = pd.concat([X_continuous, X.drop(ContinuousCols, axis=1)], axis=1)
X.head()

Unnamed: 0,X,Y,Year,Week,Day,Hour,Time,Month,StreetCorner,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,District_BAYVIEW,District_CENTRAL,District_INGLESIDE,District_MISSION,District_NORTHERN,District_PARK,District_RICHMOND,District_TARAVAL,District_TENDERLOIN
0,-0.124,0.313,1.732,-0.426,-0.293,1.464,1.545,5,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
1,-0.124,0.313,1.732,-0.426,-0.293,1.464,1.545,5,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2,-0.063,1.381,1.732,-0.426,-0.293,1.464,1.494,5,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
3,-0.167,1.4,1.732,-0.426,-0.293,1.464,1.486,5,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
4,-0.632,0.186,1.732,-0.426,-0.293,1.464,1.486,5,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [474]:
# Scale the test data as well, reusing the scaler object that was fitted on
# the training data so both files share the same transformation.

kaggle_X = df_test

# The scaler returns a bare array, so kaggle_X's index is re-attached
# explicitly: concat(axis=1) aligns rows by index label, and a default 0..n-1
# index would misalign rows whenever kaggle_X is indexed differently.
kaggle_X_continuous = pd.DataFrame(
    scaler.transform(kaggle_X[ContinuousCols]),
    columns=ContinuousCols,
    index=kaggle_X.index,
)

# Swap the unscaled columns for the scaled ones (scaled columns come first).
kaggle_X = pd.concat([kaggle_X_continuous, kaggle_X.drop(ContinuousCols, axis=1)], axis=1)
kaggle_X.head()

Unnamed: 0,X,Y,Year,Week,Day,Hour,Time,Id,Month,StreetCorner,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,District_BAYVIEW,District_CENTRAL,District_INGLESIDE,District_MISSION,District_NORTHERN,District_PARK,District_RICHMOND,District_TARAVAL,District_TENDERLOIN
0,0.917,-1.324,1.732,-0.493,-0.634,1.464,1.56,0,5,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
1,1.236,-1.432,1.732,-0.493,-0.634,1.464,1.54,1,5,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,-0.128,1.042,1.732,-0.493,-0.634,1.464,1.537,2,5,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
3,-0.579,-1.888,1.732,-0.493,-0.634,1.464,1.525,3,5,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
4,-0.579,-1.888,1.732,-0.493,-0.634,1.464,1.525,4,5,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [341]:
# The classifier should not be given the row identifiers: take the 'Id' column
# out of the feature matrix and hold on to it for the submission file.
ids = kaggle_X.pop('Id')


# Fit on the training features/target, then predict class probabilities for
# the test features.
xgb = XGBClassifier(
    objective='multi:softprob',
    max_depth=6,
    learning_rate=1.0,
    max_delta_step=1,
    seed=0,
)
xgb.fit(X, y)

# One column per class, labelled with the classifier's class names.
predictions = pd.DataFrame(
    xgb.predict_proba(kaggle_X),
    columns=xgb.classes_,
)

# grid search below
'''
xgb = XGBClassifier()

from sklearn.grid_search import GridSearchCV
param_grid = {'max_depth': np.arange(3, 12)}
grid = GridSearchCV(xgb, param_grid, n_jobs=4)
grid.fit(X, y)
print grid.grid_scores_
print grid.best_score_
print grid.best_estimator_
print grid.best_params_
'''

"\nxgb = XGBClassifier()\n\nfrom sklearn.grid_search import GridSearchCV\nparam_grid = {'max_depth': np.arange(3, 12)}\ngrid = GridSearchCV(xgb, param_grid, n_jobs=4)\ngrid.fit(X, y)\nprint grid.grid_scores_\nprint grid.best_score_\nprint grid.best_estimator_\nprint grid.best_params_\n"

In [342]:
predictions.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0.000586,0.007,1.307e-06,2.111e-05,0.08178,5.6e-05,0.01108,0.002,1.523e-05,1.718e-05,1.999e-06,6.165e-06,9.761e-06,0.003,1.257e-06,2.406e-05,0.165,1.753e-05,3.68e-08,0.032,0.014,0.155,2.13e-10,4.473e-06,7.275e-08,0.0006502,2.115e-06,0.0005822,0.0001405,1.185e-06,0.0015,1.239e-05,0.013,1.906e-08,0.0002257,0.143,0.367,0.003,0.0001433
1,0.0005181,0.013,5.403e-08,2.743e-07,0.0002353,2.085e-05,0.000913,0.025,5.884e-05,1.183e-07,3.257e-08,2.563e-08,7.33e-05,0.007,2.521e-06,5.666e-07,0.003,2.556e-05,1.053e-06,0.005,0.013,0.82,3.889e-10,9.776e-06,1.806e-07,0.005322,6.927e-08,0.0003406,5.807e-05,2.341e-08,0.01976,1.995e-05,0.027,7.687e-08,4.516e-05,0.008,0.019,0.016,0.01626
2,0.0009094,0.41,2.651e-06,2.095e-06,0.06413,0.0001097,8.214e-05,0.015,0.001013,4.224e-06,2.651e-07,6.385e-06,0.0003983,0.001,1.305e-07,0.0007844,0.198,1.973e-05,9.871e-09,0.004,0.046,0.053,4.844e-09,2.684e-05,2.496e-07,0.02476,7.362e-07,0.004389,0.002623,3.203e-07,0.02058,2.022e-06,0.018,1.429e-08,0.004686,0.08,0.015,0.013,0.02101
3,0.00107,0.064,1.038e-07,2.434e-06,0.006519,0.001349,0.0002646,0.017,0.003379,4.313e-07,2.711e-06,0.001942,0.0008553,0.005,2.58e-07,0.0005649,0.051,5.608e-05,4.388e-07,0.009,0.112,0.068,2.575e-10,1.659e-05,5.784e-07,0.1543,5.403e-05,0.004136,0.0006515,2.513e-08,0.0009194,1.804e-05,0.178,1.687e-08,0.004665,0.057,0.184,0.021,0.0513
4,0.00107,0.064,1.038e-07,2.434e-06,0.006519,0.001349,0.0002646,0.017,0.003379,4.313e-07,2.711e-06,0.001942,0.0008553,0.005,2.58e-07,0.0005649,0.051,5.608e-05,4.388e-07,0.009,0.112,0.068,2.575e-10,1.659e-05,5.784e-07,0.1543,5.403e-05,0.004136,0.0006515,2.513e-08,0.0009194,1.804e-05,0.178,1.687e-08,0.004665,0.057,0.184,0.021,0.0513


In [343]:
# Put the Id column back, in front of the class probabilities. Any 'Id' column
# already present is dropped first, so re-running this cell does not add a
# duplicate column.
predictions = pd.concat([ids, predictions.drop('Id', axis=1, errors='ignore')], axis=1)
predictions.head()

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.000586,0.007,1.307e-06,2.111e-05,0.08178,5.6e-05,0.01108,0.002,1.523e-05,1.718e-05,1.999e-06,6.165e-06,9.761e-06,0.003,1.257e-06,2.406e-05,0.165,1.753e-05,3.68e-08,0.032,0.014,0.155,2.13e-10,4.473e-06,7.275e-08,0.0006502,2.115e-06,0.0005822,0.0001405,1.185e-06,0.0015,1.239e-05,0.013,1.906e-08,0.0002257,0.143,0.367,0.003,0.0001433
1,1,0.0005181,0.013,5.403e-08,2.743e-07,0.0002353,2.085e-05,0.000913,0.025,5.884e-05,1.183e-07,3.257e-08,2.563e-08,7.33e-05,0.007,2.521e-06,5.666e-07,0.003,2.556e-05,1.053e-06,0.005,0.013,0.82,3.889e-10,9.776e-06,1.806e-07,0.005322,6.927e-08,0.0003406,5.807e-05,2.341e-08,0.01976,1.995e-05,0.027,7.687e-08,4.516e-05,0.008,0.019,0.016,0.01626
2,2,0.0009094,0.41,2.651e-06,2.095e-06,0.06413,0.0001097,8.214e-05,0.015,0.001013,4.224e-06,2.651e-07,6.385e-06,0.0003983,0.001,1.305e-07,0.0007844,0.198,1.973e-05,9.871e-09,0.004,0.046,0.053,4.844e-09,2.684e-05,2.496e-07,0.02476,7.362e-07,0.004389,0.002623,3.203e-07,0.02058,2.022e-06,0.018,1.429e-08,0.004686,0.08,0.015,0.013,0.02101
3,3,0.00107,0.064,1.038e-07,2.434e-06,0.006519,0.001349,0.0002646,0.017,0.003379,4.313e-07,2.711e-06,0.001942,0.0008553,0.005,2.58e-07,0.0005649,0.051,5.608e-05,4.388e-07,0.009,0.112,0.068,2.575e-10,1.659e-05,5.784e-07,0.1543,5.403e-05,0.004136,0.0006515,2.513e-08,0.0009194,1.804e-05,0.178,1.687e-08,0.004665,0.057,0.184,0.021,0.0513
4,4,0.00107,0.064,1.038e-07,2.434e-06,0.006519,0.001349,0.0002646,0.017,0.003379,4.313e-07,2.711e-06,0.001942,0.0008553,0.005,2.58e-07,0.0005649,0.051,5.608e-05,4.388e-07,0.009,0.112,0.068,2.575e-10,1.659e-05,5.784e-07,0.1543,5.403e-05,0.004136,0.0006515,2.513e-08,0.0009194,1.804e-05,0.178,1.687e-08,0.004665,0.057,0.184,0.021,0.0513


In [344]:
# Write the predictions to the submission file; index=False keeps the row
# index out of the CSV.
predictions.to_csv('submission9.csv',index=False)