In [18]:
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
%matplotlib inline
le = LabelEncoder()

The first thing I need to do is import the libraries I will use. The ones I use most are pandas and XGBoost. Pandas reads in the files and converts them into a dataframe, while XGBoost is what I use to write my predictions program

In [21]:
test = pd.read_csv('SF_crime/test.csv', index_col='Id')
test = test.rename(columns={'X': 'Longitude', "Y": "Latitude"})
test.Dates = pd.to_datetime(test.Dates)
test['month'] = test.Dates.dt.month
test['time'] = test.Dates.dt.time
test['day'] = test.Dates.dt.day
test['year'] = test.Dates.dt.year
test = test.drop(['Dates'], axis=1)

crime_in_sf = pd.read_csv('SF_crime/train.csv')
crime_in_sf.Dates = pd.to_datetime(crime_in_sf.Dates)
crime_in_sf = crime_in_sf.rename(columns={'X': 'Longitude', "Y": "Latitude",})
crime_in_sf = crime_in_sf.drop(['Resolution', 'Descript'], axis=1)
crime_in_sf['month'] = crime_in_sf.Dates.dt.month
crime_in_sf['time'] = crime_in_sf.Dates.dt.time
crime_in_sf['day'] = crime_in_sf.Dates.dt.day
crime_in_sf['year'] = crime_in_sf.Dates.dt.year
crime_in_sf = crime_in_sf.drop(['Dates'], axis=1)
crime_train, crime_test = train_test_split(crime_in_sf, test_size=.4)

The next thing I have to do is read in all the files and make any corrections to them so I can make them more readable. I change some column names (X and Y) so that they are easier to read and convert the Dates column to a datetime format so I can pull out individual years or days if I need too. I also drop two columns off of the training data as they don't influence my predictions.

In [23]:
for column in test.columns.values:
    if column != 'Longitude' and column != 'Latitude' and column != 'month' and column != 'day' and column != 'year':
        le.fit(test[column])
        test[column] = le.transform(test[column])

for column in crime_in_sf.columns.values:
    if column != 'Longitude' and column != 'Latitude' and column != 'month' and column != 'day' and column != 'year':
        le.fit(crime_in_sf[column])
        crime_train[column] = le.transform(crime_train[column])

for column in crime_in_sf.columns.values:
    if column != 'Longitude' and column != 'Latitude' and column != 'month' and column != 'day' and column != 'year':
        le.fit(crime_in_sf[column])        
        crime_test[column] = le.transform(crime_test[column])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Here I take the information, except for the latitude and longitude, and convert it from strings into integers. Each one is in a dictionary and stored so that they can be converted back later.

In [24]:
categories = crime_train.Category
crime_train = crime_train.drop('Category', axis=1)

categories2 = crime_test.Category
crime_test = crime_test.drop('Category', axis=1)

To properly train my data I needed to make the categories of crimes seperate from the rest of the data.

In [25]:
dtrain = xgb.DMatrix(crime_train.as_matrix(),
                     label=categories)
dtest = xgb.DMatrix(crime_test.as_matrix(),
                    label=categories2)

Now that the data is separated it needs to be prepared to be put into the decision tree. The first thing that needs to happen is that the information is converted from a pandas table into a matrix, and the categories need to be added in their own identifier so that the program knows what it's predicting on.

In [26]:
param = {'bst:max_depth':8, 'objective':'multi:softprob', 'num_class':39}
param['nthread'] = 6
param['eval_metric'] = ['merror', 'mlogloss']
evallist  = [(dtrain,'train'), (dtest,'eval')]
num_round = 500

Lastly I need to tell the program how it's suppose to wrong and what it should use to evaluate the information. I set how large of a tree I want (the max_depth), what I want it to return (softprob), how many categories it should be in. 
I also set up the evaluation metrics that it would run on.

In [27]:
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=3)

Will train until eval error hasn't decreased in 3 rounds.
Multiple eval metrics have been passed: 'mlogloss' will be used for early stopping.

[0]	train-merror:0.724996	train-mlogloss:3.040613	eval-merror:0.731670	eval-mlogloss:3.055890
[1]	train-merror:0.715950	train-mlogloss:2.846959	eval-merror:0.724321	eval-mlogloss:2.872441
[2]	train-merror:0.711039	train-mlogloss:2.721652	eval-merror:0.720862	eval-mlogloss:2.754747
[3]	train-merror:0.707190	train-mlogloss:2.629603	eval-merror:0.718521	eval-mlogloss:2.669783
[4]	train-merror:0.704069	train-mlogloss:2.558676	eval-merror:0.716810	eval-mlogloss:2.605549
[5]	train-merror:0.701843	train-mlogloss:2.504014	eval-merror:0.716004	eval-mlogloss:2.557234
[6]	train-merror:0.699299	train-mlogloss:2.457861	eval-merror:0.714771	eval-mlogloss:2.517760
[7]	train-merror:0.696993	train-mlogloss:2.420587	eval-merror:0.713749	eval-mlogloss:2.486826
[8]	train-merror:0.695322	train-mlogloss:2.390066	eval-merror:0.712892	eval-mlogloss:2.461550
[9]	train-m

And here is where the program trains. As you can see the numbers are getting smaller as they go along, showing that it is getting more accurate. This will hopefully give me a better prediction.

In [28]:
predictions = bst.predict(xgb.DMatrix(test.as_matrix()), output_margin=False)

Now that the model is trained I convert the data I will actually predict upon into a matrix and run it through the model I just created and it returns it's predictions based off of all the descisions it had to make

In [29]:
predictions = pd.DataFrame(predictions)

I then put those predictions back into a DataFrame. I can easily use that to look over my data and see what it looks like. This is a good time to see if there are any trends or problems that may arise.

In [30]:
le.fit(crime_in_sf.Category)
predictions.columns = le.inverse_transform(predictions.columns)

I also relabel the information so that it has what the crimes are as opposed to simply numbers from 0-38 so that I know what the crimes that it is predicting on are

In [31]:
crime_in_sf.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,Address,Longitude,Latitude,month,time,day,year
0,WARRANTS,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,5,23:53:00,13,2015
1,OTHER OFFENSES,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,5,23:53:00,13,2015
2,OTHER OFFENSES,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414,5,23:33:00,13,2015
3,LARCENY/THEFT,Wednesday,NORTHERN,1500 Block of LOMBARD ST,-122.426995,37.800873,5,23:30:00,13,2015
4,LARCENY/THEFT,Wednesday,PARK,100 Block of BRODERICK ST,-122.438738,37.771541,5,23:30:00,13,2015


This is what my data looked like when I fed it into my program

In [32]:
predictions.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0.00029,0.043021,5e-06,2.6e-05,0.022261,0.000956,0.003738,0.003035,0.000676,1.9e-05,...,1e-06,0.001426,4e-06,0.024306,9.049518e-07,0.007719,0.04112,0.51787,0.01745,0.00801
1,0.00092,0.08939,3e-06,3.5e-05,0.003622,0.002782,0.004613,0.013617,0.002355,4e-06,...,3e-06,0.004976,3e-06,0.043104,3.053812e-06,0.001181,0.020824,0.033738,0.056279,0.037096
2,0.007583,0.087087,4.9e-05,6e-06,0.052787,0.001341,0.001715,0.043414,0.002997,0.000106,...,2e-06,0.00745,2.4e-05,0.056168,4.489783e-07,0.00299,0.091619,0.024867,0.024484,0.007925
3,0.002836,0.30786,6e-06,3.6e-05,0.017551,0.001852,0.00223,0.013117,0.001102,6e-06,...,8e-06,0.005339,4e-06,0.100644,4.561365e-06,0.00768,0.044388,0.033044,0.022216,0.05413
4,0.002836,0.30786,6e-06,3.6e-05,0.017551,0.001852,0.00223,0.013117,0.001102,6e-06,...,8e-06,0.005339,4e-06,0.100644,4.561365e-06,0.00768,0.044388,0.033044,0.022216,0.05413


And this is what my predictions look like. Several categories which numbers to identify them and a probabilities of their likelyhood for each type of crime.

In [33]:
predictions['Id'] = predictions.index

def order(frame,var):
    varlist =[w for w in frame.columns if w not in var]
    frame = frame[var+varlist]
    return frame

predictions = order(predictions,['Id'])

I used a small definition here to add an ID column and put it on the front of my data so that it could be easily identified for the competition, and then I simply run the panel below and create a file which I can submit.