In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
try:
    # Current scikit-learn releases expose the splitter here.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old releases that only ship the original module.
    from sklearn.cross_validation import train_test_split
%matplotlib inline
# Single shared encoder: it is re-fit before every use below, so at any point
# it only remembers the most recently fitted column.
le = LabelEncoder()

The first thing I need to do is import the libraries I will use. The ones I use most are pandas and XGBoost. Pandas reads in the files and converts them into DataFrames, while XGBoost is what I use to build my prediction model.

In [None]:
SEED = 42  # fixed so the train/validation split is reproducible across runs


def _expand_dates(frame):
    """Return a copy of ``frame`` with readable coordinate names and date parts.

    ``X``/``Y`` are renamed to ``Longitude``/``Latitude`` and the ``Dates``
    column is replaced by ``month``, ``time`` and ``day`` columns.
    """
    stamps = pd.to_datetime(frame['Dates'])
    # rename() returns a new frame, so the caller's frame is left untouched.
    frame = frame.rename(columns={'X': 'Longitude', 'Y': 'Latitude'})
    frame['month'] = stamps.dt.month
    frame['time'] = stamps.dt.time
    frame['day'] = stamps.dt.day
    return frame.drop(['Dates'], axis=1)


# Submission set, indexed by the Id column of the file.
test = _expand_dates(pd.read_csv('SF_crime/test.csv', index_col='Id'))
# Independent snapshot: a plain `test_keep = test` would alias the same object
# and be overwritten when `test` is label-encoded in place later on.
test_keep = test.copy()

# Training set; Resolution and Descript are dropped and never used as features.
crime_in_sf = _expand_dates(pd.read_csv('SF_crime/train.csv'))
crime_in_sf = crime_in_sf.drop(['Resolution', 'Descript'], axis=1)

# Hold out 40% of the labelled rows for validation.
crime_train, crime_test = train_test_split(crime_in_sf, test_size=.4,
                                           random_state=SEED)

The next thing I have to do is read in all the files and make any corrections to them so they are more readable. I change some column names (X and Y) so that they are easier to read, and convert the Dates column to a datetime format so I can pull out individual months or days if I need to. I also drop two columns (Resolution and Descript) from the training data as they don't influence my predictions.

In [None]:
# Columns that are already numeric and must not be label-encoded.
NUMERIC_COLUMNS = ('Longitude', 'Latitude', 'month', 'day')

# Work on independent copies of the splits so the column assignments below
# do not trigger pandas' SettingWithCopyWarning.
crime_train = crime_train.copy()
crime_test = crime_test.copy()

for column in crime_in_sf.columns.values:
    if column in NUMERIC_COLUMNS:
        continue
    if column in test.columns:
        # Feature column: fit on the training AND submission values together so
        # a given value (e.g. an address) maps to the same integer everywhere.
        # Fitting each frame separately would give the model differently-coded
        # features at prediction time than the ones it was trained on.
        le.fit(pd.concat([crime_in_sf[column], test[column]]))
        test[column] = le.transform(test[column])
    else:
        # Column that only exists in the training data (the Category target).
        le.fit(crime_in_sf[column])
    crime_train[column] = le.transform(crime_train[column])
    crime_test[column] = le.transform(crime_test[column])

# Any non-numeric column present only in the submission set still needs codes.
for column in test.columns.values:
    if column not in NUMERIC_COLUMNS and column not in crime_in_sf.columns:
        test[column] = le.fit_transform(test[column])

Here I take every column except latitude, longitude, month and day and convert it from strings into integer codes with a LabelEncoder. The encoder can be re-fit on a column later to convert its codes back into the original labels.

In [None]:
# Pull the target column out of each split; what remains are the features.
categories, crime_train = (crime_train['Category'],
                           crime_train.drop(['Category'], axis=1))
categories2, crime_test = (crime_test['Category'],
                           crime_test.drop(['Category'], axis=1))

To properly train my model I needed to separate the crime categories (the target) from the rest of the data.

In [None]:
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# NumPy array and is available on old and new pandas alike.
dtrain = xgb.DMatrix(crime_train.values, label=categories)    # training split
dtest = xgb.DMatrix(crime_test.values, label=categories2)     # held-out split

Now that the data is separated it needs to be prepared to be put into the decision trees. First the information is converted from a pandas table into a matrix, and then the categories are attached as the label so that the program knows what it is predicting.

In [None]:
# Booster settings. 'max_depth' is written without the legacy 'bst:' prefix:
# current XGBoost does not recognise the prefixed key, so the depth setting
# would not take effect.
param = {
    'max_depth': 8,                   # maximum depth of each tree
    'objective': 'multi:softprob',    # one probability per class for every row
    'num_class': 39,                  # NOTE(review): presumably the number of crime categories - confirm against the data
    'nthread': 6,
    'eval_metric': ['mlogloss'],
}
# xgb.train() applies early stopping to the LAST dataset in this list, so the
# held-out split has to come last; with the training split last, the monitored
# loss is the training loss instead of the validation loss.
evallist = [(dtrain, 'train'), (dtest, 'eval')]
num_round = 500  # upper bound on boosting rounds

Lastly I need to tell the program how it's supposed to run and what it should use to evaluate the information. I set how deep each tree may grow (max_depth), what I want it to return (softprob), and how many categories there are (num_class).
I also set up the evaluation metric that it will report on.

In [None]:
# Train for up to `num_round` rounds, logging the metric on every dataset in
# `evallist`. Training stops early once the metric on the last dataset in
# `evallist` has failed to improve for 3 consecutive rounds.
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=3)

And here is where the program trains. As you can see the numbers are getting smaller as they go along, showing that it is getting more accurate. This will hopefully give me a better prediction.

In [None]:
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
# that works on every pandas version. output_margin=False asks for
# probabilities rather than raw margin scores.
predictions = bst.predict(xgb.DMatrix(test.values), output_margin=False)

Now that the model is trained, I convert the data I will actually predict upon into a matrix and run it through the model I just created. It returns its predictions based on all the decisions it had to make.

In [None]:
# Wrap the prediction matrix in a DataFrame that carries the test frame's index
# (the Id column of the test file), so each row stays tied to its real Id
# instead of to its position.
predictions = pd.DataFrame(predictions, index=test.index)

I then put those predictions back into a DataFrame. I can easily use that to look over my data and see what it looks like. This is a good time to see if there are any trends or problems that may arise.

In [None]:
# Re-fit the shared encoder on the crime categories, then translate the integer
# column labels of the prediction frame back into the category names.
predictions.columns = le.fit(crime_in_sf['Category']).inverse_transform(predictions.columns)

I also relabel the columns so that they show the names of the crimes instead of simply the numbers 0-38, so that I know which crime each prediction refers to.

In [None]:
# Preview the first rows of the training frame as loaded (before the
# crime_train / crime_test splits were label-encoded).
crime_in_sf.head()

This is what my training data looked like before it was encoded and fed into my program.

In [None]:
# Preview the first rows of the prediction frame.
predictions.head()

And this is what my predictions look like: one column per crime category, holding the predicted probability of that type of crime for each row.

In [None]:
# Expose the frame's index as an explicit 'Id' column for the submission file.
predictions['Id'] = predictions.index

def order(frame, var):
    """Return ``frame`` with the columns named in ``var`` moved to the front.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns should be reordered; it is not modified.
    var : str or iterable of column labels
        Column(s) to place first, in the given order. A single column name may
        be passed as a bare string.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``var`` columns first and all remaining columns
        after them in their original relative order.

    Raises
    ------
    KeyError
        If a label in ``var`` is not a column of ``frame``.
    """
    # Normalise to a list: a bare string would otherwise be treated as a
    # sequence of characters (substring matching and a failing list concat).
    leading = [var] if isinstance(var, str) else list(var)
    trailing = [w for w in frame.columns if w not in leading]
    return frame[leading + trailing]

# Move the 'Id' column to the front of the prediction frame.
predictions = order(predictions,['Id'])

I used a small function here to add an Id column and put it at the front of my data so that each row could be easily identified for the competition, and then I simply run the cell below to create a file which I can submit.

In [None]:
# Write the submission file; index=False leaves the DataFrame index out of the
# CSV, so the explicit 'Id' column is what identifies each row.
# NOTE(review): the destination is an absolute, machine-specific path (with a
# doubled slash) - consider a path relative to the project directory.
predictions.to_csv('/Users/MatthewBarnette/final_project_predictions//predictions_XGB_280.csv', index=False)