# Model Iteration 1
Here, I want to implement a very simple model. I'm not going to engineer many features; I'll just recode the categorical variables (such as police district) and the date and address fields into numeric columns.

## Importing Everything!

In [52]:
import shapefile
import pandas as pd
import numpy as np
import itertools
import re

import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib import cm
from datetime import datetime
from ipywidgets import widgets  
from IPython.display import display


from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.metrics import log_loss


## Loading in the data
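Now I need to load the training data and define the ordered list of crime categories. A category's position in this list is used later as its integer class code.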

In [82]:
# Raw training data, read from the working directory.
readData = pd.read_csv('train.csv')
# Ordered list of crime categories. The ORDER matters: recodeCategories()
# stores a category's position in this list as its class code, and
# make_submission() uses the same positions to name the probability columns.
Categories = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
              'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
              'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
              'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
              'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
              'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
              'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
              'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
              'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
              'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
              'WARRANTS', 'WEAPON LAWS']

## Helper functions for recoding data
Here are the helper functions for recoding data. We'll add more as we create some new features

In [83]:
def recodeData(df, isTrain = False):
    '''Recode the raw SF crime dataframe and report the feature columns added.

    The dataframe is modified in place: coordinate outliers are replaced and
    date, police-district and address features are added as new columns.
    When isTrain is True the crime category is recoded as well, and the
    'Descript' and 'Resolution' columns are dropped if they are present.

    Parameters:
        df: dataframe loaded from the crime CSV; mutated in place.
        isTrain: pass True when df carries the training-only columns
            ('Category', 'Descript', 'Resolution').

    Returns:
        A tuple (df, addedColumns): the same dataframe object that was
        passed in, and the list of column names reported by the helpers.
    '''
    # Coordinates are cleaned first but reported last, which keeps the
    # ordering of the returned column list unchanged.
    newLatLon = removeOutlierLatLon(df)
    newDate = recodeDates(df)
    newDistrict = recodePoliceDistricts(df)
    newAddress = recodeAddresses(df)

    addedColumns = newDate + newDistrict + newAddress + newLatLon

    if isTrain:
        addedColumns += recodeCategories(df)

        # Drop only the columns that still exist. This keeps a re-run on an
        # already recoded frame harmless without a bare except, which would
        # also have hidden unrelated errors.
        columnsToDrop = ['Descript', 'Resolution']
        presentColumns = [column for column in columnsToDrop
                          if column in df.columns]
        if presentColumns:
            df.drop(presentColumns, axis=1, inplace=True)
        if len(presentColumns) < len(columnsToDrop):
            print("already recoded or using test data")

    return df, addedColumns

In [84]:
def recodeDates(df):
    '''Split the 'Dates' timestamp strings into numeric date-part columns.

    Parses df['Dates'] (format '%Y-%m-%d %H:%M:%S') once, in a vectorised
    call, and adds these columns to df in place:
    'DateTime' (the parsed timestamp), 'Year', 'Month', 'Day', 'Hour',
    'Minute' and 'DayOfWeekRecode' (Monday = 0 ... Sunday = 6).

    Parameters:
        df: dataframe with a 'Dates' column of timestamp strings.

    Returns:
        The list of numeric feature columns that were added ('DateTime'
        itself is not listed because it is not a numeric feature).

    Raises:
        ValueError: if a value does not match the expected format.
    '''
    # One vectorised parse instead of a Python-level strptime call per row.
    timestamps = pd.to_datetime(df['Dates'], format='%Y-%m-%d %H:%M:%S')
    df['DateTime'] = timestamps

    # The .dt accessors compute every date part without per-row lambdas.
    df['Year'] = timestamps.dt.year
    df['Month'] = timestamps.dt.month
    df['Day'] = timestamps.dt.day
    df['Hour'] = timestamps.dt.hour
    df['Minute'] = timestamps.dt.minute
    df['DayOfWeekRecode'] = timestamps.dt.weekday

    return ['Year', 'Month', 'Day', 'Hour', 'Minute', 'DayOfWeekRecode']

In [85]:
def recodePoliceDistricts(df):
    '''One-hot encode the 'PdDistrict' column in place.

    For every distinct district value found in df, a 0/1 integer column
    named 'District<value>' is added. The names of the added columns are
    returned in order of first appearance of each district.
    '''
    districtSeries = df['PdDistrict']
    addedColumns = []
    for name in districtSeries.unique().tolist():
        columnName = 'District' + name
        addedColumns.append(columnName)
        # The element-wise comparison yields booleans; cast them to 0/1.
        df[columnName] = (districtSeries == name).astype(int)

    return addedColumns

In [86]:
def recodeAddresses(df):
    '''Derive street, block-number and street-corner features from 'Address'.

    Adds the following columns to df in place:
      * 'Address1': the first street, with any leading '<n> Block of '
        prefix removed.
      * 'Address2': the second street when the address is written as
        'A / B', otherwise the empty string.
      * 'OnStreet<street>': one boolean column per distinct street name,
        True when the row's Address1 or Address2 equals that street.
      * 'BlockNumber': the leading number of the address, or -1 when the
        address does not start with a number.
      * 'StreetCornerFlag': True when the address names two streets.

    Parameters:
        df: dataframe with a string 'Address' column.

    Returns:
        ['StreetCornerFlag', 'BlockNumber'] -- the 'OnStreet*' columns are
        created but deliberately not reported, as before.
    '''
    def firstStreet(address):
        return re.sub(r'^\d+ Block of ', '', address.split(" / ")[0])

    def secondStreet(address):
        parts = address.split(" / ")
        return parts[1] if len(parts) > 1 else ''

    def blockNumber(address):
        match = re.match(r'\d+', address)
        return int(match.group()) if match else None

    df['Address1'] = df['Address'].apply(firstStreet)
    df['Address2'] = df['Address'].apply(secondStreet)

    streets = set(df['Address1'].unique().tolist() + df['Address2'].unique().tolist())
    # '' is only the "no second street" placeholder, not a street name.
    streets.discard('')

    # Sorted so the column order is the same on every run. Each flag is one
    # vectorised comparison rather than a row-wise df.apply per street.
    # NOTE: this still adds one column per distinct street, which can be a
    # lot of memory on the full dataset.
    for street in sorted(streets):
        df['OnStreet' + street] = (df['Address1'] == street) | (df['Address2'] == street)

    df['BlockNumber'] = df['Address'].apply(blockNumber)
    df['BlockNumber'] = df['BlockNumber'].fillna(-1)

    # Also add the "did the crime occur on a street corner?" flag.
    df['StreetCornerFlag'] = df['Address'].apply(lambda x: len(x.split(" / ")) > 1)

    return ['StreetCornerFlag', 'BlockNumber']

In [87]:
def removeOutlierLatLon(df):
    '''Replace out-of-range coordinates with a police-district median.

    A row is treated as an outlier when X > -121 or, separately, when
    Y > 38. Each outlier value is overwritten in place with the median of
    that coordinate over the non-outlier rows of the same 'PdDistrict'.
    If a district has no non-outlier rows, the median over all non-outlier
    rows is used instead.

    Parameters:
        df: dataframe with 'X', 'Y' and 'PdDistrict' columns.

    Returns:
        ['X', 'Y'] -- the names of the cleaned columns.
    '''
    for column, upperBound in (('X', -121), ('Y', 38)):
        isOutlier = df[column] > upperBound
        if not isOutlier.any():
            # Nothing to replace; skip the assignment entirely.
            continue

        # Take the medians from the valid rows only, so the bad values do
        # not contaminate their own replacement. One groupby replaces a
        # full-column scan per outlier row.
        inliers = df.loc[~isOutlier]
        districtMedians = inliers.groupby('PdDistrict')[column].median()
        replacements = df.loc[isOutlier, 'PdDistrict'].map(districtMedians)
        replacements = replacements.fillna(inliers[column].median())

        # .values avoids index alignment when writing the values back.
        df.loc[isOutlier, column] = replacements.values

    return ['X', 'Y']

In [88]:
def recodeCategories(df):
    '''Add an integer-coded copy of the crime category to the dataframe.

    Each value of df['Category'] is looked up in the module-level
    Categories list and its position is stored in a new 'CategoryRecode'
    column. A value that is not in the list raises ValueError.

    Returns the list of added column names.
    '''
    categoryCodes = df['Category'].apply(Categories.index)
    df['CategoryRecode'] = categoryCodes

    return ['CategoryRecode']

## Recoding Columns
Here, we want to do some recoding of the columns. To do this, we're going to use our handy-dandy helper functions.  

In [None]:
# recodeData modifies readData in place and returns that same object, so
# crimeData and readData refer to the same (now recoded) dataframe.
crimeData, addedColumns = recodeData(
    readData, isTrain = True)
# Summary statistics as a quick sanity check of the recoded columns.
crimeData.describe()
# crimeData

## Model Iteration 1
Now that the recoding is done, I'm going to build my first model: a random forest classifier, scored by log loss on three stratified shuffle splits.

In [27]:
# Predictors for this iteration, spelled out explicitly so the model does not
# depend on whichever columns the recode helpers happen to report.
columnsToUse = ['X','Y', 'Year', 'Month', 'Hour', 'Minute',
       'DayOfWeekRecode', 'DistrictNORTHERN', 'DistrictPARK',
       'DistrictINGLESIDE', 'DistrictBAYVIEW', 'DistrictRICHMOND',
       'DistrictCENTRAL', 'DistrictTARAVAL', 'DistrictTENDERLOIN',
       'DistrictMISSION', 'DistrictSOUTHERN', 'StreetCornerFlag', 'BlockNumber']

X = crimeData[columnsToUse]
y = crimeData['CategoryRecode']

# Fixed random_state values keep the forest and the splits reproducible.
clf = RandomForestClassifier(n_estimators=30, max_depth = 7, max_leaf_nodes = 100, random_state=1)

# Three stratified 50/50 shuffle splits of the training data.
k_folds = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0)

scores = []

for train, test in k_folds:
    clf.fit(X.iloc[train], y.iloc[train])
    probs = clf.predict_proba(X.iloc[test])
    # Multi-class log loss on the held-out half.
    score = log_loss(y.iloc[test], probs)
    print(score)
    scores.append(score)

print(scores)
print("Average: " + str(np.average(scores)))

2.44435886181
2.44526498167
2.44227763915
[2.4443588618085208, 2.445264981674431, 2.4422776391516283]
Average: 2.44396716088


In [28]:
def make_submission(clf, predictors, path='my_submission.csv'):
    '''Write a submission CSV of class probabilities for the test set.

    Reads 'test.csv', recodes it with recodeData, predicts class
    probabilities with the already fitted model and writes one row per test
    record: the 'Id' column followed by one probability column per entry of
    Categories, in that order.

    Parameters:
        clf: a fitted classifier exposing predict_proba (and normally
            classes_, holding the integer category codes it was trained on).
        predictors: list of column names to feed to the model.
        path: output file path for the CSV.
    '''
    test_data = pd.read_csv('test.csv')

    test_data, newColumns = recodeData(test_data)

    predictions = clf.predict_proba(test_data[predictors])

    # predict_proba columns follow clf.classes_, not 0..N-1. Map each column
    # through the class code so a category missing from training cannot
    # shift the labels of the others.
    classCodes = getattr(clf, 'classes_', range(predictions.shape[1]))
    probabilityByCategory = {}
    for columnIndex, classCode in enumerate(classCodes):
        probabilityByCategory[Categories[int(classCode)]] = predictions[:, columnIndex]

    submission = pd.DataFrame({
        'Id': test_data.Id
    })

    # Always emit every category; one the model never saw gets probability 0.
    for category in Categories:
        submission[category] = probabilityByCategory.get(category, 0.0)
    submission.to_csv(path, index=False)

    print(" -- Wrote submission to file {}.".format(path))


In [29]:
# Refit on the complete training set before predicting on the test data;
# the cross-validation loop left clf fitted on only half of it.
clf.fit(X, y)
print("model fitted with all data")

model fitted with all data


In [30]:
# Predict on test.csv with the refitted model and write my_submission.csv.
make_submission(clf, columnsToUse)

 -- Wrote submission to file my_submission.csv.


Best Score: 	
2.44156

In [49]:
# Count rows per (crime category, first street) pair. count() reports the
# number of non-null values in every remaining column.
countsByStreet = crimeData.groupby(['Category','Address1']).count()
# Keep only the pairs with more than 100 records ('Dates' is used as the
# row count for each group).
countsByStreet[countsByStreet.Dates > 100]

Unnamed: 0_level_0,Unnamed: 1_level_0,Dates,DayOfWeek,PdDistrict,Address,X,Y,DateTime,Year,Month,Day,...,DistrictRICHMOND,DistrictCENTRAL,DistrictTARAVAL,DistrictTENDERLOIN,DistrictMISSION,DistrictSOUTHERN,Address2,BlockNumber,StreetCornerFlag,CategoryRecode
Category,Address1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ASSAULT,10TH ST,106,106,106,106,106,106,106,106,106,106,...,106,106,106,106,106,106,27,106,106,106
ASSAULT,11TH ST,151,151,151,151,151,151,151,151,151,151,...,151,151,151,151,151,151,70,151,151,151
ASSAULT,14TH ST,134,134,134,134,134,134,134,134,134,134,...,134,134,134,134,134,134,55,134,134,134
ASSAULT,15TH ST,162,162,162,162,162,162,162,162,162,162,...,162,162,162,162,162,162,51,162,162,162
ASSAULT,16TH ST,991,991,991,991,991,991,991,991,991,991,...,991,991,991,991,991,991,423,991,991,991
ASSAULT,17TH ST,293,293,293,293,293,293,293,293,293,293,...,293,293,293,293,293,293,130,293,293,293
ASSAULT,18TH ST,571,571,571,571,571,571,571,571,571,571,...,571,571,571,571,571,571,188,571,571,571
ASSAULT,19TH AV,189,189,189,189,189,189,189,189,189,189,...,189,189,189,189,189,189,90,189,189,189
ASSAULT,19TH ST,178,178,178,178,178,178,178,178,178,178,...,178,178,178,178,178,178,83,178,178,178
ASSAULT,20TH AV,157,157,157,157,157,157,157,157,157,157,...,157,157,157,157,157,157,21,157,157,157
