In [10]:
%matplotlib inline
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
import scipy.stats as st
import random
from datetime import datetime
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from patsy import dmatrix
from sklearn import cross_validation
from sklearn.cross_validation import KFold

In [21]:
cur_dir = os.path.dirname('__file__')

# train=pd.read_csv('train.csv', parse_dates = ['Dates'])
# test=pd.read_csv('test.csv', parse_dates = ['Dates'])

train = pd.read_csv(os.path.join(cur_dir, "data", "train.csv"))
test = pd.read_csv(os.path.join(cur_dir, "data", "test.csv"))
mapdata = np.loadtxt(os.path.join(cur_dir, "data", "sf_map.txt"))

In [5]:
def get_random_subset(df, n=5000):
    sub = random.sample(xrange(len(df)), min(n, len(df)))
    return df.iloc[sub]

def preprocess(df):
    res = df.copy()
    res = res[res.X != res.X.max()]
    datetimes = res.Dates.apply(get_datetime)
    res['Hour'] = datetimes.apply(lambda dt: dt.hour)
    res['Month'] = datetimes.apply(lambda dt: dt.month)
    res['Year'] = datetimes.apply(lambda dt: dt.year)
    res['Hour_Minutes'] = datetimes.apply(lambda dt: dt.hour + dt.minute / 60.0)
    res['Minutes_Since_03'] = datetimes.apply(lambda dt: (dt-datetime(2003, 1, 1)).total_seconds() / 60)
    res['Minutes_Since_New_Year'] = datetimes.apply(lambda dt: (dt-datetime(dt.year, 1, 1)).total_seconds() / 60)
    res['DOW'] = train.DayOfWeek.apply(lambda x: dow.index(x))
    res['Street_Corner'] = res['Address'].apply(lambda x: 1 if '/' in x else 0)
    le_crime = preprocessing.LabelEncoder()
    crime = le_crime.fit_transform(res.Category)
    
    res['crime'] = crime
    return res

def get_datetime(s):
    dt = datetime.strptime(s, "%Y-%m-%d %H:%M:%S")
    return dt

dow = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def isNight(hour):
    if hour in [0, 1, 2, 3, 4, 5, 6, 19, 20, 21, 22, 23]:
        return "Night"
    else:
        return "Day"

In [22]:
# train = preprocess(get_random_subset(train, 100000))
train = preprocess(train)

In [7]:
print train_df.head()

                      Dates        Category  \
747525  2004-09-30 15:30:00   VEHICLE THEFT   
177062  2013-01-09 12:00:00  OTHER OFFENSES   
286453  2011-06-15 10:00:00   LARCENY/THEFT   
365332  2010-04-11 12:00:00    NON-CRIMINAL   
356851  2010-06-01 09:45:00  OTHER OFFENSES   

                                            Descript  DayOfWeek  PdDistrict  \
747525                             STOLEN AUTOMOBILE   Thursday     CENTRAL   
177062  FALSE PERSONATION AND CHEAT CRIMES (GENERAL)  Wednesday     BAYVIEW   
286453                   PETTY THEFT FROM A BUILDING  Wednesday   INGLESIDE   
365332                                  CASE CLOSURE     Sunday  TENDERLOIN   
356851                         HARASSING PHONE CALLS    Tuesday    SOUTHERN   

       Resolution                    Address           X          Y  Hour  \
747525       NONE        TAYLOR ST / BUSH ST -122.412079  37.789739    15   
177062       NONE  900 Block of WISCONSIN ST -122.398676  37.755284    12   
286453     

In [27]:
training, validation = train_test_split(train, train_size=.60)
# features = ['DayOfWeek', 'PdDistrict'+X+Y+C(Hour)+C(Month)+C(Year)+Street_Corner']
formula_ml = 'C(DayOfWeek)+C(PdDistrict)+X+Y+C(Hour)+C(Month)+C(Year)+Street_Corner'
formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner'
x_train = dmatrix(formula_ml, data=training, return_type='dataframe')
x_validation = dmatrix(formula_ml, data=validation, return_type='dataframe')
y_train = training['crime']

alg = RandomForestClassifier()
alg = BernoulliNB()

alg.fit(x_train, y_train)
predictions = np.array(alg.predict_proba(x_validation))
print predictions.shape

print log_loss(validation['crime'], predictions)

(351193, 39)
2.62269121691


In [None]:
train = get_random_subset(train)

#Convert crime labels to numbers
le_crime = preprocessing.LabelEncoder()
crime = le_crime.fit_transform(train.Category)
 
#Get binarized weekdays, districts, and hours.
days = pd.get_dummies(train.DayOfWeek)
district = pd.get_dummies(train.PdDistrict)
hour = train.Dates.dt.hour
hour = pd.get_dummies(hour) 
 
#Build new array
train_data = pd.concat([hour, days, district], axis=1)
train_data['crime']=crime
 
#Repeat for test data
days = pd.get_dummies(test.DayOfWeek)
district = pd.get_dummies(test.PdDistrict)
 
hour = test.Dates.dt.hour
hour = pd.get_dummies(hour) 
 
test_data = pd.concat([hour, days, district], axis=1)
 
training, validation = train_test_split(train_data, train_size=.60)