In [1]:
from datetime import datetime
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

In [3]:
import random

n = 878049 # number of rows in the file
s = 10000 # desired sample size
skip = sorted(random.sample(xrange(1, n), n-s))

In [4]:
train = pd.read_csv('../data/train.csv.gz', parse_dates=True,
                    skiprows=skip)

In [5]:
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 19:28:00,VEHICLE THEFT,STOLEN AND RECOVERED VEHICLE,Wednesday,CENTRAL,NONE,0 Block of SANSOME ST,-122.40072,37.790712
1,2015-05-12 20:00:00,LARCENY/THEFT,GRAND THEFT FROM UNLOCKED AUTO,Tuesday,CENTRAL,NONE,200 Block of POST ST,-122.406,37.788541
2,2015-05-12 17:00:00,OTHER OFFENSES,FALSE PERSONATION,Tuesday,MISSION,NONE,900 Block of ELIZABETH ST,-122.441742,37.751689
3,2015-05-12 15:23:00,ASSAULT,BATTERY,Tuesday,MISSION,NONE,2300 Block of 16TH ST,-122.40953,37.765718
4,2015-05-12 14:15:00,NON-CRIMINAL,AIDED CASE -PROPERTY FOR DESTRUCTION,Tuesday,RICHMOND,NONE,400 Block of 6TH AV,-122.46428,37.780033


In [6]:
def get_dates(d):
    dt = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')
    return dt.hour, dt.month, dt.year

In [7]:
label_encoder = LabelEncoder()
def get_features(data):
    df = pd.DataFrame()
    df['Hour'], df['Month'], df['Year'] = zip(*data['Dates'].apply(get_dates))
    df = df.join(pd.get_dummies(data['PdDistrict'], prefix='District'))
    df = df.join(pd.get_dummies(data['DayOfWeek'], prefix='Day'))
    labels = None
    if 'Category' in data:
        labels = label_encoder.fit_transform(data['Category'])
    return df, labels

In [8]:
features, labels = get_features(train)

In [9]:
scaler = StandardScaler()
scaler.fit(features)
features = scaler.transform(features)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=0)

In [11]:
label_encoder.inverse_transform(labels)

array(['VEHICLE THEFT', 'LARCENY/THEFT', 'OTHER OFFENSES', ...,
       'LARCENY/THEFT', 'BURGLARY', 'FORGERY/COUNTERFEITING'], dtype=object)

In [12]:
logistic_model_test = LogisticRegression(verbose=1)
logistic_model_test.fit(X_train, y_train)

[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=1)

In [14]:
print 'score', logistic_model_test.score(X_test, y_test)
print 'test', log_loss(y_test, logistic_model_test.predict_proba(X_test))
print 'train', log_loss(y_train, logistic_model_test.predict_proba(X_train))

 score 0.21925
test

ValueError: y_true and y_pred have different number of classes 37, 34

In [248]:
rf_model_test = RandomForestClassifier()
rf_model_test.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [249]:
print 'score', rf_model_test.score(X_test, y_test)
print 'test', log_loss(y_test, rf_model_test.predict_proba(X_test))
print 'train', log_loss(y_train, rf_model_test.predict_proba(X_train))

score 0.1396875
test 18.8562805966
train 0.474371772417


In [250]:
ada_model_test = AdaBoostClassifier()
ada_model_test.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [251]:
print 'score', ada_model_test.score(X_test, y_test)
print 'test', log_loss(y_test, ada_model_test.predict_proba(X_test))
print 'train', log_loss(y_train, ada_model_test.predict_proba(X_train))

score 0.204875
test 3.50932290028
train 3.50794727081


In [257]:
bagging_model_test = BaggingClassifier()
bagging_model_test.fit(X_train, y_train)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
         verbose=0)

In [259]:
print 'score', bagging_model_test.score(X_test, y_test)
print 'test', log_loss(y_test, bagging_model_test.predict_proba(X_test))
print 'train', log_loss(y_train, bagging_model_test.predict_proba(X_train))

score 0.137625
test 18.362185098
train 0.475359791945
