In [1]:
from datetime import datetime
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import gzip
import xgboost as xgb

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

In [2]:
# Load only a small sample of the training set to keep iteration fast.
# parse_dates=True only asks pandas to try parsing the index, so the
# 'Dates' column is still text here and is parsed later by get_dates().
train = pd.read_csv(
    '../data/train.csv.gz',
    nrows=2000,
    parse_dates=True)

In [3]:
# Module-level encoder shared across cells: get_features() refits it on the
# 'Category' column, and its classes_ are later used as prediction column names.
label_encoder = LabelEncoder()

def get_dates(d):
    """Split a 'YYYY-MM-DD HH:MM:SS' timestamp string into date parts.

    Args:
        d: Timestamp text in exactly the format '%Y-%m-%d %H:%M:%S'.

    Returns:
        A ``(hour, month, year)`` tuple of ints.

    Raises:
        ValueError: if ``d`` does not match the expected format.
    """
    parsed = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')
    return tuple(getattr(parsed, part) for part in ('hour', 'month', 'year'))

def get_features(data):
    """Build the model's feature frame and, when available, encoded labels.

    Features are the hour, month and year extracted from ``data['Dates']``
    plus one-hot columns for ``data['PdDistrict']`` (prefix ``District``) and
    ``data['DayOfWeek']`` (prefix ``Day``).

    The result is built on ``data.index`` so every feature row stays aligned
    with its source row even when ``data`` does not carry a default 0..n-1
    index (e.g. a filtered or sliced frame), and empty input yields an empty
    frame instead of failing while unpacking.

    Args:
        data: DataFrame with 'Dates' (text in '%Y-%m-%d %H:%M:%S' format),
            'PdDistrict' and 'DayOfWeek' columns, and optionally 'Category'.

    Returns:
        ``(df, labels)`` where ``df`` is the feature DataFrame and ``labels``
        is the integer-encoded 'Category' column, or ``None`` when ``data``
        has no 'Category' column.

    Note:
        The one-hot columns depend on the values present in ``data``; two
        inputs with different sets of districts/days produce different
        columns.
    """
    # Same strict format the row-wise strptime parsing used, but vectorised.
    timestamps = pd.to_datetime(data['Dates'], format='%Y-%m-%d %H:%M:%S')

    df = pd.DataFrame(index=data.index)
    df['Hour'] = timestamps.dt.hour
    df['Month'] = timestamps.dt.month
    df['Year'] = timestamps.dt.year

    # Every piece shares data.index, so this is a plain column-wise stack.
    df = pd.concat(
        [
            df,
            pd.get_dummies(data['PdDistrict'], prefix='District'),
            pd.get_dummies(data['DayOfWeek'], prefix='Day'),
        ],
        axis=1)

    labels = None
    if 'Category' in data:
        # Refits the shared module-level encoder on every labelled input.
        labels = label_encoder.fit_transform(data['Category'])
    return df, labels

In [4]:
# Build the training features/labels, then standardise the features.
# `scaler` stays in scope so later cells can apply the same scaling.
features, labels = get_features(train)
scaler = StandardScaler()
features = scaler.fit_transform(features)

In [5]:
# Hold out 40% of the sampled rows for evaluation; the fixed random_state
# keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.4,
    random_state=0)

In [6]:
# Small boosted-tree classifier: 50 depth-3 trees, learning rate 0.05,
# trained on a single thread.
gbm = xgb.XGBClassifier(
    n_estimators=50,
    max_depth=3,
    learning_rate=0.05,
    nthread=1)
gbm.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bytree=1, gamma=0, learning_rate=0.05,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=50, nthread=1, objective='multi:softprob', seed=0,
       silent=True, subsample=1)

In [7]:
# Score of the fitted model on the rows it was trained on.
gbm.score(X=X_train, y=y_train)

0.32250000000000001

In [21]:
import pickle

# Persist the fitted classifier to disk. The `with` block guarantees the file
# handle is flushed and closed even if pickling raises part-way through.
# NOTE: only ever load this file back from a trusted location; unpickling
# untrusted data can execute arbitrary code.
with open('foo.pkl', 'wb') as model_file:
    pickle.dump(gbm, model_file)

In [22]:
# Score of the fitted model on the held-out split.
gbm.score(X=X_test, y=y_test)

0.22607197767780879

In [23]:
# Multi-class log loss of the predicted probabilities on the training split.
log_loss(
    y_true=y_train,
    y_pred=gbm.predict_proba(X_train))

2.6941464570177911

In [24]:
# Multi-class log loss of the predicted probabilities on the held-out split.
log_loss(
    y_true=y_test,
    y_pred=gbm.predict_proba(X_test))

2.6965389413687944

In [10]:
# Load a sample of the unlabeled test set.
# parse_dates=True only asks pandas to try parsing the index, so the
# 'Dates' column is still text when it reaches get_features().
test = pd.read_csv(
    '../data/test.csv.gz',
    nrows=10000,
    parse_dates=True)

In [11]:
test_features, _ = get_features(test)

# Apply the scaler exactly as it was fitted on the training features.
# Refitting it here would standardise the test rows with different
# means/variances than the ones the model was trained against.
# NOTE(review): get_features() one-hot encodes whatever districts/days occur
# in its input, so the test columns may not match the training columns if a
# value is missing from either sample -- confirm the column sets agree.
test_features = scaler.transform(test_features)

In [13]:
proba = gbm.predict_proba(test_features)

# Name the probability columns after the classes the model was actually
# trained on. label_encoder.classes_ lists every category in the loaded data,
# but a rare category can land entirely in the held-out split, leaving the
# model (and therefore `proba`) with fewer columns than the encoder has classes.
prediction = pd.DataFrame(
    proba,
    columns=label_encoder.inverse_transform(gbm.classes_))