In [1]:
from datetime import datetime
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None
import numpy as np
import gzip

In [2]:
from holidays import US
# US holiday calendar; get_dates() tests membership in it to derive the holiday flag.
us_holidays = US()

In [3]:
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

In [4]:
# Load the labelled training set (gzip-compressed CSV).
# get_dates() later parses the 'Dates' column with datetime.strptime, so that
# column must arrive here as plain strings.
# NOTE(review): parse_dates=True without an index_col presumably leaves 'Dates'
# unparsed -- confirm against the pandas version in use before changing it.
train = pd.read_csv('../data/train.csv.gz', parse_dates=True)

In [5]:
# Load the test set to be scored (gzip-compressed CSV), read the same way as train.
# get_features() returns labels=None when the frame has no 'Category' column.
test = pd.read_csv('../data/test.csv.gz', parse_dates=True)

In [6]:
# Maps street names to integer ids for the 'Street' feature. It is fitted once on
# the street names of train and test together and only transform()-ed inside
# get_features().
street_encoder = LabelEncoder()

def get_street_name(addr):
    """Drop a leading all-digit token from an address string.

    The address is cut at its first space. If the part before that space
    consists only of digits, everything after the space is returned;
    otherwise the address is returned unchanged.
    """
    leading_token, _sep, remainder = addr.partition(' ')
    return remainder if leading_token.isdigit() else addr

In [7]:
# Street names of every address in train and test, so the encoder is fitted on
# the full vocabulary that get_features() will later transform.
streets = pd.concat([train['Address'], test['Address']]).apply(get_street_name)
street_encoder.fit(streets.unique())

LabelEncoder()

In [8]:
# Month number (1-12) -> season name.
seasons = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Autumn', 10: 'Autumn', 11: 'Autumn',
}


def get_season(d):
    """Return the season name for month number ``d`` (1-12).

    Raises KeyError for any value that is not a key of ``seasons``.
    """
    return seasons[d]

In [9]:
# Encodes 'Category' values as integer class labels. get_features() (re)fits it
# whenever its input has a 'Category' column, and its classes_ are later used as
# the column names of the prediction frame.
label_encoder = LabelEncoder()

def get_dates(d):
    """Derive calendar features from a timestamp string.

    ``d`` must be formatted as '%Y-%m-%d %H:%M:%S'.

    Returns a 6-tuple ``(hour, month, year, season, isDaytime, is_holiday)``:
    ``season`` comes from get_season(month), ``isDaytime`` is 1/0 depending on
    whether the hour lies strictly inside the season's daylight window, and
    ``is_holiday`` is the result of the membership test against ``us_holidays``.
    """
    stamp = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')
    season = get_season(stamp.month)
    # http://www.timeanddate.com/sun/usa/san-francisco
    # (exclusive lower hour, exclusive upper hour) per season; Spring and Autumn
    # share the fallback window.
    daylight_windows = {'Winter': (6, 18), 'Summer': (5, 21)}
    after, before = daylight_windows.get(season, (6, 19))
    isDaytime = int(after < stamp.hour < before)
    is_holiday = stamp in us_holidays
    return stamp.hour, stamp.month, stamp.year, season, isDaytime, is_holiday

def get_features(data):
    """Build the model's feature frame from a raw data frame.

    Reads the 'Dates', 'DayOfWeek', 'PdDistrict', 'X', 'Y' and 'Address'
    columns of ``data`` and, when present, 'Category'.

    Returns ``(df, labels)``:
      * ``df`` has the columns, in this order: Hour, Month, Year, IsDaytime,
        IsHoliday, isWeekend, District_* dummies, Day_* dummies, Season_*
        dummies, X, Y, IsIntersection, Street.
      * ``labels`` is the integer-encoded 'Category' column, or None when
        ``data`` has no 'Category' column.

    Side effect: ``label_encoder`` is re-fitted on every call that receives a
    'Category' column. X and Y are standardised with a scaler fitted on the
    ``data`` passed in, so each call uses that frame's own statistics.
    """
    # get_dates yields one 6-tuple per row; zip(*...) regroups them per field.
    hours, months, years, season_names, daytime_flags, holiday_flags = zip(
        *data['Dates'].apply(get_dates))

    df = pd.DataFrame()
    df['Hour'] = hours
    df['Month'] = months
    df['Year'] = years
    df['IsDaytime'] = daytime_flags
    df['IsHoliday'] = holiday_flags
    df['isWeekend'] = data['DayOfWeek'].apply(
        lambda day: int(day in ('Saturday', 'Sunday')))

    # One-hot blocks are appended in a fixed order: district, weekday, season.
    one_hot_blocks = (
        pd.get_dummies(data['PdDistrict'], prefix='District'),
        pd.get_dummies(data['DayOfWeek'], prefix='Day'),
        pd.get_dummies(season_names, prefix='Season'),
    )
    for block in one_hot_blocks:
        df = df.join(block)

    scaled_xy = StandardScaler().fit_transform(data[['X', 'Y']])
    df['X'], df['Y'] = scaled_xy[:, 0], scaled_xy[:, 1]

    # An address containing '/' is flagged as an intersection.
    df['IsIntersection'] = data['Address'].map(lambda address: int('/' in address))
    df['Street'] = street_encoder.transform(data['Address'].apply(get_street_name))

    if 'Category' in data:
        labels = label_encoder.fit_transform(data['Category'])
    else:
        labels = None
    return df, labels

In [10]:
# Build the training feature frame and the integer class labels; this call also
# fits label_encoder on the training 'Category' column.
features, labels = get_features(train)

In [11]:
# Standardise every training feature column. The scaler object is kept because it
# is used again later for the test features.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [12]:
# Fit a logistic regression with default hyper-parameters on the scaled training
# features; the two prints bracket the fit with wall-clock timestamps.
print 'Started', datetime.now()
model = LogisticRegression()
model.fit(features_scaled, labels)
print 'Finished', datetime.now()
# Last expression of the cell: echoes the fitted estimator's repr.
model

Started 2015-10-01 00:23:40.240376
Finished 2015-10-01 00:50:52.617660


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [13]:
# Drop the unscaled training feature frame; the cells below use features_scaled.
del features

In [14]:
# Scored on the same features_scaled/labels the model was fitted on, so this is a
# training-set figure, not a held-out estimate.
print 'score', model.score(features_scaled, labels)

score 0.232267219711


In [15]:
# Log loss of the predicted class probabilities, again computed on the training
# data the model was fitted on (not a held-out estimate).
print 'logloss', log_loss(labels, model.predict_proba(features_scaled))

logloss 2.53301119268


In [16]:
# Build the test feature frame; the labels slot is None for unlabelled data.
test_features, _ = get_features(test)

# Standardise the test features with the scaler that was fitted on the training
# features. Re-fitting it on the test set (as was done before) would rescale the
# test data with its own mean/variance, so the model would be fed inputs on a
# different scale than the one it was trained on.
test_features = scaler.transform(test_features)

In [18]:
# Per-class probabilities for every test row, wrapped in a frame whose columns are
# the original 'Category' names recovered from label_encoder.classes_.
proba = model.predict_proba(test_features)
prediction = pd.DataFrame(proba, columns=label_encoder.classes_)

In [None]:
# Write the predictions as a gzip-compressed CSV in the working directory.
# The frame's index is written under the header "Id", missing values as "0", and
# each probability with 5 decimals padded to a width of 11 characters.
with gzip.GzipFile('lrv2.csv.gz', mode='w', compresslevel=9) as f:
    prediction.to_csv(f, index_label="Id", na_rep="0", float_format='%11.5f')