In [1]:
%matplotlib inline
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
import scipy.stats as st
import random
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from patsy import dmatrix, dmatrices
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB



In [2]:
cur_dir = os.path.dirname('__file__')

train = pd.read_csv(os.path.join(cur_dir, "data", "train.csv"))
test = pd.read_csv(os.path.join(cur_dir, "data", "test.csv"))

In [3]:
def get_random_subset(df, n=5000):
    sub = random.sample(xrange(len(df)), min(n, len(df)))
    return df.iloc[sub]

def preprocess(df):
    res = df.copy()
    res = res[res.X != res.X.max()]
    datetimes = res.Dates.apply(get_datetime)
    res['Hour'] = datetimes.apply(lambda dt: dt.hour)
    res['Month'] = datetimes.apply(lambda dt: dt.month)
    res['Hour_Minutes'] = datetimes.apply(lambda dt: dt.hour + dt.minute / 60.0)
    res['Minutes_Since_03'] = datetimes.apply(lambda dt: (dt-datetime(2003, 1, 1)).total_seconds() / 60)
    res['Minutes_Since_New_Year'] = datetimes.apply(lambda dt: (dt-datetime(dt.year, 1, 1)).total_seconds() / 60)
    res['DOW'] = train.DayOfWeek.apply(lambda x: dow.index(x))
    res['Street_Corner'] = res['Address'].apply(lambda x: 1 if '/' in x else 0)
    return res

def get_datetime(s):
    dt = datetime.strptime(s, "%Y-%m-%d %H:%M:%S")
    return dt

dow = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def isNight(hour):
    if hour in [0, 1, 2, 3, 4, 5, 6, 19, 20, 21, 22, 23]:
        return "Night"
    else:
        return "Day"

In [18]:
train_df = preprocess(get_random_subset(train, 5000))

In [5]:
print train_df.describe()

                   X              Y           Hour          Month  \
count  877982.000000  877982.000000  877982.000000  877982.000000   
mean     -122.422763      37.767035      13.412737       6.436416   
std         0.025285       0.024165       6.549521       3.428998   
min      -122.513642      37.707879       0.000000       1.000000   
25%      -122.432952      37.752427       9.000000       3.000000   
50%      -122.416420      37.775421      14.000000       6.000000   
75%      -122.406959      37.784368      19.000000       9.000000   
max      -122.364937      37.819975      23.000000      12.000000   

        Hour_Minutes  Minutes_Since_03  Minutes_Since_New_Year            DOW  \
count  877982.000000     877982.000000           877982.000000  877982.000000   
mean       13.748657    3263716.192895           259058.977781       2.992719   
std         6.560013    1908460.344165           151054.164970       1.972028   
min         0.016667       7201.000000                

In [6]:
training, validation = train_test_split(train_df, train_size=.60)

formula_ml = 'X+Y+Hour'
formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'
x_train = dmatrix(formula_ml, data=training, return_type='dataframe')
# print x_train
# y_train = training.Category

x_validation = dmatrix(formula_ml, data=validation, return_type='dataframe')
y_validation = validation.Category

x_validation = x_validation[y_validation.isin(y_train.values)]
# y_validation = y_validation[y_validation.isin(y_train.values)]
# mlb = MultiLabelBinarizer(classes=alg.classes_)
# print y_validation
# y_validation = mlb.fit_transform(np.array([y_validation]).T)

num_trees = [5, 10, 50, 250]
min_leaves = [10, 50, 500, 2500, 10000, 50000]

for trees in num_trees:
    scores = []
    for l in min_leaves:
        alg = RandomForestClassifier(min_samples_leaf=l)
        alg.fit(x_train, y_train)
        # alg = BernoulliNB()
        y_validation = validation.Category
        y_validation = y_validation[y_validation.isin(y_train.values)]
        mlb = MultiLabelBinarizer(classes=alg.classes_)
        y_validation = mlb.fit_transform(np.array([y_validation]).T)

        predictions = np.array(alg.predict_proba(x_validation))
        scores.append(log_loss(y_validation, predictions))
    #     print "Min leaf " + str(l) + ": " + str(log_loss(y_validation, predictions))
    plt.plot(min_leaves, scores, label=(str(trees) + " trees"))
plt.legend()
plt.gca().set_xscale('log')

NameError: name 'y_train' is not defined

In [None]:
training, validation = train_test_split(train_df, train_size=.60)

formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'
x_train = dmatrix(formula_ml, data=training, return_type='dataframe')
# print x_train
# y_train = training.Category

x_validation = dmatrix(formula_ml, data=validation, return_type='dataframe')
y_validation = validation.Category

x_validation = x_validation[y_validation.isin(y_train.values)]
# y_validation = y_validation[y_validation.isin(y_train.values)]
# mlb = MultiLabelBinarizer(classes=alg.classes_)
# print y_validation
# y_validation = mlb.fit_transform(np.array([y_validation]).T)

weights = np.linspace(0.7, 1, 10)
scores = []

for w in weights:
    alg1 = RandomForestClassifier(min_samples_leaf=1000)
    alg2 = BernoulliNB()
    alg1.fit(x_train, y_train)
    alg2.fit(x_train, y_train)
    # alg = BernoulliNB()
    y_validation = validation.Category
    y_validation = y_validation[y_validation.isin(y_train.values)]
    mlb = MultiLabelBinarizer(classes=alg1.classes_)
    y_validation = mlb.fit_transform(np.array([y_validation]).T)

    predictions1 = np.array(alg1.predict_proba(x_validation))
    predictions2 = np.array(alg2.predict_proba(x_validation))
    predictions = (w * predictions1 + (1-w) * predictions2)
    score = log_loss(y_validation, predictions)
    scores.append(score)
    #     print "Min leaf " + str(l) + ": " + str(log_loss(y_validation, predictions))
plt.plot(weights, scores)
plt.xlabel("Percentage forest")
plt.ylabel("score")
# plt.gca().set_xscale('log')

In [None]:

formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'

x_vals = dmatrix(formula_ml, data=train_df, return_type='dataframe')
y_vals = train_df.Category

min_leaves = [10, 50, 500, 2500, 10000, 50000]

parameters = {'min_samples_leaf':min_leaves}

clf = GridSearchCV(RandomForestClassifier(), parameters, scoring='log_loss')

clf.fit(x_vals, y_vals)

print clf.grid_scores_

In [None]:
# k-Nearest Neighbor
from sklearn import datasets
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
# load iris the datasets
dataset = train.load_iris()
# fit a k-nearest neighbor model to the data
alg = KNeighborsClassifier()
alg.fit(dataset.data, dataset.target)
print(alg)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

# k-Nearest Neighbor
from sklearn import datasets
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
# load iris the datasets
dataset = datasets.load_iris()
# fit a k-nearest neighbor model to the data
model = KNeighborsClassifier()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

In [9]:
X = train.X
y = [train.Y]
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(X, y) 
print(neigh.predict([[1.5]]))


SyntaxError: invalid syntax (<ipython-input-9-d9bd0e7e1e40>, line 2)

In [None]:
training, validation = train_test_split(train_df, train_size=.60)

formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'
x_train = dmatrix(formula_ml, data=training, return_type='dataframe')
# print x_train
# y_train = training.Category

x_validation = dmatrix(formula_ml, data=validation, return_type='dataframe')
y_validation = validation.Category

x_validation = x_validation[y_validation.isin(y_train.values)]
# y_validation = y_validation[y_validation.isin(y_train.values)]
# mlb = MultiLabelBinarizer(classes=alg.classes_)
# print y_validation
# y_validation = mlb.fit_transform(np.array([y_validation]).T)

weights = np.linspace(0.7, 1, 10)
scores = []

for w in weights:
    alg1 = MultinomialNB()
    alg.fit(x_train, y_train)
    # alg = BernoulliNB()
    y_validation = validation.Category
    y_validation = y_validation[y_validation.isin(y_train.values)]
    mlb = MultiLabelBinarizer(classes=alg1.classes_)
    y_validation = mlb.fit_transform(np.array([y_validation]).T)

    predictions1 = np.array(alg1.predict_proba(x_validation))
    predictions2 = np.array(alg2.predict_proba(x_validation))
    predictions = (w * predictions1 + (1-w) * predictions2)
    score = log_loss(y_validation, predictions)
    scores.append(score)
    #     print "Min leaf " + str(l) + ": " + str(log_loss(y_validation, predictions))
plt.plot(weights, scores)
plt.xlabel("Percentage forest")
plt.ylabel("score")
# plt.gca().set_xscale('log')

In [10]:
training, validation = train_test_split(train_df, train_size=.60)
x_train = dmatrix(formula_ml, data=training, return_type='dataframe')
y_train = training.Category

x_validation = dmatrix(formula_ml, data=validation, return_type='dataframe')
y_validation = validation.Category

x_validation = x_validation[y_validation.isin(y_train.values)]

from sklearn.neural_network import MLPClassifier
X = x_train
y = y_train
clf = MLPClassifier(algorithm='l-bfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X, y) 
MLPClassifier(activation='relu', algorithm='l-bfgs', alpha=1e-05,
       batch_size=200, beta_1=0.9, beta_2=0.999, early_stopping=False,
       epsilon=1e-08, hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

y_validation = validation.Category
y_validation = y_validation[y_validation.isin(y_train.values)]
mlb = MultiLabelBinarizer(classes=alg1.classes_)
y_validation = mlb.fit_transform(np.array([y_validation]).T)

predictions1 = np.array(alg1.predict_proba(x_validation))
predictions2 = np.array(alg2.predict_proba(x_validation))
predictions = (w * predictions1 + (1-w) * predictions2)
score = log_loss(y_validation, predictions)
scores.append(score)

ImportError: cannot import name MLPClassifier

In [23]:
from numpy import newaxis
from sknn.mlp import Regressor, Layer

training, validation = train_test_split(train_df, train_size=.60)
x_train = dmatrix(formula_ml, data=training)
y_train = training.Category

#data_array = x_train.values
#for train_index, test_index in sss:
#    xtrain, xtest = data_array[train_index], data_array[test_index]
#    ytrain, ytest = target[train_index], target[test_index]

f = 'Category'
y_train_vals = dmatrix(f, data = train_df)[newaxis, :]
#x_vals = dmatrix(formula_ml, data=train_df, return_type='dataframe')
#y_vals = train_df.Category


nn = Regressor(
    layers=[
        Layer("Rectifier", units=100),
        Layer("Linear")],
    learning_rate=0.02,
    n_iter=10)
print x_train.shape
print y_train.shape
nn.fit(x_train[0:4000], y_train_vals[0:4000])

y_validation = validation.Category
y_validation = y_validation[y_validation.isin(y_train.values)]
mlb = MultiLabelBinarizer(classes=nn.classes_)
y_validation = mlb.fit_transform(np.array([y_validation]).T)
predictions = np.array(alg.predict_proba(x_validation))
score = log_loss(y_validation, predictions)
scores.append(score)

print scores

(2999, 21)
(2999,)


AssertionError: Expecting same number of input and output samples.