In [4]:
%pylab inline
pylab.rcParams['figure.figsize'] = (20, 10)

import os

import matplotlib.pyplot as plt
import numpy as np
import psycopg2
import requests
import scipy
import scipy.optimize
import sklearn

Populating the interactive namespace from numpy and matplotlib


In [5]:
# Open the PostgreSQL connection used by the rest of the notebook.
# The database name and user default to the original local setup but can be
# overridden through DELAY_DB_NAME / DELAY_DB_USER, so the notebook can run on
# another machine without editing this cell.
conn = psycopg2.connect(
    database=os.environ.get("DELAY_DB_NAME", "delay"),
    user=os.environ.get("DELAY_DB_USER", "nikita"),
)
# Autocommit: every statement is committed as soon as it is executed.
conn.autocommit = True

In [6]:
# Line number and direction ("way") whose trajectories are analysed.
line, way = 95, 2

# Fetch every row of `traject` for that line/direction, ordered by id.
# The values are bound by the driver instead of being formatted into the SQL
# text, and the cursor is closed as soon as the rows have been read.
# `line` is sent as a string because the original query compared it against a
# quoted literal (presumably the column is textual; verify against the schema).
with conn.cursor() as cur:
    cur.execute(
        "SELECT * FROM traject WHERE line=%s AND way=%s ORDER BY id;",
        (str(line), way),
    )
    data = cur.fetchall()

In [7]:
from collections import namedtuple

# Record type for one fetched row; the field order has to match the column
# order of the `SELECT *` result, since rows are unpacked positionally.
Traject = namedtuple("Traject", "id line way timestamps start")

# Wrap every fetched row so later cells can use attribute access
# (e.g. `model.timestamps`) instead of positional indexes.
models = list(map(Traject._make, data))

In [8]:
# Ask the STIB/MIVB API wrapper for the stops of this line and direction.
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() turns an HTTP error into an explicit exception instead of
# a confusing JSON/KeyError further down.
response = requests.get(
    "https://stib-mivb-api.herokuapp.com/network/line/%s/%s" % (line, way),
    timeout=30,
)
response.raise_for_status()
stib_stops = response.json()['stops']

# Largest number of timestamps recorded for any trajectory.
len_stops = max(len(model.timestamps) for model in models)

# The longest recorded trajectory must have one timestamp per stop of the line.
assert len(stib_stops) == len_stops, (
    "API reports %d stops but the longest trajectory has %d timestamps"
    % (len(stib_stops), len_stops)
)

In [9]:
def _segment_seconds(timestamps, segment):
    """Seconds elapsed between entries `segment` and `segment + 1` of `timestamps`.

    Returns NaN when either entry is unavailable: the sequence is too short
    (IndexError) or the subtraction is not possible, e.g. an entry is None
    (TypeError).
    """
    try:
        return (timestamps[segment + 1] - timestamps[segment]).total_seconds()
    except (IndexError, TypeError):
        return np.nan


# One row per trajectory, one column per pair of consecutive stops
# (len_stops - 1 columns); missing measurements are NaN.
vectors = np.array([
    [_segment_seconds(model.timestamps, segment) for segment in range(len_stops - 1)]
    for model in models
])

In [10]:
# Number of leading columns used as features; the remaining columns are the
# targets to predict.
N_FEATURES = 12

# Replace every NaN by the mean of its column.
# `sklearn.preprocessing.Imputer` no longer exists in current scikit-learn, so
# prefer its replacement `SimpleImputer` (always column-wise) and fall back to
# the legacy class only on installations that predate it.
try:
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
except ImportError:
    from sklearn.preprocessing import Imputer
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0)

# NOTE(review): the imputer is fitted on all rows (including those later held
# out as the test split) and also fills the target columns; confirm that this
# is acceptable for the evaluation.
imputed_vectors = imputer.fit_transform(vectors)

# Split the vectors in 2: features are v[:N_FEATURES] and targets are v[N_FEATURES:]
X, Y = imputed_vectors[:, :N_FEATURES], imputed_vectors[:, N_FEATURES:]

In [11]:
def score_distance(diff):
    """Return the norm of `diff` as computed by `np.linalg.norm` with defaults.

    For a 1-D residual vector this is its Euclidean length.
    """
    distance = np.linalg.norm(diff)
    return distance

In [12]:
def model_mean(errors):
    """Aggregate per-sample errors into one score: their arithmetic mean."""
    average = np.mean(errors)
    return average

In [30]:
%%time
try:
    # Current location of train_test_split.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy location, only present in old scikit-learn releases.
    from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsRegressor
import sys
import time


def to_minimize(args):
    """Objective for the optimiser: mean kNN prediction error on a hold-out split.

    Parameters
    ----------
    args : sequence of float
        ``args[0]`` is the number of neighbours (truncated to an int, so the
        objective is piecewise constant in that coordinate); ``args[1:]`` are
        multiplicative weights, one per column of the global ``X``.

    Returns
    -------
    float
        ``model_mean`` of the per-row ``score_distance`` between the held-out
        targets and the kNN predictions.
    """
    k = args[0]
    weights = np.asarray(args[1:])

    # Scale each feature column by its weight; broadcasting does this for all
    # rows at once instead of calling a Python function per row.
    weighted_X = X * weights

    # Fixed random_state: every evaluation of the objective sees the same split.
    X_train, X_test, Y_train, Y_test = train_test_split(weighted_X, Y, test_size=0.33, random_state=42)

    # The optimiser is free to propose any real k; keep the neighbour count in
    # the range the regressor accepts (at least 1, at most the training size).
    n_neighbors = min(max(int(k), 1), len(X_train))
    knn = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1, weights="distance").fit(X_train, Y_train)
    predictions = knn.predict(X_test)

    # One error per held-out row, then a single aggregated score.
    diff = Y_test - predictions
    predictions_scores = np.apply_along_axis(score_distance, 1, diff)
    return model_mean(predictions_scores)

# Starting point for the search: [k, weight_0, ..., weight_11].
# These values are a warm start copied from an earlier optimisation run; a
# cold start would be k = 400 with all weights equal to 1, i.e.
# np.ones(X.shape[1] + 1) with the first entry set to 400.
x0 = np.array([  4.00000000e+02,   1.74654691e+00,   9.84122877e-01,
         1.62503653e+00,  -5.79791608e-01,   8.44097232e-01,
         9.98533140e-01,   1.00000575e+00,   1.00019548e+00,
         1.80731763e+00,   1.10009561e+00,   3.60807133e-01,
         1.23609470e+00])

# The hard-coded vector must provide k plus exactly one weight per feature.
assert x0.shape == (X.shape[1] + 1,), (
    "x0 has %d entries, expected %d" % (x0.size, X.shape[1] + 1)
)

# Derivative-free Powell search; 'maxiter': 1 limits it to a single iteration
# per run of this cell.
xopt = scipy.optimize.minimize(to_minimize, x0, method='Powell', options={'maxiter': 1})

KeyboardInterrupt: 

# Comparons avec la STIB

In [27]:
# Display the result object returned by scipy.optimize.minimize.
xopt

   direc: array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])
     fun: 59.08371624471349

In [29]:
# Display the result object returned by scipy.optimize.minimize.
xopt

   direc: array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])
     fun: 59.04461142042652

In [None]:
# Build the hold-out split at notebook level: the X_train/X_test/Y_train/Y_test
# created inside to_minimize are local to that function, so without this line
# the cell fails with NameError on a fresh kernel. Same test_size and
# random_state as in to_minimize; the features are left unweighted because the
# comparison cell matches X_test rows against rows of X by equality.
# NOTE(review): the feature weights found by the optimiser are not applied
# here; confirm whether that is intended.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# Distance-weighted kNN regressor with a fixed neighbour count of 405.
knn = KNeighborsRegressor(n_neighbors=405, n_jobs=-1, weights="distance").fit(X_train, Y_train)
predictions = knn.predict(X_test)

In [None]:
# Number of immediately preceding rows of Y averaged to form the baseline
# prediction (labelled "STIB" in the plots below).
STIB_WINDOW = 3

stib_errors = []
knn_errors = []
for prediction, features, expected in zip(predictions, X_test, Y_test):
    # Locate the test row in the full, id-ordered data set by feature equality.
    indexes = np.where((X == features).all(axis=1))[0]
    if len(indexes) != 1:
        # No match or several identical rows: the position is ambiguous, skip.
        continue
    index = indexes[0]
    if index < STIB_WINDOW:
        # Fewer than STIB_WINDOW predecessors: no complete window to average.
        # (The guard used to be `index > 3`, which also dropped row 3 even
        # though rows 0..2 form a full window.)
        continue
    stib = np.mean(Y[index - STIB_WINDOW:index], axis=0)
    stib_errors.append(score_distance(expected - stib))
    knn_errors.append(score_distance(expected - prediction))

In [None]:
# Scatter of the baseline error (x) against the kNN error (y), one point per
# compared row. Red: the kNN error is strictly smaller; blue: otherwise.
colors = []
for stib_error, knn_error in zip(stib_errors, knn_errors):
    colors.append("red" if knn_error < stib_error else "blue")

plt.title("Comparaison des erreurs de la STIB vs kNN")
plt.xlabel("Erreur de la STIB")
plt.ylabel("Erreur du kNN")
plt.scatter(stib_errors, knn_errors, s=2, c=colors, edgecolors='none')

# Dashed y = x reference line, drawn up to the smaller of the two maxima.
largest_stib, largest_knn = max(stib_errors), max(knn_errors)
m = min(largest_stib, largest_knn)
plt.plot([0, m], [0, m], linestyle="dashed", color="gray")

# Anchor both axes at zero; the upper bounds stay automatic.
plt.xlim(xmin=0)
plt.ylim(ymin=0);

In [None]:
# Same scatter as the full-range plot, zoomed on errors below 300 with larger
# markers. Red: the kNN error is strictly smaller; blue: otherwise.
colors = []
for stib_error, knn_error in zip(stib_errors, knn_errors):
    colors.append("red" if knn_error < stib_error else "blue")

plt.title("Comparaison des erreurs de la STIB vs kNN (limité aux erreurs < 300)")
plt.xlabel("Erreur de la STIB")
plt.ylabel("Erreur du kNN")
plt.scatter(stib_errors, knn_errors, s=4, c=colors, edgecolors='none')

# Dashed y = x reference line, drawn up to the smaller of the two maxima.
largest_stib, largest_knn = max(stib_errors), max(knn_errors)
m = min(largest_stib, largest_knn)
plt.plot([0, m], [0, m], linestyle="dashed", color="gray")

# Restrict both axes to the [0, 300] window.
plt.xlim(0, 300)
plt.ylim(0, 300);

In [None]:
# For each compared row: is the kNN error strictly smaller than the baseline's?
knn_better = [k < s for s, k in zip(stib_errors, knn_errors)]

# Fraction (0..1) of True entries. np.mean always uses true division and yields
# NaN rather than raising ZeroDivisionError when there is nothing to compare.
kkn_better_percent = np.mean(knn_better)
# `{:.2%}` already appends the percent sign, so no literal `%` follows it.
print("Knn est meilleur dans {:.2%} des cas".format(kkn_better_percent))

In [None]:
# For each compared row: is the kNN error, scaled by 1.5, still below the
# baseline's?
knn1_better = [k * 1.5 < s for s, k in zip(stib_errors, knn_errors)]

# Fraction (0..1) of True entries. np.mean always uses true division and yields
# NaN rather than raising ZeroDivisionError when there is nothing to compare.
kkn1_better_percent = np.mean(knn1_better)
# `{:.2%}` already appends the percent sign, so no literal `%` follows it.
print("Knn est 1.5x meilleur dans {:.2%} des cas".format(kkn1_better_percent))

In [None]:
# For each compared row: is twice the kNN error still below the baseline's?
knn2_better = [k*2 < s for s, k in zip(stib_errors, knn_errors)]

# Fraction (0..1) of True entries. np.mean always uses true division and yields
# NaN rather than raising ZeroDivisionError when there is nothing to compare.
kkn2_better_percent = np.mean(knn2_better)
# `{:.2%}` already appends the percent sign, so no literal `%` follows it.
print("Knn est 2x meilleur dans {:.2%} des cas".format(kkn2_better_percent))

In [None]:
# For each compared row: is twice the baseline error still below the kNN error?
knn2_worst = [s*2 < k for s, k in zip(stib_errors, knn_errors)]

# Fraction (0..1) of True entries. np.mean always uses true division and yields
# NaN rather than raising ZeroDivisionError when there is nothing to compare.
knn2_worst_percent = np.mean(knn2_worst)
# `{:.2%}` already appends the percent sign, so no literal `%` follows it.
print("Knn est 2x pire dans {:.2%} des cas".format(knn2_worst_percent))

In [None]:
# For each compared row: is the baseline error, scaled by 1.5, still below the
# kNN error?
knn1_worst = [s*1.5 < k for s, k in zip(stib_errors, knn_errors)]

# Fraction (0..1) of True entries. np.mean always uses true division and yields
# NaN rather than raising ZeroDivisionError when there is nothing to compare.
knn1_worst_percent = np.mean(knn1_worst)
# `{:.2%}` already appends the percent sign, so no literal `%` follows it.
print("Knn est 1.5x pire dans {:.2%} des cas".format(knn1_worst_percent))

In [18]:
1

1