In [1]:
# Enable inline plotting. As the cell output reports, %pylab also populates the
# interactive namespace from numpy and matplotlib.
# NOTE(review): the `pylab` name used on the next line is presumably provided by
# this magic (no explicit `import pylab` is visible) — confirm before replacing it.
%pylab inline
# Set the default matplotlib figure size to (20, 10).
pylab.rcParams['figure.figsize'] = (20, 10)

import matplotlib.pyplot as plt
import psycopg2
import numpy as np
import sklearn
import requests
import scipy

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Open the PostgreSQL connection to the "delay" database that the query cell uses.
conn = psycopg2.connect(
    database="delay",
    user="nikita",
)
# Autocommit: each statement is committed without an explicit commit() call.
conn.autocommit = True

In [3]:
# Line number and direction ("way") whose recorded trajectories are loaded.
line, way = 95, 2

cur = conn.cursor()
# Let psycopg2 bind the values instead of interpolating them into the SQL text.
# `line` is passed as a string because the original query compared the column
# against a quoted literal ('95'); `way` stays an integer.
cur.execute(
    "SELECT * FROM traject WHERE line=%s AND way=%s ORDER BY id;",
    (str(line), way),
)
# All matching rows, ordered by id.
data = cur.fetchall()

In [4]:
from collections import namedtuple

# Lightweight record for one row of the query result. Fields are filled
# positionally, so their order must match the column order of each row in `data`.
Traject = namedtuple("Traject", ['id', 'line', 'way', 'timestamps', 'start'])
models = list(map(Traject._make, data))

In [5]:
# Fetch the stop list for this line/direction from the STIB API.
response = requests.get(
    "https://stib-mivb-api.herokuapp.com/network/line/%s/%s" % (line, way),
    timeout=30,  # never let an unresponsive server hang the kernel forever
)
# Fail with a clear HTTP error rather than a confusing JSON/KeyError further down.
response.raise_for_status()
stib_stops = response.json()['stops']

# Number of timestamps in the longest recorded trajectory.
len_stops = max(len(model.timestamps) for model in models)

# Sanity check: the longest trajectory must have one timestamp per API stop.
assert len(stib_stops) == len_stops, (
    "stop count mismatch: API reports %d stops, longest trajectory has %d timestamps"
    % (len(stib_stops), len_stops)
)

In [6]:
# Build one row per trajectory holding the time in seconds between each pair of
# consecutive timestamps. A segment becomes NaN when a timestamp is missing
# (index past the end of the list) or cannot be subtracted (TypeError).
n_segments = len_stops - 1
rows = []
for model in models:
    stamps = model.timestamps
    row = []
    for i in range(n_segments):
        try:
            seconds = (stamps[i + 1] - stamps[i]).total_seconds()
        except (IndexError, TypeError):
            seconds = np.nan
        row.append(seconds)
    rows.append(row)

vectors = np.array(rows)

In [7]:
# Remove vectors with too many NaN values: a row is kept only when it contains
# strictly fewer than NAN_LIMIT missing segment durations.
NAN_LIMIT = 18
# Vectorised NaN count per row (replaces a per-element Python filter/lambda).
bools = np.isnan(vectors).sum(axis=1) < NAN_LIMIT
good_vectors = vectors[bools]

In [9]:
# scikit-learn relocated these helpers over time. Prefer the current import
# locations and fall back to the legacy ones so the cell runs on old and new
# installations alike.
try:
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
except ImportError:
    from sklearn.preprocessing import Imputer
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0)
# Replace each NaN with the mean of its column.
imputed_vectors = imputer.fit_transform(good_vectors)

# Split the vectors in 2: features are v[6:12] and targets are v[12:]
X, Y = imputed_vectors[:,6:12], imputed_vectors[:,12:]

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [10]:
from sklearn.neighbors import KNeighborsRegressor

In [11]:
def score_distance(diff):
    """Return the norm of ``diff`` as computed by ``np.linalg.norm`` with default arguments.

    For a 1-D error vector this is its Euclidean length.
    """
    magnitude = np.linalg.norm(diff)
    return magnitude

In [12]:
def model_mean(errors):
    """Collapse a collection of per-sample errors into one score: their arithmetic mean."""
    average = np.mean(errors)
    return average

# Cherchons le meilleur k

In [13]:
def score_k(k, weights="uniform"):
    """Score a k-nearest-neighbours regressor on the held-out split.

    Parameters
    ----------
    k : number
        Neighbour count. It is truncated with ``int`` because the scalar
        optimiser that calls this function proposes real-valued candidates.
    weights : str or callable, optional
        Forwarded to ``KNeighborsRegressor``. The default ``"uniform"`` is
        scikit-learn's own default, so ``score_k(k)`` behaves as before.

    Returns
    -------
    float
        Mean over the test rows of the norm of ``Y_test - prediction``.
    """
    knn = KNeighborsRegressor(n_neighbors=int(k), n_jobs=-1, weights=weights)
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_test)

    # One error value per test row (norm of the residual vector), then averaged.
    diff = Y_test - predictions
    predictions_scores = np.apply_along_axis(score_distance, 1, diff)
    return model_mean(predictions_scores)


# `import scipy` alone does not guarantee that the `optimize` submodule is
# loaded; import it explicitly instead of relying on another package having
# done so as a side effect.
import scipy.optimize

# Bounded scalar search over k in [1, 1000] for the value minimising score_k.
# At most 20 iterations, stopping once the candidate is within 0.1 of the optimum.
opts = {'maxiter': 20, 'disp': True, 'xatol': 0.1}
res = scipy.optimize.minimize_scalar(score_k, bounds=(1, 1000), method='bounded', options=opts)
# score_k truncates k with int(), so report the truncated value as well.
print("Best k:", int(res.x))
print(res)

Best k: 582
     fun: 140.53599809127573
 message: 'Solution found.'
    nfev: 18
  status: 0
 success: True
       x: 582.72327797168964


# Cherchons le meilleur k (avec du weighted)

In [15]:
def score_weighted_k(k):
    """Score a distance-weighted k-nearest-neighbours regressor on the held-out split.

    ``k`` is truncated with ``int`` before being used as the neighbour count.
    Returns the mean over the test rows of the norm of ``Y_test - prediction``.
    """
    regressor = KNeighborsRegressor(weights="distance", n_neighbors=int(k), n_jobs=-1)
    regressor.fit(X_train, Y_train)

    # Residual vector per test row, reduced to one error value each, then averaged.
    residuals = Y_test - regressor.predict(X_test)
    per_row_error = np.apply_along_axis(score_distance, 1, residuals)
    return model_mean(per_row_error)


# `import scipy` alone does not guarantee that the `optimize` submodule is
# loaded; import it explicitly instead of relying on another package having
# done so as a side effect.
import scipy.optimize

# Bounded scalar search over k in [1, 1000] for the value minimising
# score_weighted_k. At most 30 iterations, stopping once the candidate is
# within 0.1 of the optimum.
opts = {'maxiter': 30, 'disp': True, 'xatol': 0.1}
res = scipy.optimize.minimize_scalar(score_weighted_k, bounds=(1, 1000), method='bounded', options=opts)
# score_weighted_k truncates k with int(), so report the truncated value as well.
print("Best k:", int(res.x))
print(res)

Best k: 990
     fun: 60.398182363155811
 message: 'Solution found.'
    nfev: 20
  status: 0
 success: True
       x: 990.72901743030832
