In [48]:
import numpy as np
# import scipy
import pandas as pd
from pandas import Series, DataFrame
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.metrics import mean_squared_error
# from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import f_regression, SelectKBest
import heapq

In [73]:
# Load the trip counts and the weather observations, then join the weather
# columns onto the trip rows.
df_train = pd.read_csv('citibike_train.csv')
df_weather = pd.read_csv('weather.csv')
# No `on=` is given, so pandas joins on the column(s) the two frames share.
df_train = pd.merge(df_train, df_weather)  # inner-merge on 'date'
col_names = ('holiday', 'n_stations', 'AWND', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN')

# Split the merged data into predictors (X) and the response variable (y).
# The predictors are rescaled with MinMaxScaler so that every feature
# contributes on a comparable scale to the Euclidean distance used by k-NN.
# X is built as a brand-new float DataFrame instead of writing the scaled
# values back into a slice of df_train: in-place `.loc` assignment of floats
# relies on implicit dtype upcasting for any integer-typed column (deprecated
# in recent pandas) and may raise SettingWithCopyWarning.
scaler = MinMaxScaler()
X = DataFrame(
    scaler.fit_transform(df_train[list(col_names)]),
    columns=list(col_names),
    index=df_train.index,
)
y = df_train['trips'].values

In [59]:
# k-NN Implementation as requested in Part-I
def myknn(xtrain, xtest, ytrain, k):
    """Predict responses with k-nearest-neighbour regression.

    For every sample in ``xtest`` the Euclidean distance to every sample in
    ``xtrain`` is computed, and the prediction is the arithmetic mean of the
    responses of the ``k`` closest training samples.

    Parameters
    ----------
    xtrain : sequence of 1-D numpy arrays (e.g. a 2-D array)
        Training feature vectors.
    xtest : sequence of 1-D numpy arrays (e.g. a 2-D array)
        Feature vectors to predict for.
    ytrain : sequence of numbers
        Responses aligned with ``xtrain``.
    k : int
        Number of neighbours to average; must be >= 1.

    Returns
    -------
    numpy.ndarray
        One prediction per sample in ``xtest``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or there are no training samples.
    """
    if k < 1:
        raise ValueError('k must be a positive integer')
    ytest = []
    for sample in xtest:
        # (distance, response) pairs: tuples order by distance first and
        # fall back to the response value on exact distance ties.
        pq = [(np.linalg.norm(sample - x), y) for x, y in zip(xtrain, ytrain)]
        if not pq:
            raise ValueError('xtrain/ytrain must contain at least one sample')
        # nsmallest works on any iterable (no heapify needed) and returns
        # fewer than k items when k > len(pq).
        neighbours = heapq.nsmallest(k, pq)
        # Divide by the number of neighbours actually found, not by k, so the
        # result stays a true mean when k exceeds the training-set size.
        ytest.append(sum(y for _, y in neighbours) / len(neighbours))
    return np.array(ytest)

# For a given hyper-parameter k, evaluate the k-NN regression algorithm using the MSE metric
def knn_eval(X, y, k):
    """Return the mean test MSE of ``myknn`` with ``k`` neighbours.

    Three random 80/20 train/test splits (fixed seed, so the result is
    repeatable) are drawn; k-NN is run on each split and the per-split mean
    squared errors are averaged.

    X and y must support integer-array indexing (e.g. numpy arrays).
    """
    splitter = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
    fold_errors = [
        mean_squared_error(
            myknn(X[train_idx], X[test_idx], y[train_idx], k),
            y[test_idx],
        )
        for train_idx, test_idx in splitter.split(y)
    ]
    return np.mean(fold_errors)

In [61]:
# Sweep k over 1..14 and report the cross-validated MSE for each value.
# In the printed results below the minimum MSE is reached at k=5.
for k in range(1, 15):
    cv_mse = knn_eval(X.values, y, k)
    print('k =', k, ', MSE =', cv_mse)

k = 1 , MSE = 36097413.7629
k = 2 , MSE = 31817278.1957
k = 3 , MSE = 27579732.7664
k = 4 , MSE = 25534499.9962
k = 5 , MSE = 24973220.943
k = 6 , MSE = 25231560.2922
k = 7 , MSE = 25105194.0267
k = 8 , MSE = 25724942.4863
k = 9 , MSE = 25685613.7176
k = 10 , MSE = 26445754.0921
k = 11 , MSE = 26819659.6065
k = 12 , MSE = 26939161.1318
k = 13 , MSE = 27339775.7475
k = 14 , MSE = 27577721.4778


In [62]:

# Univariate F-test (f_regression) of every predictor against the response;
# f_regression returns the F-scores and their p-values.
df_scores = DataFrame(index=col_names)
df_scores['f_scores'], df_scores['p_values'] = f_regression(X, y)  # ANOVA
print(df_scores, '\n')

# Linear-regression baseline: keep the _k predictors with the highest
# F-scores, for _k going from all predictors down to 1, and report the
# 5-fold cross-validated MSE of ordinary least squares on that subset.
# The upper bound is taken from X itself rather than hard-coding the
# number of predictors.
# NOTE: when the loop finishes, `predictors` holds the last (_k = 1) subset.
for _k in range(X.shape[1], 0, -1):
    selector = SelectKBest(score_func=f_regression, k=_k)
    predictors = X.columns[selector.fit(X, y).get_support()]
    # The scorer returns negated MSE, so flip the sign back.
    score = -cross_val_score(LinearRegression(), X[predictors], y,
                             cv=5, scoring='neg_mean_squared_error').mean()
    print('n_predictors =', _k, ', MSE =', score)

               f_scores       p_values
holiday       31.330383   2.811084e-08
n_stations    57.975122   6.136093e-14
AWND           3.013424   8.288599e-02
PRCP          76.429654   9.516125e-18
SNOW          77.979643   4.592917e-18
SNWD         278.913381   2.072658e-55
TMAX        1357.607275  2.223230e-188
TMIN        1177.324491  4.217873e-171 

n_predictors = 8 , MSE = 33635189.8286
n_predictors = 7 , MSE = 33825871.9043
n_predictors = 6 , MSE = 36075188.8484
n_predictors = 5 , MSE = 53137076.0568
n_predictors = 4 , MSE = 60436713.2585
n_predictors = 3 , MSE = 61306330.0529
n_predictors = 2 , MSE = 64852674.2832
n_predictors = 1 , MSE = 64542467.085


In [76]:
# The best k-NN model (k=5, cross-validated MSE = 24973220.943) beat every
# linear model tried above, so k-NN with k=5 is used to predict the test data.
df_test = pd.merge(pd.read_csv('citibike_test.csv'), df_weather)
# Reuse the scaler fitted on the training predictors; do not refit on test data.
X_test = scaler.transform(df_test.loc[:, list(col_names)])
# Train on ALL scaled predictors, exactly as in the cross-validation that
# selected k=5. `predictors`, left over from the feature-selection loop, only
# holds the last subset tried there and would not line up with X_test's columns.
y_predict = myknn(X.values, X_test, y, 5)
DataFrame({'date': df_test['date'], 'trips': y_predict}).to_csv('HW1_kd538.csv')