In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib notebook
import preprocessing
from sklearn.model_selection import KFold

In [None]:
# Fetch the three raw tables through the project's preprocessing helper
# ('data' is presumably the directory holding the input files — confirm in preprocessing).
df_spectra_raw, df_measures_raw, df_train_test_split_raw = preprocessing.get_initial_df('data')
# Column names handed to the preprocessing steps below as metadata columns
# (presumably kept out of the model inputs — confirm in preprocessing).
meta_cols = ['SiteCode', 'Date', 'flag','Latitude', 'Longitude', 'DUSTf:Unc']
# Column name handed to the preprocessing steps below as the regression target.
y_col = ['DUSTf:Value']


In [None]:
# Build the working frame from the raw spectra and measurements
# (presumably a join of the two tables — confirm in preprocessing.preparation).
merged = preprocessing.preparation(df_spectra_raw,df_measures_raw,meta_cols,y_col)
# Remove the raw frames from the kernel namespace; no later cell shown here references them.
%xdel df_spectra_raw
%xdel df_measures_raw

In [None]:
# Split the merged frame into training (X, y) and held-out (X_test, y_test) parts
# using the provided split table (exact split rule lives in preprocessing.splitting).
X, y, X_test, y_test = preprocessing.splitting(merged, df_train_test_split_raw, meta_cols, y_col)
# Remove the intermediates from the kernel namespace; no later cell shown here references them.
%xdel merged
%xdel df_train_test_split_raw

In [None]:
# Pick features using the training data only
# (the 30 is presumably the number of features to keep — confirm in preprocessing).
best_features = preprocessing.features_selection(X, y, 30)
# NOTE(review): X is overwritten with its own expansion, so re-running this cell
# feeds the already-expanded frame back into features_expansion.
# NOTE(review): X_test is not passed through features_expansion in the cells shown
# here — confirm it receives the same transformation before it is evaluated.
# The 4 is presumably the expansion degree — confirm in preprocessing.
X = preprocessing.features_expansion(X, 4, best_features)


# Cross validation

###### `build_k_indices` shuffles the row indices of X and then splits them into `k_fold` equally sized partitions (any leftover rows are dropped).

In [None]:
def build_k_indices(num_row,k_fold, seed):
    """Return shuffled row indices grouped into k_fold equally sized folds.

    The global NumPy random state is seeded with ``seed`` before the
    permutation of ``range(num_row)`` is drawn. Each fold holds
    ``int(num_row / k_fold)`` indices, so rows that do not fit evenly are
    left out. The result is a 2-D array with one row per fold.
    """
    fold_size = int(num_row / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(num_row)
    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start: start + fold_size])
    return np.array(folds)

##### compute mse

In [None]:
def compute_mse(y, tx, w):
    """Return half the mean squared residual of the linear model ``tx.dot(w)``.

    y: observed targets.
    tx: feature matrix.
    w: weight vector.
    """
    residual = y - tx.dot(w)
    return 0.5 * np.mean(residual ** 2)


#####  ridge regression

In [None]:
def ridge_regression(y, tx, lambda_):
    """Solve the ridge-regularised normal equations and return the weights.

    The system solved is ``(tx^T tx + 2 * N * lambda_ * I) w = tx^T y``,
    where N is the number of rows of ``tx`` and I is the identity matrix
    sized to the number of columns of ``tx``.
    """
    n_samples = tx.shape[0]
    n_features = tx.shape[1]
    penalty = 2 * n_samples * lambda_ * np.identity(n_features)
    lhs = tx.T.dot(tx) + penalty
    rhs = tx.T.dot(y)
    weights = np.linalg.solve(lhs, rhs)
    return weights

#### cross validation

In [None]:
def cross_validation(x_tr, y_tr, x_te, y_te, lambda_):
    """Fit ridge regression on one fold and score it on both partitions.

    Returns a tuple ``(loss_tr, loss_te, w)``: the RMSE on the training
    partition, the RMSE on the evaluation partition, and the fitted weights.
    """
    weights = ridge_regression(y_tr, x_tr, lambda_)

    def rmse(targets, features):
        # compute_mse returns half the mean squared error, hence the factor 2.
        return np.sqrt(2 * compute_mse(targets, features, weights))

    return rmse(y_tr, x_tr), rmse(y_te, x_te), weights

###### demo

##### cross_validation_visualization

In [None]:
def cross_validation_visualization(lambds, mse_tr, mse_te):
    """Plot the train and test error curves against lambda and save the figure.

    Both curves are drawn on the current pyplot figure with a log-scaled
    x axis, and the figure is written to the file "cross_validation".
    """
    curves = (
        (mse_tr, 'b', 'train error'),
        (mse_te, 'r', 'test error'),
    )
    for errors, colour, name in curves:
        plt.semilogx(lambds, errors, marker=".", color=colour, label=name)
    plt.title("cross validation")
    plt.xlabel("lambda")
    plt.ylabel("rmse")
    plt.grid(True)
    plt.legend(loc=2)
    plt.savefig("cross_validation")

In [None]:
def cross_validation_demo(k_fold=10, seed=12):
    """Sweep the ridge penalty and return mean k-fold train/validation RMSE.

    Reads the notebook-level training data ``X`` and ``y`` (pandas objects,
    assumed to be row-aligned — TODO confirm) and, for each lambda in
    ``np.logspace(-4, 0, 30)``, fits ridge regression on the training part of
    every fold and scores it on that fold's held-out part.

    The held-out ``X_test`` / ``y_test`` are deliberately not used here:
    model selection must only see the training data.

    Parameters
    ----------
    k_fold : int
        Number of cross-validation folds.
    seed : int
        Random state for the shuffled split, so the curves are reproducible.

    Returns
    -------
    lambdas : numpy.ndarray
        The penalties that were evaluated.
    rmse_tr : list of float
        Mean training RMSE across folds, one entry per lambda.
    rmse_te : list of float
        Mean validation RMSE across folds, one entry per lambda.
    """
    lambdas = np.logspace(-4, 0, 30)

    # Build the folds once so every lambda is scored on exactly the same
    # partitions; otherwise differences between lambdas are confounded with
    # differences between random splits.
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=seed)
    folds = list(kf.split(X))

    # mean loss per lambda on the training and validation parts
    rmse_tr = []
    rmse_te = []

    for lambda_ in lambdas:
        rmse_tr_tmp = []
        rmse_te_tmp = []

        for ind_tr, ind_te in folds:
            # KFold yields positional indices, so select rows with iloc.
            # The validation part is the held-out rows (ind_te) of X / y.
            loss_tr, loss_te, _ = cross_validation(
                X.iloc[ind_tr],
                y.iloc[ind_tr],
                X.iloc[ind_te],
                y.iloc[ind_te],
                lambda_,
            )
            rmse_tr_tmp.append(loss_tr)
            rmse_te_tmp.append(loss_te)

        rmse_tr.append(np.mean(rmse_tr_tmp))
        rmse_te.append(np.mean(rmse_te_tmp))
    return lambdas, rmse_tr, rmse_te

# Run the lambda sweep, then plot both RMSE curves against lambda
# (the plotting helper also saves the figure to "cross_validation").
lambdas, rmse_tr, rmse_te = cross_validation_demo()
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

In [None]:
def calculate_mae(e):
    """Return the mean absolute value of the entries of error vector ``e``."""
    abs_errors = np.abs(e)
    return np.mean(abs_errors)

In [None]:
# Take the first (train, test) index pair of a 5-fold shuffled split and keep
# its training-row positions. The shuffle is unseeded, so the selection
# differs between runs.
kf = KFold(n_splits=5, shuffle=True)
x = next(kf.split(X))
ind_tr = x[0]

In [None]:
# Display the rows of X whose positions are listed in ind_tr
# (positions are mapped to index labels first, then looked up with .loc).
X.loc[X.index[ind_tr]]