In [None]:
import gc
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils import save_results
from models import least_squares, gradient_descent, ridge_regression
from preprocessing import (features_expansion, features_selection,
                                    get_initial_df, preparation, splitting, build_poly)
from visualization import snr_plot, y_compare, ridge_regression_lambdas_visualization
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

In [None]:
# Path of the CSV file that collects one result row per experiment; it is
# (re)created with a header row further down and passed to every save_results call.
OUTPUT_FILE = "results.csv"

In [None]:
# Load the three input frames from the 'data' directory.
# NOTE(review): df_tts is only consumed by splitting() below — presumably it
# holds the train/test assignment; confirm against preprocessing.get_initial_df.
df_spectra, df_measures, df_tts = get_initial_df('data')
# Column names handed to preparation() and splitting() as the metadata columns.
meta_cols = ['SiteCode', 'Date', 'flag',
             'Latitude', 'Longitude']
# Column name handed to preparation()/splitting() as the uncertainty column.
unc_col = 'DUSTf:Unc'

# Column name handed to preparation()/splitting() as the regression target.
y_col = 'DUSTf:Value'

In [None]:
# Combine the spectra and measurement frames into a single frame via
# preprocessing.preparation, using the column configuration defined above.
merged = preparation(df_spectra, df_measures, meta_cols, unc_col, y_col)
# The source frames are not used again in this notebook; %xdel deletes the
# names and asks IPython to drop its own references to them.
%xdel df_spectra
%xdel df_measures
print("Shape: merged: {}".format(merged.shape))

In [None]:
# Split the merged frame into train/test features and targets using df_tts.
# splitting() returns six values; the third one is discarded here, and the
# last one (unc_test) is later passed to snr_plot together with the test data.
tx_train, y_train, _, tx_test, y_test, unc_test = splitting(merged,df_tts,meta_cols,unc_col,y_col)
print("Shapes:\n\ttx_train: {}\n\ty_train: {}\n\ttx_test: {}\n\ty_test: {}".format(tx_train.shape, y_train.shape, tx_test.shape, y_test.shape))
# merged and df_tts are not referenced after the split, so release them.
%xdel merged
%xdel df_tts

In [None]:
# Starting point for gradient descent: one weight per column of tx_train,
# drawn uniformly from [0, 1).
# A dedicated, seeded generator is used so that the initial weights -- and
# therefore the gradient-descent loss and weights reported below -- are the
# same on every "Restart & Run All" instead of changing from run to run.
gd_rng = np.random.RandomState(0)
gd_initial_weights = gd_rng.rand(tx_train.shape[1])
print(gd_initial_weights.shape)

In [None]:
# (Re)create the results file so that it contains only the header row; the
# save_results calls in the cells below are given this same path.
# newline="" is required by the csv module for files it writes: without it the
# writer's "\r\n" row terminator is translated a second time on Windows, which
# leaves an empty line after every row.
with open(OUTPUT_FILE, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["method", 'w', 'loss'])

# No feature preprocessing at all

## BASELINE

In [None]:
# Baseline: least squares on the untouched features. least_squares receives
# both the train and the test split and returns a (loss, weights) pair.
loss, weights_baseline = least_squares(tx_train, y_train, tx_test, y_test)
# Store the baseline row (label, weights, loss) through save_results.
save_results(OUTPUT_FILE, "simple least squares", weights_baseline, loss)
# Diagnostic figures computed on the test split with the fitted weights.
# NOTE(review): the trailing string looks like the output image name — confirm
# against visualization.y_compare / visualization.snr_plot.
y_compare(tx_test, y_test, weights_baseline, "ls_y-compare_simple.png")
snr_plot(tx_test, y_test, weights_baseline, unc_test, "ls_snr_simple.png")

## Gradient descent

In [None]:
# Gradient descent on the raw training features, starting from
# gd_initial_weights, with 1000 and 0.005 as the two trailing arguments.
# NOTE(review): presumably these are max iterations and step size — confirm
# against models.gradient_descent.
loss, weights_sgd = gradient_descent(tx_train, y_train, gd_initial_weights, 1000, 0.005)
# Store the weights gradient descent actually produced. The previous version
# passed weights_baseline here, so the CSV row for this method contained the
# least-squares weights next to the gradient-descent loss.
save_results(OUTPUT_FILE, "simple stochastic gradient descent", weights_sgd, loss)
# Diagnostic figures on the test split using the gradient-descent weights.
y_compare(tx_test, y_test, weights_sgd, "GD_y-compare_simple.png")
snr_plot(tx_test, y_test, weights_sgd, unc_test, "GD_snr_simple.png")

### Visualization of the difference of regression weights

In [None]:
# Overlay the per-feature weights of the least-squares baseline and of gradient
# descent. Each curve is labelled and a legend is drawn so the two lines can be
# told apart when the figure is read on its own.
plt.plot(tx_train.columns, weights_baseline, label="least squares")
plt.plot(tx_train.columns, weights_sgd, label="gradient descent")
plt.xlabel("feature")
plt.ylabel("regression weight")
plt.legend()
plt.show()

## Least-Squares with Cross-validation

In [None]:
# 10-fold cross-validation of least squares on the training split.
# random_state pins the shuffle: without it every run draws different folds,
# so the averaged weights and loss written to OUTPUT_FILE would not be
# reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
losses, ws = [], []
for ind_tr, ind_te in kf.split(tx_train):
    # Fit on the fold's training rows; the held-out rows are passed as the
    # second (x, y) pair of least_squares.
    loss, w = least_squares(tx_train.iloc[ind_tr].values,
                            y_train.iloc[ind_tr].values,
                            tx_train.iloc[ind_te].values,
                            y_train.iloc[ind_te].values)
    losses.append(loss)
    ws.append(w)

# Summarise the folds by the element-wise mean of the weight vectors and the
# mean of the per-fold losses.
weights_cross = np.mean(np.array(ws), axis=0)
loss_cross = np.mean(losses)
# Diagnostic figures on the test split using the fold-averaged weights.
y_compare(tx_test, y_test, weights_cross, "ls-cv_y-compare__simple.png")
snr_plot(tx_test, y_test, weights_cross, unc_test, "ls-cv_snr_simple.png")
save_results(OUTPUT_FILE, "least-squares with cross-validation", weights_cross, loss_cross)
print("Loss is {}".format(loss_cross))

### Visualization of the difference of regression weights

In [None]:
# Overlay the per-feature weights of the least-squares baseline and of the
# fold-averaged cross-validation fit. Each curve is labelled and a legend is
# drawn so the two lines can be told apart when the figure is read on its own.
plt.plot(tx_train.columns, weights_baseline, label="least squares")
plt.plot(tx_train.columns, weights_cross, label="least squares (CV average)")
plt.xlabel("feature")
plt.ylabel("regression weight")
plt.legend()
plt.show()

## Ridge regression

In [None]:
# Sweep 50 log-spaced regularisation strengths between 1e-17 and 1e-5 and keep
# the loss and weights returned by ridge_regression for each of them.
# NOTE(review): ridge_regression is given the test split and the best lambda is
# picked from the losses it returns — if that is the test loss, lambda is being
# tuned on test data; confirm against models.ridge_regression.
lambdas = np.logspace(-17, -5, 50)
losses, weights = [], []
for lambda_ in lambdas:
    loss, w_rr = ridge_regression(tx_train, y_train, tx_test, y_test, lambda_)
    losses.append(loss)
    weights.append(w_rr)

ridge_regression_lambdas_visualization(lambdas, losses, "loss_with_lambdas_ridge_regression.png")

# Locate the smallest loss once and reuse its position for lambda and weights.
best_idx = np.argmin(losses)
best_loss = np.min(losses)
best_lambda = lambdas[best_idx]
best_w = weights[best_idx]

save_results(OUTPUT_FILE, "ridge regression with best lambda",  best_w, best_loss)
y_compare(tx_test, y_test, best_w, "rr_y-compare_simple.png")
snr_plot(tx_test, y_test, best_w, unc_test, "rr_snr_simple.png")
print("Loss is {}".format(best_loss))

# Now with augmentation of the data

## Expand all features with higher degrees

In [None]:
# Degree handed to build_poly; it is also embedded in the figure file names and
# the result label of the "higher-degrees" experiments below.
power = 7
# Apply the same expansion to train and test so both have matching columns.
# NOTE(review): presumably build_poly adds powers of each feature up to
# `power` — confirm against preprocessing.build_poly.
tx_train_expanded = build_poly(tx_train, power)
tx_test_expanded = build_poly(tx_test, power)

In [None]:
# tx_train / tx_test are intentionally NOT deleted here: the feature-selection
# section further down still calls features_selection / features_expansion on
# them, so the two lines below must stay disabled.
# %xdel tx_train
# %xdel tx_test
# Drop the fold index arrays left over from the last cross-validation loop.
%xdel ind_tr
%xdel ind_te


## Least squares augmented

In [None]:
# Least squares on the polynomially expanded features; returns (loss, weights).
loss_ls_hd, weights_ls_hd = least_squares(tx_train_expanded, y_train, tx_test_expanded, y_test)
# Record the loss of THIS fit. The previous version passed the global `loss`,
# which at this point still holds the value left behind by the last iteration
# of the ridge-regression loop, so the CSV row paired these weights with an
# unrelated loss.
save_results(OUTPUT_FILE, "least squares with features expansion", weights_ls_hd, loss_ls_hd)
# Diagnostic figures on the expanded test split; the degree is part of the file name.
y_compare(tx_test_expanded, y_test, weights_ls_hd, "ls_y-compare_higher-degrees-{}.png".format(power))
snr_plot(tx_test_expanded, y_test, weights_ls_hd, unc_test, "ls_snr_higher-degrees-{}.png".format(power))
print("Loss is {}".format(loss_ls_hd))

## Least squares cross validation augmented

In [None]:
# 10-fold cross-validation of least squares on the expanded training features.
# random_state pins the shuffle so the folds -- and the averaged weights/loss
# written to OUTPUT_FILE -- are identical on every run.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
losses, ws = [], []
# Split the very frame that is indexed below, so the fold indices can never go
# out of step with the data they are applied to.
for ind_tr, ind_te in kf.split(tx_train_expanded):
    # Fit on the fold's training rows; the held-out rows are passed as the
    # second (x, y) pair of least_squares.
    loss, w = least_squares(tx_train_expanded.iloc[ind_tr].values,
                            y_train.iloc[ind_tr].values,
                            tx_train_expanded.iloc[ind_te].values,
                            y_train.iloc[ind_te].values)
    losses.append(loss)
    ws.append(w)

# Summarise the folds by the element-wise mean of the weight vectors and the
# mean of the per-fold losses.
weights_cross = np.mean(np.array(ws), axis=0)
loss_cross = np.mean(losses)
# Diagnostic figures on the expanded test split using the averaged weights.
y_compare(tx_test_expanded, y_test, weights_cross, "ls-cv_y-compare_higher-degrees-{}.png".format(power))
snr_plot(tx_test_expanded, y_test, weights_cross, unc_test, "ls-cv_snr_higher-degrees-{}.png".format(power))
save_results(OUTPUT_FILE, "ls-cv higher-degrees-{}".format(power), weights_cross, loss_cross)
print("Loss is {}".format(loss_cross))

## Ridge regression augmented

In [None]:
# Sweep 10 log-spaced regularisation strengths between 1e-6 and 1e-5 on the
# expanded features, printing each loss as it is obtained.
# NOTE(review): ridge_regression is given the test split and the best lambda is
# picked from the losses it returns — if that is the test loss, lambda is being
# tuned on test data; confirm against models.ridge_regression.
lambdas = np.logspace(-6, -5, 10)
losses, weights = [], []
for lambda_ in lambdas:
    loss, w_rr = ridge_regression(tx_train_expanded, y_train, tx_test_expanded, y_test, lambda_)
    losses.append(loss)
    weights.append(w_rr)
    print(loss)

ridge_regression_lambdas_visualization(lambdas, losses, "loss_with_lambdas_ridge_regression_complex_higher_degree.png")

# Locate the smallest loss once and reuse its position for lambda and weights.
best_idx = np.argmin(losses)
best_loss = np.min(losses)
best_lambda = lambdas[best_idx]
best_w = weights[best_idx]

save_results(OUTPUT_FILE, "ridge regression with best lambda higher degrees",  best_w, best_loss)
y_compare(tx_test_expanded, y_test, best_w, "rr_y-compare_higher-degrees.png")
snr_plot(tx_test_expanded, y_test, best_w, unc_test, "rr_snr_higher-degrees.png")
print("Loss is {}".format(best_loss))

## Expand 30 features with polynomial expansion

In [None]:
%xdel tx_train_expanded
%xdel tx_test_expanded
num_features = 30
power = 3
best_features = features_selection(tx_train, y_train, 30)
tx_train_expanded = features_expansion(tx_train, power, best_features)
tx_test_expanded = features_expansion(tx_test, power, best_features)

In [None]:
# Least squares on the selected-and-expanded features; returns (loss, weights).
# NOTE(review): this rebinds weights_baseline, so after this cell the name no
# longer refers to the raw-feature baseline fitted at the top of the notebook.
loss, weights_baseline = least_squares(tx_train_expanded, y_train, tx_test_expanded, y_test)
# Use a label that is unique in OUTPUT_FILE. The previous label,
# "least squares with features expansion", is already written by the
# all-features expansion experiment, which made the two CSV rows
# indistinguishable. The "complex" suffix mirrors the ridge-regression label
# used for this same feature set.
save_results(OUTPUT_FILE, "least squares with features expansion complex", weights_baseline, loss)
# Diagnostic figures on the expanded test split; the feature count and degree
# are part of the file names.
y_compare(tx_test_expanded, y_test, weights_baseline, "ls_y-compare_complex-{}-{}.png".format(num_features, power))
snr_plot(tx_test_expanded, y_test, weights_baseline, unc_test, "ls_snr_complex-{}-{}.png".format(num_features, power))
print("Loss is {}".format(loss))

In [None]:
# Sweep 10 log-spaced regularisation strengths between 1e-20 and 1e-3 on the
# selected-and-expanded features, printing each loss as it is obtained.
# NOTE(review): ridge_regression is given the test split and the best lambda is
# picked from the losses it returns — if that is the test loss, lambda is being
# tuned on test data; confirm against models.ridge_regression.
lambdas = np.logspace(-20, -3, 10)
losses, weights = [], []
for lambda_ in lambdas:
    loss, w_rr = ridge_regression(tx_train_expanded, y_train, tx_test_expanded, y_test, lambda_)
    losses.append(loss)
    weights.append(w_rr)
    print(loss)

ridge_regression_lambdas_visualization(lambdas, losses, "loss_with_lambdas_ridge_regression_complex.png")

# Locate the smallest loss once and reuse its position for lambda and weights.
best_idx = np.argmin(losses)
best_loss = np.min(losses)
best_lambda = lambdas[best_idx]
best_w = weights[best_idx]

save_results(OUTPUT_FILE, "ridge regression with best lambda complex",  best_w, best_loss)
y_compare(tx_test_expanded, y_test, best_w, "rr_y-compare_complex-{}-{}.png".format(num_features, power))
snr_plot(tx_test_expanded, y_test, best_w, unc_test, "rr_snr_complex-{}-{}.png".format(num_features, power))
print("Loss is {}".format(best_loss))