# Imports

In [None]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from scipy import stats
import sklearn
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import PolynomialFeatures
#importing the models
import Kmeans
import ALS
import NN
import Surprize

# Find predictor weights

In [None]:
# File locations used by the notebook: the two input CSV files and the
# submission file that is written at the very end.
training_path = "data/data_train.csv"
format_path = "data/sampleSubmission.csv"
submission_path = "submission.csv"

In [None]:
# Load the training data and the submission template into data frames.
print("Loading datasets")
try:
    input_ = pd.read_csv(training_path)
    format_ = pd.read_csv(format_path)
except FileNotFoundError:
    print("Impossible to load training or format files, "
          "please double check")
    # Re-raise instead of continuing: without `input_` and `format_` every
    # later cell would only fail with a misleading NameError.
    raise

In [None]:
# Split the loaded data into a training part and a 10% held-out part.
# The submodule is imported explicitly: a bare `import sklearn` does not
# guarantee that `sklearn.model_selection` is available as an attribute.
from sklearn.model_selection import train_test_split

# Seed NumPy's global generator, which train_test_split falls back to when
# no random_state is passed, so that the split is reproducible.
np.random.seed(1)
train, test = train_test_split(input_, test_size=0.1)

In [None]:
# Run every base model on the train / held-out split. Each call receives its
# own copies of the frames, so the originals are never handed to a model.

# Surprize model
predictions_surprize_test = Surprize.main(
    train.copy(),
    test.copy(),
    cache_name="test",
)

# ALS model
# NOTE(review): no cache_name is passed here, whereas the final run passes
# cache_name="final" -- confirm that ALS.main's default is the intended one.
predictions_als_test = ALS.main(train.copy(), test.copy())

# K-means model, with k fixed to 6 and unrounded output requested
k = 6
predictions_kmeans_test = Kmeans.main(
    train.copy(),
    test.copy(),
    k,
    rounded=False,
)

# NN model
predictions_nn_test = NN.main(train.copy(), test.copy())

In [None]:
# Held-out data re-indexed by "Id"; its single remaining column is renamed
# to "y" and serves as the regression target.
true_ratings_test = test.set_index("Id")
true_ratings_test.columns = ["y"]

# Place the four models' predictions and the target side by side.
concat_test = pd.concat(
    [
        predictions_als_test,
        predictions_kmeans_test,
        predictions_nn_test,
        predictions_surprize_test,
        true_ratings_test,
    ],
    axis=1,
    sort=False,
)

In [None]:
# Show the first five rows of the combined table.
concat_test.head(n=5)

## Augmentation

In [None]:
def augmentation(concat, degree=3):
    """Expand the predictor columns of *concat* into polynomial features.

    Every column except ``"y"`` is treated as a predictor and passed through
    ``PolynomialFeatures(degree)``. The generated columns are named
    positionally (``"1"``, ``"x0"``, ``"x1"``, ``"x0^2"``, ``"x0 x1"``, ...),
    so the names depend only on column order, not on the input column labels.

    Parameters
    ----------
    concat : pandas.DataFrame
        Predictor columns, optionally with a target column named ``"y"``.
    degree : int, default 3
        Degree of the polynomial expansion.

    Returns
    -------
    X : pandas.DataFrame
        Expanded features, indexed like *concat*.
    y : pandas.Series or None
        The ``"y"`` column of *concat*, or ``None`` when it is absent.
    """
    X = concat.loc[:, concat.columns != "y"]
    print("Augmenting {} columns".format(len(X.columns)))
    y = concat.loc[:, "y"] if "y" in concat.columns else None

    poly = PolynomialFeatures(degree)
    # Fit on the bare array rather than the frame so that scikit-learn keeps
    # generating positional names (x0, x1, ...) instead of reusing the
    # frame's column labels.
    expanded = poly.fit_transform(X.to_numpy())

    # `get_feature_names` was removed in scikit-learn 1.2; prefer its
    # replacement and only fall back on releases that predate it.
    name_getter = getattr(poly, "get_feature_names_out", None)
    if name_getter is None:
        name_getter = poly.get_feature_names

    X = pd.DataFrame(expanded, columns=list(name_getter()), index=concat.index)
    return X, y

In [None]:
# Polynomial features and target for the held-out split.
X_test, y_test = augmentation(concat=concat_test)

In [None]:
# Fit a cross-validated ridge regression on the augmented predictions and
# keep the per-alpha cross-validation errors for later inspection.
ridge_alphas = np.linspace(100, 250, 60)
try:
    # Keyword used by scikit-learn >= 1.5.
    rr = RidgeCV(alphas=ridge_alphas, store_cv_results=True)
except TypeError:
    # Older releases only know the previous keyword name.
    rr = RidgeCV(alphas=ridge_alphas, store_cv_values=True)
rr.fit(X_test, y_test)

# Map each augmented feature name to its fitted coefficient.
predictor_coefficients = dict(zip(X_test.columns, rr.coef_))

In [None]:
# Plot the mean cross-validation error against each candidate alpha.
# The stored errors are called `cv_results_` in recent scikit-learn releases
# and `cv_values_` in older ones; read whichever is present.
cv_errors = getattr(rr, "cv_results_", None)
if cv_errors is None:
    cv_errors = rr.cv_values_

# Take the x-axis from the fitted estimator so that it always matches the
# alphas the errors were computed for.
plt.plot(rr.alphas, cv_errors.mean(axis=0))
plt.xlabel("Ridge Lambda")
plt.ticklabel_format(useOffset=False)
plt.ylabel("MSE")
plt.title("Optimization of hyperparameters for Ridge")
plt.grid()
plt.savefig("figures/crossval_ridge.png")
plt.show()

# Final training and predictions

In [None]:
# Run every base model again, this time with the full training data as
# input and the submission template as the set of entries to predict.
# Each call receives its own copies of the frames.

# Surprize model
predictions_surprize_final = Surprize.main(
    input_.copy(),
    format_.copy(),
    cache_name="final",
)

# ALS model
predictions_als_final = ALS.main(
    input_.copy(),
    format_.copy(),
    cache_name="final",
)

# K-means model, with k fixed to 6 and unrounded output requested
k = 6
predictions_kmeans_final = Kmeans.main(
    input_.copy(),
    format_.copy(),
    k,
    rounded=False,
)

# NN model
predictions_nn_final = NN.main(input_.copy(), format_.copy())

In [None]:
# Place the four models' final predictions side by side and expand them
# into polynomial features.
concat_final = pd.concat(
    [
        predictions_als_final,
        predictions_kmeans_final,
        predictions_nn_final,
        predictions_surprize_final,
    ],
    axis=1,
    sort=False,
)
concat_aug_final, _ = augmentation(concat_final)
del concat_final  # drop the reference to the un-augmented frame

# Linear blend: start from the ridge intercept, then add every augmented
# feature weighted by its fitted coefficient, one column at a time.
blended = pd.Series(rr.intercept_, index=concat_aug_final.index)
for feature in concat_aug_final.columns:
    blended += concat_aug_final[feature] * predictor_coefficients[feature]

# Round to the nearest whole number, clamp to the range 1..5 and store
# the result as integers.
concat_aug_final["Prediction"] = blended.round().clip(1, 5).astype(int)
concat_aug_final.index.name = "Id"

In [None]:
# Write the index together with the "Prediction" column to the submission file.
concat_aug_final[["Prediction"]].to_csv(submission_path)