# Imports

In [None]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from scipy import stats
import sklearn
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV
#importing the models
import Kmeans
import ALS
import NN
import Surprize

# Final Functions

In [None]:
def median_vote(predictions):
    """Combine predictions by taking their median along axis 0.

    The median of an even number of values can fall on a half value, so the
    result is floored to get whole-number values back (the dtype stays float).
    """
    return np.floor(np.median(predictions, axis=0))

In [None]:
def mode_vote(predictions):
    """Combine predictions by taking the most frequent value along axis 0."""
    # scipy returns a (modal values, counts) pair; only the values are needed.
    modal_values, _counts = stats.mode(predictions, axis=0)
    return modal_values

In [None]:
def maj_vote(predictions):
    """Majority vote that falls back to the first predictor.

    For every column of ``predictions`` the most frequent value is kept when
    it is shared by at least half of the rows; otherwise the value of the
    first row is used instead.

    Parameters
    ----------
    predictions : 2-D array-like (DataFrame or ndarray)
        One row per predictor, one column per item to predict.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(n_items, 1)`` holding the selected values.
    """
    values = np.asarray(predictions)
    modal_values, counts = stats.mode(values, axis=0)
    # Flatten both outputs: depending on the scipy version the reduced axis is
    # either kept (shape (1, n)) or dropped (shape (n,)).
    modal_values = np.ravel(modal_values)
    frequency = np.ravel(counts) / values.shape[0]
    # Plain ndarray row instead of 2-D indexing on a pandas Series, which
    # recent pandas versions no longer allow.
    top = values[0]
    selected = np.where(frequency < 0.5, top, modal_values)
    return selected[:, np.newaxis]

In [None]:
def cluster_vote(predictions):
    """Combine predictions by averaging along axis 0 and rounding the mean."""
    averaged = np.mean(predictions, axis=0)
    return np.round(averaged)

In [None]:
def vote(voting_f):
    """Run the ALS, K-means and NN models and combine them with ``voting_f``.

    The training data and the submission template are read from fixed paths,
    the per-model predictions are merged on ``'Id'``, and ``voting_f`` is
    called on the transposed table (one row per model). The combined
    prediction is written to ``submission.csv``.

    Parameters
    ----------
    voting_f : callable
        Receives the transposed predictions table and returns the values
        stored in the ``'Prediction'`` column.

    Returns
    -------
    pandas.DataFrame
        The ``'Prediction'`` column indexed by ``'Id'``, or an empty frame
        when one of the input files cannot be found.
    """
    # Fixed locations of the inputs and of the exported submission
    training_path = "data/data_train.csv"
    format_path = "data/sampleSubmission.csv"
    submission_path = 'submission.csv'

    print("Loading datasets")
    try:
        ratings = pd.read_csv(training_path)
        template = pd.read_csv(format_path)
    except FileNotFoundError:
        print("Impossible to load training or format files, "
              "please double check")
        return pd.DataFrame([])

    # The ALS predictions start the table that the other models are merged into
    predictions = ALS.main(ratings.copy(), template.copy())
    print(predictions.head(), template.head())

    # One K-means run per value in the list below
    for n_clusters in [6]:
        kmeans_predictions = Kmeans.main(ratings.copy(), template.copy(), n_clusters)
        predictions = predictions.merge(kmeans_predictions, on='Id')
        print(predictions.shape)

    # NN predictions
    nn_predictions = NN.main(ratings.copy(), template.copy())
    predictions = predictions.merge(nn_predictions, on='Id')
    print(predictions.shape)

    # Index by 'Id' so that only prediction columns are handed to the vote
    predictions = predictions.set_index('Id')

    print('Voting...')
    predictions['Prediction'] = voting_f(predictions.T)

    print('Exporting the final prediction...')
    predictions[['Prediction']].to_csv(submission_path)
    print('Done!')
    return predictions[['Prediction']]

In [None]:
# Run the full voting pipeline with the rounded-mean vote; vote() also writes submission.csv
coco=vote(cluster_vote)

# Find predictor weights

In [None]:
# Locations of the input datasets and of the exported submission
training_path = "data/data_train.csv"
format_path = "data/sampleSubmission.csv"
submission_path = "submission.csv"

In [None]:
# Load the training ratings and the submission template
print("Loading datasets")
try:
    input_ = pd.read_csv(training_path)
    format_ = pd.read_csv(format_path)
except FileNotFoundError:
    print("Impossible to load training or format files, "
          "please double check")
    # Re-raise instead of continuing: every later cell needs input_ and
    # format_, so swallowing the error would only resurface as a NameError
    # (or silently reuse stale data from an earlier run of this cell).
    raise

In [None]:
# Hold out 10% of the training data.
# The submodule is imported explicitly: a bare `import sklearn` does not
# guarantee that `sklearn.model_selection` is available as an attribute.
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG, which train_test_split falls back to when no
# random_state is given, so that the split is reproducible.
np.random.seed(1)
train, test = train_test_split(input_, test_size=0.1)

In [None]:
# Predictions of every model on the held-out split. All four results are
# required by the concatenation that follows, so none of them may stay
# commented out when the notebook is run on a fresh kernel.

#computing the predictions of the Surprize algorithm
predictions_surprize_test = Surprize.main(train.copy(), test.copy(), 
                                          cache_name="test", force_recompute=["SVD", "NMF"])

#computing the predictions of the ALS algorithm
predictions_als_test = ALS.main(train.copy(), test.copy())

#computing the best prediction of the kmeans algorithm
k = 6
predictions_kmeans_test = Kmeans.main(train.copy(), test.copy(), k, rounded=False)

#computing the prediction of the NN algorithm
predictions_nn_test = NN.main(train.copy(), test.copy())

In [None]:
# True values of the held-out split: indexed by "Id", with the single
# remaining column renamed to "y"
true_ratings_test = test.copy().set_index("Id")
true_ratings_test.columns = ["y"]

# Model predictions and the target side by side, one column block per source
concat_test = pd.concat(
    [
        predictions_als_test,
        predictions_kmeans_test,
        predictions_nn_test,
        predictions_surprize_test,
        true_ratings_test,
    ],
    axis=1,
    sort=False,
)

In [None]:
# Preview the first rows of the combined held-out table
concat_test.head()

In [None]:
# Fit a cross-validated ridge regression that maps the model predictions
# (every column except "y") onto the true values ("y").
# NOTE(review): the fit uses the whole held-out table; no further split of
# it is made here.
X = concat_test.loc[:, concat_test.columns != "y"]
y = concat_test["y"]
rr = RidgeCV(alphas=np.linspace(1e-5, 5, 3000), store_cv_values=True)
rr.fit(X, y)
# Learned weight of each predictor column, keyed by column name
predictor_coefficients = {name: weight for name, weight in zip(X.columns, rr.coef_)}

In [None]:
# Score of the fitted ridge model on the same X, y it was fitted on
rr.score(X,y)

In [None]:
# Score of the fitted ridge model on the same X, y it was fitted on
# NOTE(review): this cell repeats the expression of the cell just before it
rr.score(X,y)

In [None]:
# Plot the stored cross-validation values, averaged over axis 0, against the
# same alpha grid that was passed to RidgeCV (presumably one value per alpha
# after averaging - TODO confirm). The trailing semicolon suppresses the
# notebook's text output for the cell.
plt.plot(np.linspace(1e-5, 5, 3000), rr.cv_values_.mean(axis=0));

In [None]:
# Display the ridge weight learned for each predictor column
predictor_coefficients

# Final training and predictions

In [None]:
# Run every model on the full training data to predict the submission entries
predictions_surprize_final = Surprize.main(
    input_.copy(),
    format_.copy(),
    cache_name="final",
    force_recompute=["SVD", "NMF"],
)
predictions_als_final = ALS.main(input_.copy(), format_.copy(), cache_name="final")

# K-means is run with k=6 and rounded=False
k = 6
predictions_kmeans_final = Kmeans.main(input_.copy(), format_.copy(), k, rounded=False)
predictions_nn_final = NN.main(input_.copy(), format_.copy())

# Per-model predictions side by side, one column block per model
concat_final = pd.concat(
    [
        predictions_als_final,
        predictions_kmeans_final,
        predictions_nn_final,
        predictions_surprize_final,
    ],
    axis=1,
    sort=False,
)

In [None]:
# Preview the first rows of the combined final predictions
concat_final.head()

In [None]:
# Blend the model predictions with the ridge weights:
# Prediction = intercept + sum(weight[col] * col) over every predictor column
concat_final["Prediction"] = rr.intercept_
for col in concat_final:
    if col == "Prediction":
        # Skip the accumulator column itself
        continue
    concat_final["Prediction"] += concat_final[col] * predictor_coefficients[col]

# Round to the nearest whole value, clamp to [1, 5] and store as integers.
# Done with vectorized Series operations instead of a per-row Python lambda.
concat_final["Prediction"] = concat_final["Prediction"].round().clip(1, 5).astype(int)
concat_final.index.name = "Id"

In [None]:
# Preview the table again, now including the blended "Prediction" column
concat_final.head()

In [None]:
# Export only the "Prediction" column (the index is written as well) to the submission file
concat_final.to_csv(submission_path, columns=["Prediction"])