# Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection
from scipy import stats
#importing the models
import Kmeans
import ALS
import NN

Using TensorFlow backend.


# Final Functions

In [2]:
def median_vote(predictions):
    """Aggregate predictions with an element-wise median, rounded down.

    The median is taken along axis 0, i.e. across the rows of
    ``predictions``. With an even number of rows the median can fall on a
    half value, so the result is floored to a whole number. The values are
    whole numbers but keep a floating-point dtype.
    """
    return np.floor(np.median(predictions, axis=0))

In [3]:
def mode_vote(predictions):
    """Aggregate predictions by picking the most frequent value along axis 0.

    Only the modal values are returned; the accompanying counts produced by
    ``stats.mode`` are discarded.

    NOTE(review): whether the reduced axis is kept as a length-1 dimension
    depends on the installed SciPy version -- confirm the shape before
    assigning the result to a DataFrame column.
    """
    most_common, _counts = stats.mode(predictions, axis=0)
    return most_common

In [4]:
def maj_vote(predictions):
    """Majority vote with a fallback to the first row of ``predictions``.

    For every column the most frequent value (the mode along axis 0) is
    kept when it was chosen by at least half of the rows; otherwise the
    value from the first row is used instead.

    Parameters
    ----------
    predictions : DataFrame or 2-D array-like
        One row per voter, one column per item to predict.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(n_items, 1)`` holding the elected values.
    """
    # Work on a plain array: indexing a Series with [:, np.newaxis] is not
    # supported by recent pandas releases.
    votes = np.asarray(predictions)
    fallback = votes[0]
    mode_result = stats.mode(votes, axis=0)
    # Flatten both outputs so the code does not depend on whether this SciPy
    # version keeps the reduced axis as a length-1 dimension.
    winners = np.ravel(mode_result[0])
    share = np.ravel(mode_result[1]) / votes.shape[0]
    elected = np.where(share < 0.5, fallback, winners)
    return elected[:, np.newaxis]

In [5]:
def cluster_vote(predictions):
    """Aggregate predictions by averaging along axis 0 and rounding.

    The mean is taken across the rows of ``predictions`` and then passed
    through ``np.round`` to land on a whole number.
    """
    averaged = np.mean(predictions, axis=0)
    return np.round(averaged)

In [6]:
def vote(voting_f):
    """Run every model, combine their predictions and write the submission.

    The ALS, k-means and NN models are each given copies of the training
    and sample-submission frames; their outputs are merged on ``'Id'`` and
    handed (transposed, one row per model) to ``voting_f``. The elected
    values are stored in a ``'Prediction'`` column, written to
    ``submission.csv`` and returned.

    Parameters
    ----------
    voting_f : callable
        Receives the transposed table of per-model predictions and returns
        the combined prediction for every Id.

    Returns
    -------
    pandas.DataFrame
        The ``'Prediction'`` column indexed by ``'Id'``, or an empty frame
        when one of the input files cannot be found.
    """
    # Fixed file locations used by the pipeline.
    train_csv = "data/data_train.csv"
    sample_csv = "data/sampleSubmission.csv"
    output_csv = 'submission.csv'

    print("Loading datasets")
    try:
        ratings = pd.read_csv(train_csv)
        targets = pd.read_csv(sample_csv)
    except FileNotFoundError:
        print("Impossible to load training or format files, "
              "please double check")
        return pd.DataFrame([])

    # Start from the ALS predictions, then attach the other models by Id.
    combined = ALS.main(ratings.copy(), targets.copy())
    for k in [6]:
        kmeans_pred = Kmeans.main(ratings.copy(), targets.copy(), k)
        combined = combined.merge(kmeans_pred, on='Id')
        print(combined.shape)
    nn_pred = NN.main(ratings.copy(), targets.copy())
    combined = combined.merge(nn_pred, on='Id')

    # Index by Id so that only prediction columns are passed to the vote.
    combined = combined.set_index('Id')

    print('Voting...')
    combined['Prediction'] = voting_f(combined.T)

    print('Exporting the final prediction...')
    combined[['Prediction']].to_csv(output_csv)
    print('Done!')
    return combined[['Prediction']]

In [7]:
# Build the final submission, combining the models with the rounded-mean vote.
coco = vote(voting_f=cluster_vote)

Loading datasets
Trying to retrieve cached optimal matrix factorization
Successfully retrieved cached optimal matrix factorization
Emitting predictions 1176952/1176952
Kmeans for k= 6 :
The current iteration of k-means is: 24, the average loss is 113.83491094376913.
(1176952, 3)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1059256 samples, validate on 117696 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Generating predictions
Voting...
Exporting the final prediction...
Done!


# Cross-Validation

In [None]:
# File locations shared by the cells of this section.
training_path = "data/data_train.csv"
format_path = "data/sampleSubmission.csv"
submission_path = 'submission.csv'

In [None]:
# Load the training ratings and the sample-submission file.
print("Loading datasets")
try:
    input_ = pd.read_csv(training_path)
    format_ = pd.read_csv(format_path)
except FileNotFoundError:
    print("Impossible to load training or format files, "
          "please double check")
    # Stop here: swallowing the error would leave `input_`/`format_` undefined
    # and make the following cells fail with an unrelated NameError.
    raise

In [None]:
# Hold out 10% of the ratings for evaluation. The seed is fixed so that the
# split -- and therefore the RMSE reported further down -- is reproducible
# across kernel restarts.
train, test = sklearn.model_selection.train_test_split(input_, test_size=0.1, random_state=0)

In [None]:
# ALS predictions for the held-out rows, indexed by Id. Copies are passed so
# that `train` and `test` stay untouched whatever ALS.main does to its inputs.
predictions_als = (
    ALS.main(train.copy(), test.copy())
    .set_index("Id")
)

In [None]:
# K-means predictions for the held-out rows, indexed by Id.
# NOTE(review): k is 1 here while the final pipeline in `vote` uses k=6 --
# confirm which value is intended for the evaluation.
k = 1
predictions_kmeans = (
    Kmeans.main(train.copy(), test.copy(), k)
    .set_index("Id")
)

In [None]:
# Sanity check: line up the held-out ratings with the ALS predictions for the
# same Ids and show only the first rows. Joining on the index avoids looking
# up a hard-coded Id, which fails whenever that Id is not part of the random
# hold-out, and keeps the output small.
test.set_index('Id').join(predictions_als, lsuffix='_true', rsuffix='_als').sort_index().head()

In [None]:
# NN predictions for the held-out rows, indexed by Id. Copies are passed so
# that `train` and `test` stay untouched whatever NN.main does to its inputs.
predictions_nn = (
    NN.main(train.copy(), test.copy())
    .set_index("Id")
)

In [None]:
# Put the three models' held-out predictions side by side, joined on the Id index.
concat = predictions_kmeans.merge(predictions_als, left_index=True, right_index=True)
concat = concat.merge(predictions_nn, left_index=True, right_index=True)
pred_cols = ["Pred1", "Pred2", "Pred3"]
concat.columns = pred_cols
# Combine the models with the chosen voting function. Every voting function
# defined in this notebook takes the prediction table as its only argument.
voting_f = cluster_vote
concat['Prediction'] = voting_f(concat[pred_cols].T)
# Hold-out RMSE of the combined prediction against the true ratings
# (the subtraction aligns on Id and on the shared column name).
errors = test.set_index("Id") - concat[['Prediction']]
print(np.sqrt((errors ** 2).mean()))
# Keep only the combined prediction for display.
concat = concat.drop(columns=pred_cols)
concat.head()