# Imports

In [27]:
import numpy as np
import pandas as pd
from scipy import stats
#importing the models
import Kmeans
import ALS
import NN

# Final Functions

In [28]:
def median_vote(predictions):
    """Combine the models' predictions by taking the per-item median.

    Parameters
    ----------
    predictions : array-like, one row per model and one column per item
        The predictions to aggregate.

    Returns
    -------
    numpy.ndarray
        The median over the rows (axis 0), rounded down with ``np.floor``.
        The values are whole numbers but remain floating point.
    """
    #with an even number of rows the median is the mean of the two middle values and may
    #end in .5, hence the rounding down to the nearest whole rating
    return np.floor(np.median(predictions,axis=0))

In [29]:
def mode_vote(predictions):
    """Combine the models' predictions by taking the per-item most frequent value.

    Parameters
    ----------
    predictions : array-like, one row per model and one column per item
        The predictions to aggregate.

    Returns
    -------
    numpy.ndarray
        The modal value over the rows (axis 0), exactly as reported by
        ``scipy.stats.mode`` (which picks the smallest value on ties).
    """
    #the result of stats.mode exposes the modal values as its first field, `mode`
    result=stats.mode(predictions,axis=0)
    return result.mode

In [79]:
def maj_vote(predictions):
    """Majority vote that falls back on the first model when there is no majority.

    For every item (column) the most frequent prediction is kept when at least half
    of the models agree on it; otherwise the prediction of the first row is used.

    Parameters
    ----------
    predictions : pandas.DataFrame or array-like, one row per model and one column per item
        The predictions to aggregate. The first row is the fallback model.

    Returns
    -------
    numpy.ndarray of shape (n_items, 1)
        The voted prediction of each item, as a column vector.
    """
    #working on a plain array: indexing a Series with [:, np.newaxis] is rejected by recent pandas
    values=np.asarray(predictions)
    #the first row is the fallback used when no value reaches a majority
    top=values[0]
    mode=stats.mode(values,axis=0)
    #flattening both fields so the result does not depend on whether scipy keeps the
    #reduced axis (otherwise a 1-D count would broadcast against a column into an n x n array)
    most_common=np.ravel(mode[0])
    freq=np.ravel(mode[1])/values.shape[0]
    #a value wins as soon as it is predicted by at least half of the models
    voted=np.where(freq<0.5,top,most_common)
    #returned as a column vector, the shape this function has always produced
    return voted[:,np.newaxis]

In [81]:
def vote(voting_f, training_path="data/data_train.csv", format_path="data/sampleSubmission.csv",
         submission_path='submission.csv', kmeans_ks=(5,6,7)):
    """Run every model, aggregate their predictions with `voting_f` and export the result.

    The models are run in this order: ALS, one k-means per value of `kmeans_ks`, then NN.
    That order is also the row order of the table handed to `voting_f`, so ALS is the
    first row.

    Parameters
    ----------
    voting_f : callable
        Receives a DataFrame with one row per model output and one column per 'Id',
        and returns one value per 'Id' (any of the shapes (n,), (1, n) or (n, 1)).
    training_path : str
        Path of the training data, passed as is to each model's `main`.
    format_path : str
        Path of the sample submission, passed as is to each model's `main`.
    submission_path : str
        Path of the CSV file written at the end.
    kmeans_ks : iterable of int
        The numbers of clusters for which a k-means prediction is computed.

    Returns
    -------
    None
        The result is written to `submission_path` with the 'Id' index and a single
        'Prediction' column.
    """
    #computing the prediction of the ALS algorithm
    model_outputs=[('ALS',ALS.main(training_path, format_path))]
    #computing multiple predictions of the kmeans algorithm
    for k in kmeans_ks:
        model_outputs.append(('Kmeans{}'.format(k),Kmeans.main(training_path, format_path, k)))
    #computing the prediction of the NN algorithm
    model_outputs.append(('NN',NN.main(training_path, format_path)))
    #aggregating on 'Id'; each model's columns get their own prefix because, assuming the
    #models name their prediction column identically (not visible here), chained merges
    #would produce repeated '_x'/'_y' suffixes, which recent pandas refuses
    predictions=None
    for label,frame in model_outputs:
        labelled=frame.set_index('Id').add_prefix(label+'_')
        predictions=labelled if predictions is None else predictions.join(labelled,how='inner')
    #finding the best prediction through the voting function
    print('Voting...')
    #flattening so that row vectors, column vectors and 1-D results are all accepted
    voted=np.ravel(voting_f(predictions.T))
    final=pd.DataFrame({'Prediction':voted},index=predictions.index)
    #exporting the final prediction using the submission path
    print('Exporting the final prediction...')
    final.to_csv(submission_path)
    print('Done!')

In [82]:
#running the whole pipeline (ALS, k-means and NN) and aggregating with the majority vote;
#the result is written to 'submission.csv'
vote(maj_vote)

Loading datasets
Spliting train/test
the shape of original ratings. (# of row, # of col): (10000, 1000)
Splitting progression: 100.0%
Total number of nonzero elements in original data:1,176,952
Total number of nonzero elements in train data:1,065,327
Total number of nonzero elements in test data:111,625
Trying to retrieve cached optimal matrix factorization
Successfully retrieved cached optimal matrix factorization
Emitting predictions 1176952/1176952
Kmeans for k= 5 :
The current iteration of k-means is: 42, the average loss is 114.58407192869866.
Kmeans for k= 6 :
The current iteration of k-means is: 24, the average loss is 113.83491094376913.
Kmeans for k= 7 :
The current iteration of k-means is: 49, the average loss is 113.39338894580459.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1059256 samples, validate on 117696 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Generating predictions
Voting...
Exporting the final prediction...
Done!


# Experimental Code

In [33]:
#useful constants, shared by the step-by-step cells below
#(these are the same paths vote() uses)
submission_path='submission.csv'
training_path = "data/data_train.csv"
format_path = "data/sampleSubmission.csv"

In [34]:
#computing the predictions of the ALS algorithm,
#indexed by 'Id' so that they can be aligned with the other models on the index
predictions_als=ALS.main(training_path, format_path).set_index("Id")

Loading datasets
Spliting train/test
the shape of original ratings. (# of row, # of col): (10000, 1000)
Splitting progression: 100.0%
Total number of nonzero elements in original data:1,176,952
Total number of nonzero elements in train data:1,065,327
Total number of nonzero elements in test data:111,625
Trying to retrieve cached optimal matrix factorization
Successfully retrieved cached optimal matrix factorization
Emitting predictions 1176952/1176952


In [35]:
#computing the prediction of the kmeans algorithm for a single number of clusters (k=5 only here),
#indexed by 'Id' so that it can be aligned with the other models on the index
k=5
predictions_kmeans = Kmeans.main(training_path, format_path, k).set_index("Id")

Kmeans for k= 5 :
The current iteration of k-means is: 42, the average loss is 114.58407192869866.


In [36]:
#computing the prediction of the NN algorithm,
#indexed by 'Id' so that it can be aligned with the other models on the index
predictions_nn = NN.main(training_path, format_path).set_index("Id")

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1059256 samples, validate on 117696 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Generating predictions


In [80]:
#aligning the three models on their shared 'Id' index: kmeans first, then ALS, then NN
kmeans_and_als = predictions_kmeans.merge(predictions_als, left_index=True, right_index=True)
concat = kmeans_and_als.merge(predictions_nn, left_index=True, right_index=True)
#temporary names for the three model columns, in merge order
concat.columns = ["Pred1", "Pred2", "Pred3"]
#finding the best prediction through the voting function
#(Pred1, the kmeans column, is the first row of concat.T, i.e. the one maj_vote falls back on)
voting_f=maj_vote
concat['Prediction']=voting_f(concat.T)
#keeping only the voted column
concat = concat.drop(columns=["Pred1", "Pred2", "Pred3"])
concat

Unnamed: 0_level_0,Prediction
Id,Unnamed: 1_level_1
r1000_c1,4.0
r1141_c1,4.0
r1146_c1,4.0
r1157_c1,3.0
r1184_c1,4.0
...,...
r9949_c999,4.0
r9961_c999,3.0
r9965_c999,3.0
r9988_c999,4.0


In [None]:
#exporting the final prediction using the submission path
#(the 'Id' index is written as the first column, followed by the single 'Prediction' column)
concat.to_csv(submission_path)