# Final Ensemble Model

In this notebook we produce the final submission file: for every entry, the prediction is the unweighted average of the predictions from a pLSA model, a VAE and an SVD++ model.

In [42]:
import numpy as np
import pandas as pd

In [None]:
# Define a helper function

def clean_df(df):
    '''
    Split the combined "Id" strings (e.g. "r37_c1") into two integer columns.

    The "r<number>" part becomes 'row' (users) and the "c<number>" part becomes
    'col' (movies); both are shifted by one so that they start at 0.
    The 'Prediction' column is carried over unchanged and the index of df is kept.
    '''
    def _zero_based(token, prefix):
        # e.g. token "r37" with prefix "r" -> 36
        return int(token.split(prefix)[1]) - 1

    # "r37_c1" -> ["r37", "c1"]
    id_parts = df["Id"].map(lambda raw_id: raw_id.split("_"))
    users = id_parts.map(lambda parts: _zero_based(parts[0], "r"))
    movies = id_parts.map(lambda parts: _zero_based(parts[1], "c"))

    return pd.DataFrame({"row": users, "col": movies, "Prediction": df["Prediction"]})

In [51]:
# Load the submission CSVs written by the individual models,
# plus the sample submission that serves as the output template.

# File locations -- adjust these if the model outputs live somewhere else.
path_svdpp = "../data/sgdpp2-14.csv"
path_vae = "../data/preds-vae.csv"
path_plsa = "../data/submission_20200623_1142.csv"
path_sample = "../data/sample-Submission.csv"

# One DataFrame per file, read in the same order as the paths above.
model_svdpp = pd.read_csv(path_svdpp)
model_vae = pd.read_csv(path_vae)
model_plsa = pd.read_csv(path_plsa)
submission = pd.read_csv(path_sample)

In [52]:
# represent with separated columns

preds_svdpp = clean_df(model_svdpp)
preds_vae = clean_df(model_vae)
preds_plsa = clean_df(model_plsa)
# Use the clean_df helper defined in this notebook: no `utils` module is
# imported here, so `utils.clean_df` fails with a NameError on a fresh kernel.
preds_submission = clean_df(submission)

# Every model (including pLSA) must provide exactly as many predictions as the
# sample submission; otherwise the element-wise average further down is invalid.
assert preds_svdpp.shape == preds_submission.shape, "SVD++ predictions do not match the sample submission shape"
assert preds_vae.shape == preds_submission.shape, "VAE predictions do not match the sample submission shape"
assert preds_plsa.shape == preds_submission.shape, "pLSA predictions do not match the sample submission shape"

## Take a look at the submodel predictions

In [53]:
# Preview the raw VAE predictions as loaded from CSV.
# In the rows displayed below, the Ids share a row and vary by column (r1_c4, r1_c8, ...).
model_vae.head()

Unnamed: 0,Id,Prediction
0,r1_c4,3.800149
1,r1_c8,3.513709
2,r1_c21,3.314896
3,r1_c102,4.080699
4,r1_c127,3.332319


In [55]:
# Preview the raw pLSA predictions as loaded from CSV.
# In the rows displayed below, the Ids share a column and vary by row (r37_c1, r73_c1, ...).
model_plsa.head()

Unnamed: 0,Id,Prediction
0,r37_c1,3.174658
1,r73_c1,2.965754
2,r156_c1,3.859685
3,r160_c1,2.960093
4,r248_c1,3.531102


In [56]:
# Preview the raw SVD++ predictions as loaded from CSV.
# In the rows displayed below, the Ids share a column and vary by row (r37_c1, r73_c1, ...).
model_svdpp.head()

Unnamed: 0,Id,Prediction
0,r37_c1,3.353905
1,r73_c1,3.247953
2,r156_c1,3.780025
3,r160_c1,3.420531
4,r248_c1,3.550057


## Average predictions for ensemble

In [57]:
# create ensemble predictions
# take average as new prediction

# First bring every frame into the same (col, row) order. The model files are
# not all written in the same order, and the average below is purely positional.
# sort_values returns a new frame (the original index labels are kept), so
# re-running this cell is harmless.
sort_keys = ["col", "row"]
preds_vae = preds_vae.sort_values(by=sort_keys)
preds_svdpp = preds_svdpp.sort_values(by=sort_keys)
preds_plsa = preds_plsa.sort_values(by=sort_keys)
preds_submission = preds_submission.sort_values(by=sort_keys)

# Equal shapes alone do not guarantee that position i refers to the same
# (row, col) pair in every frame. Check the keys explicitly so that a file
# covering different entries fails loudly instead of being averaged silently.
expected_keys = preds_submission[sort_keys].values
misaligned = [
    name
    for name, frame in (("vae", preds_vae), ("svdpp", preds_svdpp), ("plsa", preds_plsa))
    if not np.array_equal(frame[sort_keys].values, expected_keys)
]
assert not misaligned, f"(row, col) pairs differ from the sample submission for: {misaligned}"

# Unweighted element-wise mean over the three models.
preds = np.mean(np.array([preds_vae["Prediction"].values,
                          preds_svdpp["Prediction"].values,
                          preds_plsa["Prediction"].values]), axis = 0)

# preds is a plain array, so this assignment is positional (matches the sorted order).
preds_submission["Prediction"] = preds

preds_submission.head()

Unnamed: 0,row,col,Prediction
0,36,0,3.289629
1,72,0,3.181528
2,155,0,3.908686
3,159,0,3.291031
4,247,0,3.435881


## Export prediction csv

In [58]:
# Copy the ensemble predictions back into the original submission frame.
# Assigning a Series aligns on index labels, not on position, so each value
# lands on the row of `submission` that carries the same index label.
ensemble_prediction = preds_submission["Prediction"]
submission["Prediction"] = ensemble_prediction

# Write the final file without the DataFrame index column.
submission.to_csv("ensemble_svdpp_vae_plsa.csv", index=False)