In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import time
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
#import csv
import sys as sys
from numpy import array
from numpy import argmax
from operator import itemgetter
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import normalize
from collections import defaultdict

torch.manual_seed(1)

<torch._C.Generator at 0x7f646c06ab50>

In [2]:
def remove_single_actions(df):
  df = df.drop(df[(df['action_type'] == "clickout item") & (df['step'] == 1)].index)
  return df
  
def remove_test_single_actions(df_test, df_gt):
  df_sessions = df_test.groupby('session_id')

  for group_name, df_group in df_sessions:
    session_len = 0

    for action_index, action in df_group.iterrows():
      session_len = session_len + 1
    
    if session_len == 1:
      df_test = df_test.drop(df_test[df_test['session_id'] == action['session_id']].index)
      df_gt = df_gt.drop(df_gt[df_gt['session_id'] == action['session_id']].index)
      #df = df.drop(df[(df['action_type'] == "clickout item") & (df['step'] == 1)].index)
    
  return df_test, df_gt  
  
def remove_nonitem_actions(df):
  df = df.drop(df[(df['action_type'] != 'interaction item image') & (df['action_type'] != 'interaction item deals') & (df['action_type'] != 'clickout item') & (df['action_type'] != 'search for item')].index)
  return df

def reduce_df(df, dim):
  df = df.head(dim)
  return pd.DataFrame(df)

def get_corpus(df):
  session_id = ''
  temp_session = []
  splitted_sessions = []
  impressions = []

  for action_index, action in df.iterrows():
    if session_id == '':
      session_id = action['session_id']

    if session_id != action['session_id']:
      splitted_sessions.append(temp_session)
      splitted_sessions = splitted_sessions + impressions
      temp_session = []
      impressions = []

    temp_session.append(action['reference'])
    session_id = action['session_id']
    
    if action['action_type'] == 'clickout item':
      impressions.append(action['impressions'].split('|')[:8])
      

  return splitted_sessions

def generate_prices_sparse_matrix(df, features_col='intervals'):
    df['present'] = 1
    hotel_dict = create_item_dict(df) #Controllare che sia uguale all'altro dizionario
    feature_dict = create_item_dict(df, col_name='feature')
    list_hotel = list(df['reference'])
    list_features = list(df['feature'])
    list_data = list(df['present'])
    n_items = len(list_hotel)
    n_features = len(list_features)
    # Convert each list of string in a list of indexes
    list_items = list(map(lambda x: hotel_dict[x], list_hotel))
    list_features = list(map(lambda x: feature_dict[x], list_features))
    # Generate the sparse matrix
    row = np.array(list_items)
    col = np.array(list_features)
    data = np.array(list_data)
    csr = csr_matrix((data, (row, col)), shape=(n_items, n_features))

    return csr, hotel_dict
  
def get_hotel_prices(df_metadata, n_categories = 2000):
    """
    Required Input -
        - metadata_file = file with the average price for each hotel
    """
    #print("Reading metadata: " + metadata_file)
    df_metadata['price'] = df_metadata['price'].apply(lambda x: math.log10(x))
    # Define the range
    max_price = df_metadata['price'].max()
    min_price = df_metadata['price'].min()
    range = (max_price - min_price) / n_categories
    # Generate the classes
    df_metadata['intervals'] = pd.cut(df_metadata['price'], bins=np.arange(min_price,max_price,range))
    df_metadata.loc[:, 'intervals'] = df_metadata['intervals'].apply(str)
    #classes_dic = create_user_dict(df_metadata, col_name = 'intervals')
    #df_metadata.loc[:, 'intervals'] = df_metadata['intervals'].apply(lambda x : classes_dic.get(x))
    #df_metadata.loc[:, 'intervals'] = df_metadata['intervals'].apply(int)
    # Create a dictionary of item_id -> price_category
    price_dic = pd.Series(df_metadata.intervals.values,index=df_metadata.impressions).to_dict()


    return price_dic

In [3]:
df_encode = pd.read_csv("./encode_1.csv")
df_encode = remove_single_actions(df_encode)
df_encode = remove_nonitem_actions(df_encode)

In [4]:
len(df_encode)

133689

In [5]:
#df_encode = reduce_df(df_encode, 80000)

In [6]:
df_train = pd.read_csv("./train_1.csv")
df_train = remove_single_actions(df_train)
df_train =  remove_nonitem_actions(df_train)

In [7]:
#df_train = reduce_df(df_train, 10000)

In [8]:
df_test = pd.read_csv("./test_1.csv")
df_test = remove_single_actions(df_test)
df_test = remove_nonitem_actions(df_test)

In [9]:
#df_test = reduce_df(df_test, 1000)

In [10]:
df_gt = pd.read_csv("./gt_1.csv")

In [11]:
#df_gt = reduce_df(df_gt, 1000)

In [12]:
df_test, df_gt = remove_test_single_actions(df_test, df_gt)

In [13]:
df_meta = pd.read_csv("./item_metadata.csv")

In [14]:
df_prices = pd.read_csv("./hotel_prices.csv")

In [15]:
price_dict = get_hotel_prices(df_prices, n_categories = 100)

In [16]:
len(list(set(price_dict.values())))

100

In [17]:
corpus = get_corpus(df_encode)

gensim trial

In [18]:
from gensim.models import Word2Vec

In [19]:
def normalize_word2vec(word2vec):
  hotels_pre_norm = []

  for hotel in word2vec.wv.index2word:
    hotels_pre_norm.append(word2vec.wv[hotel].tolist())

  hotels_pre_norm = np.asarray(hotels_pre_norm)
  hotels_post_norm = normalize(hotels_pre_norm, norm='l2', axis=0, copy=True, return_norm=False)
  
  hotels_post_norm = hotels_post_norm.tolist()

  for hotel in word2vec.wv.index2word:
    word2vec.wv[hotel] = np.asarray(hotels_post_norm[0])
    hotels_post_norm.pop(0)
    
  return word2vec

In [20]:
word2vec = Word2Vec(corpus, min_count=1, window=3, sg=1) 

In [21]:
n_features = len(word2vec.wv['666856'])
n_features

100

In [22]:
word2vec.wv['666856']

array([-1.43385236e-03, -5.85974567e-03,  5.68891130e-03,  2.70580947e-02,
        1.29421921e-02,  2.66364329e-02, -8.32236721e-04,  1.46591952e-02,
       -2.99845990e-02, -7.90513400e-03, -3.40466737e-03,  2.26807594e-02,
       -1.98431052e-02,  1.89517569e-02,  1.93012320e-02,  3.63550469e-04,
       -3.90568711e-02,  1.29658533e-02,  3.23108844e-02,  1.14307953e-02,
        7.28571601e-03,  1.83826890e-02, -2.41642371e-02,  2.62602814e-03,
        2.24970169e-02,  1.22538442e-02, -2.71763243e-02, -3.01339999e-02,
       -3.05952989e-02,  4.40039113e-03,  5.37967728e-03,  4.52082109e-04,
       -1.24047883e-02,  3.16534601e-02, -8.49470869e-03,  5.11352764e-03,
       -1.89415533e-02,  7.47018377e-04,  5.26629388e-03,  7.59479590e-03,
       -1.87180862e-02,  1.33755943e-02,  6.46037795e-03, -7.92148802e-03,
       -1.96938310e-02,  1.94945000e-02,  2.76693050e-02,  1.03600519e-02,
        1.77046936e-02, -4.33681570e-02, -1.98492501e-02,  2.74053053e-03,
       -3.19321500e-03, -

In [23]:
word2vec.wv.most_similar(positive = '4102552')

[('8242182', 0.9504700899124146),
 ('5502296', 0.9466519355773926),
 ('6785', 0.943709135055542),
 ('1369270', 0.9411848783493042),
 ('101932', 0.9406952261924744),
 ('44355', 0.9406229257583618),
 ('5050642', 0.9405085444450378),
 ('5104290', 0.9395586252212524),
 ('2805646', 0.9393075704574585),
 ('4493122', 0.9390658736228943)]

In [24]:
#hotel_dict = normalize_word2vec(word2vec.wv)
hotel_dict = word2vec.wv

In [25]:
word2vec.wv['666856']

array([-1.43385236e-03, -5.85974567e-03,  5.68891130e-03,  2.70580947e-02,
        1.29421921e-02,  2.66364329e-02, -8.32236721e-04,  1.46591952e-02,
       -2.99845990e-02, -7.90513400e-03, -3.40466737e-03,  2.26807594e-02,
       -1.98431052e-02,  1.89517569e-02,  1.93012320e-02,  3.63550469e-04,
       -3.90568711e-02,  1.29658533e-02,  3.23108844e-02,  1.14307953e-02,
        7.28571601e-03,  1.83826890e-02, -2.41642371e-02,  2.62602814e-03,
        2.24970169e-02,  1.22538442e-02, -2.71763243e-02, -3.01339999e-02,
       -3.05952989e-02,  4.40039113e-03,  5.37967728e-03,  4.52082109e-04,
       -1.24047883e-02,  3.16534601e-02, -8.49470869e-03,  5.11352764e-03,
       -1.89415533e-02,  7.47018377e-04,  5.26629388e-03,  7.59479590e-03,
       -1.87180862e-02,  1.33755943e-02,  6.46037795e-03, -7.92148802e-03,
       -1.96938310e-02,  1.94945000e-02,  2.76693050e-02,  1.03600519e-02,
        1.77046936e-02, -4.33681570e-02, -1.98492501e-02,  2.74053053e-03,
       -3.19321500e-03, -

In [36]:
#preparing training data

#gets the training set and splits it in subsessions populated by the item of the action
def prepare_input(df_train):
  training_set = []
  category_set = []
  hotels_window_set = []
  
  df_sessions = df_train.groupby('session_id')

  for group_name, df_group in df_sessions:
    sub_sessions = []
    categories = []
    temp_session = []
    hotels_window = []

    for action_index, action in df_group.iterrows():
      if action['action_type'] == 'clickout item':
        sub_sessions.append(temp_session)
        temp_session.append(action)
        categories.append(action['reference'])
        hotels_window.append(action['impressions'].split('|'))
      else:
        temp_session.append(action)
        
    #training_set.concatenate(sub_sessions)
    #category_set.concatenate(categories)
    #hotels_window_set.concatenate(hotels_window)
    training_set = training_set + sub_sessions
    category_set = category_set + categories
    hotels_window_set = hotels_window_set + hotels_window
    
    
  return training_set, category_set, hotels_window_set

#gets the training set and splits it in subsessions populated by the item of the action
def prepare_input_batched(df_train, batch_size):
  training_set = []
  category_set = []
  hotels_window_set = []

  training_set_batched = []
  category_set_batched = []
  hotels_window_set_batched = []

  df_sessions = df_train.groupby('session_id')

  for group_name, df_group in df_sessions:
    sub_sessions = []
    categories = []
    temp_session = []
    hotels_window = []

    for action_index, action in df_group.iterrows():
      if action['action_type'] == 'clickout item':
        sub_sessions.append(temp_session)
        temp_session.append(action)
        categories.append(action['reference'])
        hotels_window.append(action['impressions'].split('|'))
      else:
        temp_session.append(action)
        
    #training_set.concatenate(sub_sessions)
    #category_set.concatenate(categories)
    #hotels_window_set.concatenate(hotels_window)
    training_set = training_set + sub_sessions
    category_set = category_set + categories
    hotels_window_set = hotels_window_set + hotels_window
  
  temp_session_batched = []
  temp_category_batched = []
  temp_hotel_window_batched = []
  
  for si, session in enumerate(training_set):
    temp_session_batched.append(session)
    temp_category_batched.append(category_set[si])
    temp_hotel_window_batched.append(hotels_window_set[si])
  
    if len(temp_session_batched) == batch_size:
      training_set_batched.append(temp_session_batched)
      category_set_batched.append(temp_category_batched)
      hotels_window_set_batched.append(temp_hotel_window_batched)
      temp_session_batched = []
      temp_category_batched = []
      temp_hotel_window_batched = []

  if len(temp_session_batched) != 0:
    training_set_batched.append(temp_session_batched)
    category_set_batched.append(temp_category_batched)
    hotels_window_set_batched.append(temp_hotel_window_batched)
    
    
  return training_set_batched, category_set_batched, hotels_window_set_batched

def prepare_test(df_test, df_gt):
  #Creating a NaN column for item recommendations
  df_test['item_recommendations'] = np.nan

  test_dim = len(df_test)

  temp_session = []
  test_sessions = []

  temp_clickout_index = []
  test_clickout_index = []

  temp_hotels_window = []
  test_hotels_window = []

  i = 0
  step = 0

  #splitting in sessions while evaluating recommendations for NaN clickouts
  for action_index, action in df_test.iterrows():
      if(action['reference'] != 'unknown'):
          if (action['action_type'] == 'clickout item') & math.isnan(float(action['reference'])):
              temp_hotels_window = action['impressions'].split('|')
              temp_session.append(action)
              temp_clickout_index.append(action_index)
          else:
              temp_session.append(action)

      if(i < test_dim-1):
          if action['session_id'] != df_test.iloc[[i + 1]]['session_id'].values[0]:
              step = 0
              test_sessions.append(temp_session)
              test_hotels_window.append(temp_hotels_window)
              test_clickout_index.append(temp_clickout_index)
              temp_session = []
              temp_hotels_window = []
              temp_clickout_index = []


      i = i+1  
      step = step + 1
        
  return test_sessions, test_hotels_window, test_clickout_index
        

In [27]:
def recommendations_from_output(output, hotel_dict, hotels_window, n_features):
  i = 0
  window_dict = {}
  
  output_arr = np.asarray(output[0].cpu().detach().numpy())
    
  ranked_hotels = {}
  
  #for hotel_k, hotel_t in window_dict.items():
  #  d = distance(output, hotel_t)
  #  ranked_hotels[hotel_k] = d
  
  #hotel_scores = {}
  hotel_i = 0
  missed = 0
  #print(len(output_arr))
  
  for hotel_v in output_arr:
    #print(hotel_v)
    hotel_id = hotel_dict.index2word[hotel_i]
    #print(hotel_id)
    #print(hotel_id)
    #print(hotels_window)
    if hotel_id in hotels_window:
      #print('found')
      ranked_hotels[hotel_id] = hotel_v
    hotel_i = hotel_i + 1
  
  for hotel_id in hotels_window:
    if hotel_id not in ranked_hotels:
      ranked_hotels[hotel_id] = -999999
      #print(hotel_id)
      missed = missed + 1
  #print(str(len(hotels_window)) + ' - ' + str(missed))

  #ranked_hotels = sorted(ranked_hotels)
  ranked_hotels = sorted(ranked_hotels.items(), key=itemgetter(1), reverse = True)
  #print(ranked_hotels)
  #print(hotels_window)
  #print(ranked_hotels)
  #print(list_to_space_string(ranked_hotels))
  ranked = []
  for tup in ranked_hotels:
    ranked.append(tup[0])
    
  #print(ranked_hotels)
                           
                           
  return list_to_space_string(ranked)


# Just return an output given a line
def evaluate(session, hotel_dict, n_features, hotels_window, max_window):
    #hidden = rnn.initHidden()
    hidden = torch.zeros(1, 1, n_hidden)
    c = torch.zeros(1, 1, n_hidden)
    
    #print(session)
    #print(max_window)
    line_tensor = session_to_tensor(session, hotels_window, max_window).cuda()
    
    output = model(line_tensor)
        
    #print(output)
    output = recommendations_from_output(output, hotel_dict, hotels_window, n_features)

    return output
  
  
def recommendations_from_output_debug(output, hotel_dict, hotels_window, n_features):
  i = 0
  window_dict = {}
  
  output_arr = np.asarray(output[0].cpu().detach().numpy())
    
  ranked_hotels = {}
  
  #for hotel_k, hotel_t in window_dict.items():
  #  d = distance(output, hotel_t)
  #  ranked_hotels[hotel_k] = d
  
  #hotel_scores = {}
  hotel_i = 0
  
  print(len(output_arr))
  print(hotels_window)
  
  for hotel_index, hotel_v in enumerate(output_arr):
    #print(hotel_v)
    hotel_id = hotel_dict.index2word[hotel_index]
    #print(hotel_id)
    #print(hotel_id)
    #print(hotels_window)
    if hotel_id in hotels_window:
      #print('found')
      ranked_hotels[hotel_id] = hotel_v
    hotel_i = hotel_i + 1
  
  for hotel_id in hotels_window:
    if hotel_id not in ranked_hotels:
      ranked_hotels[hotel_id] = -999999
      print('cant find ' + str(i) + ' sessions')
  
  #ranked_hotels = sorted(ranked_hotels)
  ranked_hotels = sorted(ranked_hotels.items(), key=itemgetter(1), reverse = True)
  #print(ranked_hotels)
  #print(hotels_window)
  #print(ranked_hotels)
  #print(list_to_space_string(ranked_hotels))
  ranked = []
  for tup in ranked_hotels:
    ranked.append(tup[0])
    
  print(ranked_hotels)
                           
                           
  return list_to_space_string(ranked)
  
def evaluate_debug(session, hotel_dict, n_features, hotels_window, max_window):
    #hidden = rnn.initHidden()
    hidden = torch.zeros(1, 1, n_hidden)
    c = torch.zeros(1, 1, n_hidden)
    
    #print(session)
    #print(max_window)
    line_tensor = session_to_tensor(session, hotels_window, max_window).cuda()
    print(line_tensor)

    output = model(line_tensor)
        
    print(output)
    output = recommendations_from_output_debug(output, hotel_dict, hotels_window, n_features)

    return output
  
def recommendations_from_output_debug_false(output, hotel_dict, hotels_window, n_features):
  i = 0
  window_dict = {}
  
  output_arr = np.asarray(output[0].cpu().detach().numpy())
    
  ranked_hotels = {}
  
  #for hotel_k, hotel_t in window_dict.items():
  #  d = distance(output, hotel_t)
  #  ranked_hotels[hotel_k] = d
  
  #hotel_scores = {}
  hotel_i = 0
  
  print(len(output_arr))
  print(hotels_window)
  
  for hotel_index, hotel_v in enumerate(output_arr):
    #print(hotel_v)
    hotel_id = hotel_dict.index2word[hotel_index]
    #print(hotel_id)
    #print(hotel_id)
    #print(hotels_window)
    if hotel_id in hotels_window:
      #print('found')
      ranked_hotels[hotel_id] = hotel_v
    hotel_i = hotel_i + 1
  
  for hotel_id in hotels_window:
    if hotel_id not in ranked_hotels:
      ranked_hotels[hotel_id] = 0
      #print('cant find ' + str(i) + ' sessions')
  
  #ranked_hotels = sorted(ranked_hotels)
  ranked_hotels = sorted(ranked_hotels.items(), key=itemgetter(1), reverse = False)
  #print(ranked_hotels)
  #print(hotels_window)
  #print(ranked_hotels)
  #print(list_to_space_string(ranked_hotels))
  ranked = []
  for tup in ranked_hotels:
    ranked.append(tup[0])
    
  print(ranked_hotels)
                           
                           
  return list_to_space_string(ranked)
  
def evaluate_debug_false(session, hotel_dict, n_features, hotels_window, max_window):
    #hidden = rnn.initHidden()
    hidden = torch.zeros(1, 1, n_hidden)
    c = torch.zeros(1, 1, n_hidden)
    
    #print(session)
    #print(max_window)
    line_tensor = session_to_tensor(session, hotels_window, max_window).cuda()
    output = model(line_tensor)
        
    print(output)
    output = recommendations_from_output_debug_false(output, hotel_dict, hotels_window, n_features)

    return output
  
def get_submission_target(df):
    """Identify target rows with missing click outs."""

    mask = df["reference"].isnull() & (df["action_type"] == "clickout item")
    df_out = df[mask]

    return df_out  

def get_reciprocal_ranks(ps):
    """Calculate reciprocal ranks for recommendations."""
    mask = ps.reference == np.array(ps.item_recommendations)

    if mask.sum() == 1:
        rranks = generate_rranks_range(0, len(ps.item_recommendations))
        return np.array(rranks)[mask].min()
    else:
        return 0.0


def score_submissions(subm_csv, gt_csv, objective_function):
    """Score submissions with given objective function."""

    print(f"Reading ground truth data {gt_csv} ...")
    df_gt = read_into_df(gt_csv)

    print(f"Reading submission data {subm_csv} ...")
    df_subm = read_into_df(subm_csv)

    # create dataframe containing the ground truth to target rows
    cols = ['reference', 'impressions', 'prices']
    df_key = df_gt.loc[:, cols]

    # append key to submission file
    df_subm_with_key = df_key.join(df_subm, how='inner')
    df_subm_with_key.reference = df_subm_with_key.reference.astype(int)
    df_subm_with_key = convert_string_to_list(
        df_subm_with_key, 'item_recommendations', 'item_recommendations'
    )

    # score each row
    df_subm_with_key['score'] = df_subm_with_key.apply(objective_function, axis=1)
    mrr = df_subm_with_key.score.mean()

    return mrr
  
def generate_rranks_range(start, end):
    """Generate reciprocal ranks for a given list length."""

    return 1.0 / (np.arange(start, end) + 1)
  
def convert_string_to_list(df, col, new_col):
    """Convert column from string to list format."""
    fxn = lambda arr_string: [int(item) for item in str(arr_string).split(" ")]

    mask = ~(df[col].isnull())

    df[new_col] = df[col]
    df.loc[mask, new_col] = df[mask][col].map(fxn)

    return df


def get_reciprocal_ranks(ps):
    """Calculate reciprocal ranks for recommendations."""
    mask = ps.reference == np.array(ps.item_recommendations)

    if mask.sum() == 1:
        rranks = generate_rranks_range(0, len(ps.item_recommendations))
        return np.array(rranks)[mask].min()
    else:
        return 0.0
  

def score_submissions_no_csv(df_subm, df_gt, objective_function):
    # create dataframe containing the ground truth to target rows
    cols = ['reference', 'impressions', 'prices']
    df_key = df_gt.loc[:, cols]

    # append key to submission file
    df_subm_with_key = df_key.join(df_subm, how='inner')
    df_subm_with_key.reference = df_subm_with_key.reference.astype(int)
    df_subm_with_key = convert_string_to_list(
        df_subm_with_key, 'item_recommendations', 'item_recommendations'
    )

    # score each row
    df_subm_with_key['score'] = df_subm_with_key.apply(objective_function, axis=1)
    mrr = df_subm_with_key.score.mean()

    return mrr
  
  
def test_accuracy(model, df_test, df_gt):
  df_test['item_recommendations'] = np.nan

  test_dim = len(df_test)
  temp_session = []
  hotels_window = []
  i = 0
  print_every = 500
  step = 0
  #df_result = pd.DataFrame(index = [0], columns=df_test.columns)
  #print(df_result)

  for action_index, action in df_test.iterrows():

    #print(action)
    #print('step ' + str(step))
    #print(len(temp_session)) 
    if(action['reference'] != 'unknown'):
      if (action['action_type'] == 'clickout item') & math.isnan(float(action['reference'])):
        hotels_window = action['impressions'].split('|')

        #print('window is ' + str(hotels_window))
        #print(len(temp_session)) 

        if len(temp_session) != 0:
          #print('doing sub')
          #print(evaluate(temp_session, hotel_dict, n_features, hotels_window, distance))
          df_test.loc[action_index, 'item_recommendations'] = evaluate(temp_session, hotel_dict, n_features, hotels_window, max_window)
        #print(p.o)
        temp_session.append(action)
        #print('added click')
      else:
        temp_session.append(action)
        #print(temp_session)
        #print('added action')

    if(i < test_dim-1):
      if action['session_id'] != df_test.iloc[[i + 1]]['session_id'].values[0]:
        step = 0
        #print(temp_session)
        #print(hotels_window)
        #print(p.r)
        temp_session = []
        hotels_window = []

    i = i+1  
    step = step + 1
    
    
  df_sub = get_submission_target(df_test)
  df_sub = df_sub[['user_id', 'session_id', 'timestamp','step', 'item_recommendations']]
  
  for action_index, action in df_gt.iterrows():
    if action_index not in df_sub.index.values.tolist():
      df_gt = df_gt.drop(action_index)

  mask = df_sub["item_recommendations"].notnull()
  df_sub = df_sub[mask]
  
  mrr = score_submissions_no_csv(df_sub, df_gt, get_reciprocal_ranks)
  return mrr

def test_accuracy_optimized(model, df_test, df_gt, sessions, hotels_window, clickout_index, hotel_dict, n_features, max_window, meta_dict, meta_list, subname="submission_default_name", isprint=False):
  """Return the score obtained by the net on the test dataframe"""

  test_dim = len(df_test)

  print_every = 500
  
  missed_target = 0

  for session_index, session in enumerate(sessions):
    if clickout_index[session_index] != []:
      df_test.loc[clickout_index[session_index], 'item_recommendations'] = evaluate(session, hotel_dict, n_features, hotels_window[session_index], max_window)
      #print(df_gt[(df_gt['session_id'] == df_test.loc[clickout_index[session_index], 'session_id'].values[0]) & (df_gt['step'] == df_test.loc[clickout_index[session_index], 'step'].values[0])]['reference'].values[0])
      #target = df_gt[(df_gt['session_id'] == df_test.loc[clickout_index[session_index], 'session_id'].values[0]) & (df_gt['step'] == df_test.loc[clickout_index[session_index], 'step'].values[0])]['reference'].values[0]
      #print(type(target))
      #if int(target) not in hotel_dict: 
        #missed_target = missed_target + 1
        #print(session_index)
  df_sub = get_submission_target(df_test)

  print(str(missed_target) + ' correct hotels were not in dictionary - total: ' + str(len(hotels_window)))
  
  #Removing unnecessary columns
  df_sub = df_sub[['user_id', 'session_id', 'timestamp','step', 'item_recommendations']]

  mask = df_sub["item_recommendations"].notnull()
  df_sub = df_sub[mask]

  # Saving df_sub
  if isprint:
      df_sub.to_csv('./' + subname + '.csv')

  mrr = score_submissions_no_csv(df_sub, df_gt, get_reciprocal_ranks)
  return mrr


def test_accuracy_optimized_false(model, df_test, df_gt, sessions, hotels_window, clickout_index, hotel_dict, n_features, max_window, meta_dict, meta_list, subname="submission_default_name", isprint=False):
  """Return the score obtained by the net on the test dataframe"""

  test_dim = len(df_test)

  print_every = 500


  for session_index, session in enumerate(sessions):
    if clickout_index[session_index] != []:
      df_test.loc[clickout_index[session_index], 'item_recommendations'] = evaluate_debug_false(session, hotel_dict, n_features, hotels_window[session_index], max_window)
      #print(session_index)
  df_sub = get_submission_target(df_test)

  #Removing unnecessary columns
  df_sub = df_sub[['user_id', 'session_id', 'timestamp','step', 'item_recommendations']]

  mask = df_sub["item_recommendations"].notnull()
  df_sub = df_sub[mask]

  # Saving df_sub
  if isprint:
      df_sub.to_csv('./' + subname + '.csv')

  mrr = score_submissions_no_csv(df_sub, df_gt, get_reciprocal_ranks)
  return mrr

In [28]:
#acc = test_accuracy_optimized(model, df_test, df_gt, test_sessions, test_hotels_window, test_clickout_index, hotel_dict, n_features, max_window, meta_dict, meta_list)

1

In [29]:

'''
STEP 3: CREATE MODEL CLASS
'''
 
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, bias=True):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim
         
        # Number of hidden layers
        self.layer_dim = layer_dim
               
        self.lstm = nn.LSTM(input_size = input_dim, hidden_size = hidden_dim, num_layers = layer_dim)  
        
        self.hidden_fc = nn.Linear(hidden_dim, hidden_dim * 10)
        
        #self.hidden_fc2 = nn.Linear(hidden_dim * 10, hidden_dim * 100)
        
        self.fc = nn.Linear(hidden_dim * 10, output_dim)
        
        self.dropout_layer = nn.Dropout(p=0.2)
        
        self.softmax = nn.LogSoftmax(1)
    
    
    def forward(self, x):
        
        # Initialize hidden state with zeros
        #######################
        #  USE GPU FOR MODEL  #
        #######################
        #print(x.shape,"x.shape")100, 28, 28
        if torch.cuda.is_available():
            h0 = torch.zeros(self.layer_dim, x.size(1), self.hidden_dim).cuda()
        else:
            h0 = torch.zeros(self.layer_dim, x.size(1), self.hidden_dim)

        # Initialize cell state
        if torch.cuda.is_available():
            c0 = torch.zeros(self.layer_dim, x.size(1), self.hidden_dim).cuda()
        else:
            c0 = torch.zeros(self.layer_dim, x.size(1), hidden_dim)

        
        #cn = c0[0,:,:]
        #hn = h0[0,:,:]

        #for seq in range(x.size(1)):
        #    hn, cn = self.lstm(x[:,seq,:], (hn,cn)) 
        #    outs.append(hn)
            
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        #out = self.fc(out)
        
        out = out[-1, :, :]
        
        out = F.relu(self.hidden_fc(out))
        
        #out = F.relu(self.hidden_fc2(out))
        
        out = self.dropout_layer(out)
       
        
        #out = out[]
        
        out = self.fc(out)
        
        #out = self.softmax(out)
        
        #out = self.fc(out) 
        # out.size() --> 100, 10
        return out

In [34]:
batchsize = 8

In [38]:
if batchsize == 0:
    #this splits the training set sessions into multiple mini-sessions
    sessions, categories, hotels_window = prepare_input(df_train)
else:
    sessions, categories, hotels_window = prepare_input_batched(df_train, batchsize)

In [39]:
test_sessions, test_hotels_window, test_clickout_index = prepare_test(df_test, df_gt)

In [40]:
max_window = 0
for window in hotels_window:
  if len(window) > max_window:
    max_window = len(window)
for window in test_hotels_window:
  if len(window) > max_window:
    max_window = len(window)
max_window = 0

In [41]:
meta_dict = []
meta_list = []
price_dict = []
n_hotels = len(hotel_dict.index2word)
n_features_w2vec = len(word2vec.wv['666856'])
n_features_impression = max_window
n_features_prices = 0
n_features = n_features_w2vec + n_features_impression + n_features_prices

print('n_hotels is ' + str(n_hotels))
print('n_features_w2vec is ' + str(n_features_w2vec))
print('n_features_impression is ' + str(n_features_impression))
print('n_features_prices is ' + str(n_features_prices))
print('n_features is ' + str(n_features))

n_hotels is 45148
n_features_w2vec is 100
n_features_impression is 0
n_features_prices is 0
n_features is 100


In [42]:
input_dim = n_features
output_dim = n_hotels
#hidden_dim = int(1/10 * (input_dim + output_dim))
hidden_dim = 100
print('hidden_dim is ' + str(hidden_dim))
layer_dim = 1
n_hidden = hidden_dim

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
model.lstm.weight_hh_l0.data.fill_(0)
x = 1
nn.init.uniform_(model.fc.weight, -x, x)
nn.init.uniform_(model.fc.bias, -x, x)
model = model.cuda()

hidden_dim is 100


In [44]:
#functions for training phase

def session_to_tensor(session, hotels_window, max_window):
  tensor = torch.zeros(len(session), 1, n_features)
  
  for ai, action in enumerate(session):
    tensor[ai][0] = hotel_to_tensor(action['reference'], hotel_dict, n_features_w2vec, hotels_window, max_window)
  return tensor

def sessions_to_batch(session_list, hotel_dict, max_session_len, batch_size, n_features, n_features_w2vec, hotels_window, max_window): #modified

  tensor = torch.zeros(max_session_len, batch_size, n_features)
  
  for si, session in enumerate(session_list):
    for ai, action in enumerate(session):
      tensor[ai][si] = hotel_to_tensor(action['reference'], hotel_dict, n_features_w2vec, hotels_window, max_window)
  return tensor

def hotel_to_tensor(hotel, hotel_dict, n_features_w2vec, hotels_window, max_window):
  tensor_w2vec = torch.zeros(n_features_w2vec)
  tensor_window = torch.zeros(max_window)
  tensor_prices = torch.zeros(n_features_prices)
  
  if hotel in hotel_dict: #-----------int
    tensor_w2vec = torch.from_numpy(hotel_dict[hotel])
  
  if max_window != 0:
    if hotel in hotels_window:
      tensor_window[hotels_window.index(hotel)] = 1
    
  if hotel in price_dict:
    tensor_prices[price_dict.index(hotel)] = 1
      
  tensor = torch.cat((tensor_w2vec, tensor_window), 0)
  tensor = torch.cat((tensor, tensor_prices), 0)
  
  return tensor

def hotel_to_category(hotel, hotel_dict, n_features):
  tensor = torch.zeros(1)

  if hotel in hotel_dict.index2word:
    tensor = torch.tensor([hotel_dict.index2word.index(hotel)], dtype=torch.long)

  
  return tensor

def hotels_to_category_batch(hotel_list, hotel_dict, n_hotels, batch_size): #modified
  tensor = torch.zeros(batch_size)
  for hi, hotel in enumerate(hotel_list):
    if hotel in hotel_dict.index2word:
      tensor[hi] = torch.tensor([hotel_dict.index2word.index(hotel)], dtype=torch.long)
  return tensor

In [45]:
def category_from_output(output):
  top_n, top_i = output.data.topk(1) # Tensor out of Variable with .data
  category_i = int(top_i[0][0])
  #print(output)
  return hotel_dict.index2word[category_i], category_i
  
  
def list_to_space_string(l):
  """Return a space separated string from a list"""
  s = " ".join(l)
  return s

In [46]:
loss_fn = torch.nn.CrossEntropyLoss().cuda()
#loss_fn = torch.nn.NLLLoss().cuda()

learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

def train(category_tensor, line_tensor):
    hidden = torch.zeros(1, 1, n_hidden)
    c = torch.zeros(1, 1, n_hidden)
    
    optimizer.zero_grad()
    
    line_tensor = line_tensor.requires_grad_()
    line_tensor = line_tensor.cuda()
    

    output = model(line_tensor)
    
    category_tensor = category_tensor.long().cuda()

    loss = loss_fn(output, category_tensor)
    loss.backward()

    optimizer.step()
    
    return output, loss.item()

In [None]:
import time
import math

#distance = nn.PairwiseDistance(p=2., eps=1e-6)

num_epochs = 20

n_iters = len(sessions) * num_epochs
print_every = 100
plot_every = 1


# Keep track of losses for plotting
current_loss = 0
all_losses = []
all_acc = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

for epoch in range(1, num_epochs + 1):
  model.train()
  iter = 0
  
  #print('epoch ' + str(epoch))
  print(str(len(sessions)) + ' sessions to be computed')
  
  for index, session in enumerate(sessions):
    iter = iter + 1

    #ession_tensor = session_to_tensor(session, hotels_window[index], max_window)
    #ategory = categories[index]
    #ategory_tensor = hotel_to_category(category, hotel_dict, n_hotels)

    if batchsize == 0:
        session_tensor = session_to_tensor(session, hotel_dict, n_features, n_features_w2vec, hotels_window[index], max_window)  
        category = categories[index]
        category_tensor = hotel_to_category(category, hotel_dict, n_hotels)
    else:
        max_session_len = 0
        for si, single_session in enumerate(session):
            if len(single_session) > max_session_len:
                max_session_len = len(single_session)
        session_tensor = sessions_to_batch(session, hotel_dict, max_session_len, batchsize, n_features, n_features_w2vec, hotels_window[index], max_window)
        category = categories[index]
        category_tensor = hotels_to_category_batch(category, hotel_dict, n_hotels, batchsize)
    
    output, loss = train(category_tensor, session_tensor)

    current_loss += loss
      
    if iter % print_every == 0:

        guess, guess_i = category_from_output(output)

        correct = '✓' if guess == category else '✗ (%s)' % category
        print('(%s) %.4f %s / %s %s' % (timeSince(start), loss, session[0][0]['session_id'], guess[0], correct[0]))

        
  # Add current loss avg to list of losses
  if epoch % plot_every == 0:
      all_losses.append(current_loss / (plot_every * len(sessions)))
      print('Epoch: ' + str(epoch) + ' Loss: ' + str(current_loss / (plot_every * len(sessions))))
      print('%d %d%% (%s)' % (epoch, epoch / num_epochs * 100, timeSince(start)))
      #acc = test_accuracy(model, df_test, df_gt)
      acc = test_accuracy_optimized(model, df_test, df_gt, test_sessions, test_hotels_window, test_clickout_index, hotel_dict, n_features, max_window, meta_dict, meta_list)
      print("Score: " + str(acc))
      all_acc.append(acc)
      current_loss = 0

1223 sessions to be computed
(0m 5s) 5.2984 1240c5a86d97d / 9 ✗
(0m 11s) 5.2155 29e0656479fb7 / 1 ✗
(0m 16s) 4.2959 3e8ae94bb6054 / 2 ✗
(0m 20s) 6.4385 543f99f78fcf8 / 2 ✗
(0m 25s) 6.3694 68b8d95ceffb6 / 1 ✗
(0m 30s) 7.5852 7ce8698867bf8 / 3 ✗
(0m 35s) 6.6763 91b3799d0a9e4 / 1 ✗
(0m 39s) 7.3770 a64f9baca46be / 1 ✗
(0m 44s) 7.4391 b9c47e9a8ed5f / 7 ✗
(0m 49s) 8.4268 cf0c96d88d2ef / 8 ✗
(0m 53s) 9.2293 e397dacb6ce87 / 3 ✗


In [None]:
df_test

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

plt.figure()
plt.plot(all_losses)

In [None]:
plt.figure()
plt.plot(all_acc)

In [None]:
test_hotels_window[5]

In [None]:
test_sessions[5]

In [None]:
df_test[df_test['session_id'] == '2423aea8cde50']

In [None]:
df_meta[df_meta['item_id'] == '2793224']

In [None]:
df_gt[df_gt['session_id'] == '2423aea8cde50']

In [None]:
df_train[df_train['reference'] == '2793224']

In [None]:
evaluate_debug(test_sessions[5],hotel_dict, n_features, test_hotels_window[5], max_window)

In [None]:
evaluate_debug_false(test_sessions[5],hotel_dict, n_features, test_hotels_window[5], max_window)

In [None]:
acc = test_accuracy_optimized(model, df_test, df_gt, test_sessions, test_hotels_window, test_clickout_index, hotel_dict, n_features, max_window, meta_dict, meta_list)

In [None]:
acc

In [None]:
acc = test_accuracy_optimized_false(model, df_test, df_gt, test_sessions, test_hotels_window, test_clickout_index, hotel_dict, n_features, max_window, meta_dict, meta_list)

In [None]:
acc

In [None]:
all_losses

In [None]:
df_train

In [None]:
df_test[df_test['item_recommendations'] != np.nan]

In [None]:
#mrr = score_submissions_no_csv(df_sub, df_gt, get_reciprocal_ranks)
mrr = acc = test_accuracy_optimized(model, df_test, df_gt, test_sessions, test_hotels_window, test_clickout_index, hotel_dict, n_features, max_window, meta_dict, meta_list)
print("End execution with score " + str(mrr))

In [None]:
df_temp = df_test[df_test['user_id'] == '68Q297NAT23H']

In [None]:
df_temp

In [None]:
df_temp = df_temp.drop(df_temp[df_temp['step'] == 3].index)

In [None]:
test_accuracy(df_temp, df_gt)

In [None]:
session = []
hotel_window = []
for action_i, action in df_temp.iterrows():
  if (action['action_type'] == 'clickout item') & math.isnan(float(action['reference'])):
    hotel_window = action['impressions'].split('|')
  session.append(action)

In [None]:
#importing training
link_train = 'https://drive.google.com/open?id=1zCpgAT-RGtMYDnhv8KHRsFc8NONKz55r'
fluff, id_train = link_train.split('=')
downloaded = drive.CreateFile({'id':id_train}) 
downloaded.GetContentFile('train_100.csv')

df_train_100 = pd.read_csv("./train_100.csv")

In [None]:
df_train_100 = remove_single_actions(df_train_100)
df_train_100 =  remove_nonitem_actions(df_train_100)

In [None]:
#importing test set
link_test = 'https://drive.google.com/open?id=1bIWb7rWQecLuyZKW0YP4yDuehD4zgVDg'
fluff, id_test = link_test.split('=')
downloaded = drive.CreateFile({'id':id_test}) 
downloaded.GetContentFile('test_100.csv')

df_test_100 = pd.read_csv("./test_100.csv")

In [None]:
df_test_100 = remove_single_actions(df_test_100)
df_test_100 = remove_nonitem_actions(df_test_100)

In [None]:
#importing meta
link_test = 'https://drive.google.com/open?id=1Qzu75vrXcfB0SjbtcKJzVKoZJht6vUHT'
fluff, id_test = link_test.split('=')
downloaded = drive.CreateFile({'id':id_test}) 
downloaded.GetContentFile('item_meta.csv')

df_meta = pd.read_csv("./item_meta.csv")

In [None]:
df_test[df_test['action_type'] == 'clickout item']

In [None]:
df_test_off[df_test_off['user_id'] == 'ZVTSO44R1US2']

In [None]:
action = df_test_off[(df_test_off['user_id'] == 'ZVTSO44R1US2') & (df_test_off['step'] == 49)]
action

In [None]:
action['impressions'].values[0]

In [None]:
for item_id in action['impressions'].values[0].split():
  print(df_meta[df_meta['item_id'] == item_id])

In [None]:
df_meta[df_meta['item_id'] == '4775012']

In [None]:
hotels_ref = []
hotels_full = []
for action_index, action in df_train_100.iterrows():
  hotels_ref.append(action['reference'])
  hotels_full.append(action['reference'])
  if action['action_type'] == 'clickout item':
      hotels_full = hotels_full + action['impressions'].split('|')
  

In [None]:
hotels_ref = list(set(hotels_ref))
hotels_full = list(set(hotels_full))

In [None]:
len(hotels_ref)

In [None]:
len(hotels_full)

In [None]:
for action_index, action in df_test_100.iterrows():
  hotels_ref.append(action['reference'])
  hotels_full.append(action['reference'])
  if action['action_type'] == 'clickout item':
      hotels_full = hotels_full + action['impressions'].split('|')

TEST PRICES

In [None]:
#importing prices
link_test = 'https://drive.google.com/open?id=1Y0BtPk-Bp5tTYMMwl11fvjYeo6lqDqm0'
fluff, id_test = link_test.split('=')
downloaded = drive.CreateFile({'id':id_test}) 
downloaded.GetContentFile('item_price.csv')

df_prices = pd.read_csv("./item_price.csv")

In [None]:
def generate_prices_sparse_matrix(df, features_col='intervals'):
    df['present'] = 1
    hotel_dict = create_item_dict(df) #Controllare che sia uguale all'altro dizionario
    feature_dict = create_item_dict(df, col_name='feature')
    list_hotel = list(df['reference'])
    list_features = list(df['feature'])
    list_data = list(df['present'])
    n_items = len(list_hotel)
    n_features = len(list_features)
    # Convert each list of string in a list of indexes
    list_items = list(map(lambda x: hotel_dict[x], list_hotel))
    list_features = list(map(lambda x: feature_dict[x], list_features))
    # Generate the sparse matrix
    row = np.array(list_items)
    col = np.array(list_features)
    data = np.array(list_data)
    csr = csr_matrix((data, (row, col)), shape=(n_items, n_features))

    return csr, hotel_dict
  
def get_hotel_prices(df_metadata, n_categories = 2000):
    """
    Required Input -
        - metadata_file = file with the average price for each hotel
    """
    #print("Reading metadata: " + metadata_file)
    df_metadata['price'] = df_metadata['price'].apply(lambda x: math.log10(x))
    # Define the range
    max_price = df_metadata['price'].max()
    min_price = df_metadata['price'].min()
    range = (max_price - min_price) / n_categories
    # Generate the classes
    df_metadata['intervals'] = pd.cut(df_metadata['price'], bins=np.arange(min_price,max_price,range))
    df_metadata.loc[:, 'intervals'] = df_metadata['intervals'].apply(str)
    #classes_dic = create_user_dict(df_metadata, col_name = 'intervals')
    #df_metadata.loc[:, 'intervals'] = df_metadata['intervals'].apply(lambda x : classes_dic.get(x))
    #df_metadata.loc[:, 'intervals'] = df_metadata['intervals'].apply(int)
    # Create a dictionary of item_id -> price_category
    price_dic = pd.Series(df_metadata.intervals.values,index=df_metadata.impressions).to_dict()


    return price_dic

In [None]:
df_prices

In [None]:
price_dic = get_hotel_prices(df_prices, n_categories = 2000)

In [None]:
price_dic

In [None]:
len(price_dic)

In [None]:
len(list(set(price_dic.values())))

In [None]:
def test_accuracy_optimized(model, df_test, df_gt, sessions, hotels_window, clickout_index, hotel_dict, n_features, max_window, meta_dict, meta_list, subname="submission_default_name", isprint=False):
  """Return the score obtained by the net on the test dataframe"""

  test_dim = len(df_test)

  print_every = 500

  missed = 0
  
  missed_target = 0

  for session_index, session in enumerate(sessions):
    if clickout_index[session_index] != []:
      df_test.loc[clickout_index[session_index], 'item_recommendations'] = evaluate(session, hotel_dict, n_features, hotels_window[session_index], max_window)
      #print(df_gt[(df_gt['session_id'] == df_test.loc[clickout_index[session_index], 'session_id'].values[0]) & (df_gt['step'] == df_test.loc[clickout_index[session_index], 'step'].values[0])]['reference'].values[0])
      #target = df_gt[(df_gt['session_id'] == df_test.loc[clickout_index[session_index], 'session_id'].values[0]) & (df_gt['step'] == df_test.loc[clickout_index[session_index], 'step'].values[0])]['reference'].values[0]
      #print(type(target))
      #if int(target) not in hotel_dict: 
        #missed_target = missed_target + 1
        #print(session_index)
  df_sub = get_submission_target(df_test)

  #print(str(missed_target) + ' correct hotels were not in dictionary - total: ' + str(len(hotels_window)))
  
  #Removing unnecessary columns
  df_sub = df_sub[['user_id', 'session_id', 'timestamp','step', 'item_recommendations']]

  mask = df_sub["item_recommendations"].notnull()
  df_sub = df_sub[mask]

  # Saving df_sub
  if isprint:
      df_sub.to_csv('./' + subname + '.csv')

  mrr = score_submissions_no_csv(df_sub, df_gt, get_reciprocal_ranks)
  return mrr, df_sub

In [None]:
mrr, df_sub_rnn = test_accuracy_optimized(model, df_test, df_gt, test_sessions, test_hotels_window, test_clickout_index, hotel_dict, n_features, max_window, meta_dict, meta_list)

In [None]:
mrr

In [None]:
df_sub_rnn

In [None]:
test_clickout_index

In [None]:
test_accuracy(model, df_test, df_gt)

In [None]:
df_test['item_recommendations'] = np.nan

test_dim = len(df_test)
temp_session = []
hotels_window = []
i = 0
print_every = 500
step = 0
    
for action_index, action in df_test.iterrows():    
    if(action['reference'] != 'unknown'):
        if (action['action_type'] == 'clickout item') & math.isnan(float(action['reference'])):
            hotels_window = action['impressions'].split('|')

            if len(temp_session) != 0:
                df_test.loc[action_index, 'item_recommendations'] = list_to_space_string(action['impressions'].split('|'))

            temp_session.append(action)

        else:
            temp_session.append(action)

    if(i < test_dim-1):
        if action['session_id'] != df_test.iloc[[i + 1]]['session_id'].values[0]:
            step = 0
            #print(temp_session)
            #print(hotels_window)
            #print(p.r)
            temp_session = []
            hotels_window = []

    i = i+1  
    step = step + 1


df_sub = get_submission_target(df_test)

#Removing unnecessary columns
df_sub = df_sub[['user_id', 'session_id', 'timestamp','step', 'item_recommendations']]

for action_index, action in df_gt.iterrows():
    if action_index not in df_sub.index.values.tolist():
        df_gt = df_gt.drop(action_index)

mask = df_sub["item_recommendations"].notnull()
df_sub_impression = df_sub[mask]

mrr = score_submissions_no_csv(df_sub, df_gt, get_reciprocal_ranks)
print(mrr)

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import operator
import math

MERGE_COLS = ["user_id", "session_id", "timestamp", "step"]

def generate_rranks_range(start, end):
    """Generate reciprocal ranks for a given list length."""

    return 1.0 / (np.arange(start, end) + 1)

def read_into_df(file):
    """Read csv file into data frame."""
    df = (
        pd.read_csv(file)
            .set_index(['user_id', 'session_id', 'timestamp', 'step'])
    )

    return df
def score_submissions(subm_csv, gt_csv, objective_function):
    """Score submissions with given objective function."""

    print(f"Reading ground truth data {gt_csv} ...")
    df_gt = read_into_df(gt_csv)

    print(f"Reading submission data {subm_csv} ...")
    df_subm = read_into_df(subm_csv)
    print('Submissions')
    print(df_subm.head(10))
    # create dataframe containing the ground truth to target rows
    cols = ['reference', 'impressions', 'prices']
    df_key = df_gt.loc[:, cols]

    # append key to submission file
    df_subm_with_key = df_key.join(df_subm, how='inner')
    print(df_subm_with_key.head())
    df_subm_with_key.reference = df_subm_with_key.reference.astype(int)
    df_subm_with_key = convert_string_to_list(
        df_subm_with_key, 'item_recommendations', 'item_recommendations'
    )

    # score each row
    df_subm_with_key['score'] = df_subm_with_key.apply(objective_function, axis=1)
    df_subm_with_key.to_csv('borda.csv')
    print(df_subm_with_key)
    mrr = df_subm_with_key.score.mean()

    return mrr

def get_reciprocal_ranks(ps):
    """Calculate reciprocal ranks for recommendations."""
    mask = ps.reference == np.array(ps.item_recommendations)

    if mask.sum() == 1:
        rranks = generate_rranks_range(0, len(ps.item_recommendations))
        return np.array(rranks)[mask].min()
    else:
        return 0.0

def convert_string_to_list(df, col, new_col):
    """Convert column from string to list format."""
    fxn = lambda arr_string: [int(item) for item in str(arr_string).split(" ")]

    mask = ~(df[col].isnull())

    df[new_col] = df[col]
    df.loc[mask, new_col] = df[mask][col].map(fxn)

    return df

def calculate_single_list_score(l):
    """
        Input -> list of string
        Output -> Dictionary {'item': score}
    """
    score_dic = {}
    i = 0
    for rec in l:
        score_dic[rec] = len(l) - i
        i = i + 1
    return score_dic

def sum_and_sort_dictionaries(dic_1, dic_2):
    """
        Input -> 2 dictionaries
        Output -> 1 list of item sorted by score
    """
    sum_dic = dict(Counter(dic_1)+Counter(dic_2))
    sorted_x = sorted(sum_dic.items(), key=operator.itemgetter(1), reverse = True)
    sorted_items = list(map(lambda x:x[0], sorted_x))
    return sorted_items

def calculate_borda(mf_rec, rnn_rec):
    if(mf_rec == ''):
        return rnn_rec
    if(rnn_rec == ''):
        return mf_rec

    # Calculate score dictionary for mf
    mf_rec_dic = calculate_single_list_score(mf_rec.split(' '))
    rnn_rec_dic = calculate_single_list_score(rnn_rec.split(' '))
    list_items = sum_and_sort_dictionaries(mf_rec_dic, rnn_rec_dic)
    result = " ".join(list_items)
    return result

df_mf = df_sub_impression
df_rnn = df_sub_rnn
gt_file = 'gt.csv'
submission_file = 'submission_ensamble.csv'


df_merged = (
    df_mf
    .merge(df_rnn,suffixes=('_mf', '_rnn'),
           left_on=MERGE_COLS,
           right_on=MERGE_COLS,
           how="left")
    )
#print(df_merged)
df_merged = df_merged.fillna('')
#print(df_merged)
df_merged['item_recommendations'] = df_merged.apply(lambda x: calculate_borda(x.item_recommendations_mf, x.item_recommendations_rnn), axis=1)
df_merged = df_merged[MERGE_COLS + ['item_recommendations']]
df_merged.to_csv(submission_file)
mrr =score_submissions(submission_file, gt_file, get_reciprocal_ranks)
#print(df_merged.head())
print('Score: ' + str(mrr))