In [None]:
# coding: utf-8
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from multiprocessing import Pool
import pickle, gc, warnings, os, time
def get_current_time():
    """Return the local wall-clock time formatted as '%Y-%m-%d %X'."""
    return time.strftime('%Y-%m-%d %X', time.localtime())


# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
import itertools
from argparse import ArgumentParser
import h5py
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline
sns.set(color_codes=True, rc={'figure.figsize':(8,6)}, 
        font_scale=1.25, style='whitegrid', font='DejaVu Serif')

In [None]:
# Read the MovieLens rating data.
# The file has no header row and uses the multi-character separator "::",
# which only pandas' python parser engine supports; requesting it explicitly
# avoids relying on the implicit (warning-emitting) fallback.
# The Timestamp column is parsed by name but dropped through `usecols`.
train = pd.read_csv("../input/ratings.dat", sep="::", header=None,
                    names=["UserID", "MovieID", "Rating", "Timestamp"],
                    usecols=["UserID", "MovieID", "Rating"],
                    engine="python")
# Cache the parsed ratings in feather format next to the raw file.
train.to_feather("../input/ratings.feather")
train.head()

In [None]:
# Read the user data.
# Same "::"-separated, header-less layout as the ratings file, so the python
# parser engine and header=None are requested explicitly. Zip-code is parsed
# by name but dropped through `usecols`.
user_feature = pd.read_csv("../input/users.dat", sep="::", header=None,
                           names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],
                           usecols=["UserID", "Gender", "Age", "Occupation"],
                           engine="python")
# Replace the raw Gender and Age values by their integer category codes.
for col in ["Gender", "Age"]:
    user_feature[col] = user_feature[col].astype("category").cat.codes
# Cache the encoded user table in feather format next to the raw file.
user_feature.to_feather("../input/users.feather")

In [None]:
# Read the movie data
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# "::" is a multi-character separator, which only pandas' python parser engine
# supports; the file has no header row.
# NOTE(review): no encoding is passed; if movies.dat is not UTF-8 this read may
# need an explicit `encoding=` - confirm against the raw file.
movie_feature = pd.read_csv("../input/movies.dat", sep="::", header=None,
                            names=["MovieID", "Title", "Genres"],
                            engine="python")

# Pre-processing
# The slicing below assumes every title ends with " (YYYY)": the four
# characters before the closing parenthesis are parsed as the release year.
movie_feature["Year"] = movie_feature["Title"].str[-5:-1].astype("int64")
# Shift the years so that the earliest one becomes 1.
movie_feature["Year"] = movie_feature["Year"] - movie_feature["Year"].min() + 1

# Genres: a "|"-separated string becomes a list of integer token ids.
# NOTE(review): the Tokenizer's default `filters` appear to include "-", which
# would split hyphenated genres into two tokens, and `num_words=20` may then
# drop the rarest token - confirm the resulting vocabulary is what is intended.
tokenizer = Tokenizer(num_words=20, split="|")
tokenizer.fit_on_texts(movie_feature["Genres"])
movie_feature["Genres"] = tokenizer.texts_to_sequences(movie_feature["Genres"])

# Title: drop the trailing 7 characters (the " (YYYY)" suffix), then turn the
# remaining words into integer token ids with a vocabulary capped by
# num_words=20000. `tokenizer` is rebound to the title tokenizer here.
movie_feature["Title"] = movie_feature["Title"].str[:-7]
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(movie_feature["Title"])
movie_feature["Title"] = tokenizer.texts_to_sequences(movie_feature["Title"])

movie_feature.head()

In [None]:
# Binarize the target: y = 1 when Rating >= 4, else 0 (stored as int8),
# keep only the two ID columns plus the label, then attach the side features.
# Both merges use pandas' default join keys (the columns the two frames
# share): UserID for the user table and MovieID for the movie table.
train = (
    train.assign(y=train["Rating"].ge(4).astype("int8"))
         .loc[:, ["UserID", "MovieID", "y"]]
         .merge(user_feature, how="left")
         .merge(movie_feature, how="left")
)
print(train.shape)
train.head()

In [None]:
# Number of samples per movie, most frequent movie first.
cnts = train["MovieID"].value_counts()

# One row per movie: its ID, its sample count and (in the "index" column) its
# rank in the frequency ordering, starting at 0 for the most frequent movie.
aid_counts = cnts.reset_index()
aid_counts.columns = ["MovieID", "counts"]
aid_counts = aid_counts.reset_index()

# Give every sample the rank of its movie, scaled to [0, 1] by the largest rank.
aids = train[["MovieID"]].merge(aid_counts, how="left")
aids["prop"] = aids["index"] / aids["index"].max()
print("The total number of ad IDs: {}".format(len(aid_counts)))

# Histogram of samples over the scaled movie rank.
ax = sns.distplot(aids["prop"], kde=False, bins=20)
ax.set_xlabel('proportion of movies')
ax.set_ylabel('number of samples');

In [None]:
# Split the movie IDs by their number of samples.
# Thresholds: 80 and 300.
#   - movies with at least 300 samples are the "big" ones, used for
#     pre-training the base model
#   - movies with at least 80 (and fewer than 300) samples are the "small"
#     ones, used for cold-start and warm-up
#   - movies with fewer than 80 samples are not used in this experiment
#     because no testing dataset can be built for them
is_big = cnts >= 300
is_small = (cnts >= 80) & ~is_big
small_aids = cnts.loc[is_small].index.tolist()
big_aids = cnts.loc[is_big].index.tolist()
print("The number of small ads:", len(small_aids))
print("The number of big ads:", len(big_aids))

In [None]:
# Split the samples according to the group their movie belongs to.
small_train = train.loc[train["MovieID"].isin(small_aids)]
big_train = train.loc[train["MovieID"].isin(big_aids)]
print("Shape of the small dataset:\t{}".format(small_train.shape))
print("Shape of the big dataset:\t{}".format(big_train.shape))
# Renumber the rows 0..n-1 so that index labels coincide with row positions.
big_train = big_train.reset_index(drop=True)
small_train = small_train.reset_index(drop=True)

In [None]:
# Persist the two main splits as pickles.
# Create the output directory first so that a fresh checkout does not fail
# with FileNotFoundError on the first open().
os.makedirs('../ME_data', exist_ok=True)

with open('../ME_data/big_train_main.pkl', 'wb') as f:
    pickle.dump(big_train, f)
with open('../ME_data/small_train_main.pkl', 'wb') as f:
    pickle.dump(small_train, f)

In [None]:
# pre-processing: get the few-shot batches
def _rows_by_movie(frame, ids):
    """Return {movie ID: index labels of that movie's rows in `frame`}.

    Only the IDs listed in `ids` are kept, in that order. A single groupby
    pass replaces one full boolean scan of `frame` per movie ID; labels within
    each group keep the row order of `frame`.
    """
    groups = frame.groupby("MovieID").groups
    return {ID: groups[ID] for ID in ids}


def _batch_inds(ids_ind, ids, start, stop=None):
    """Concatenate the slice [start:stop] of every ID's row labels.

    The per-ID slices are joined in the order of `ids`; `stop=None` takes
    everything from `start` to the end of each ID's rows.
    """
    return np.concatenate([ids_ind[ID][start:stop] for ID in ids])


train_IDs = big_aids
train_IDs_ind = _rows_by_movie(big_train, train_IDs)
test_IDs = small_aids
test_IDs_ind = _rows_by_movie(small_train, test_IDs)

minibatchsize = 20
# Big movies: four consecutive, non-overlapping batches of `minibatchsize`
# rows per movie (batch a = rows 0..19, b = 20..39, c = 40..59, d = 60..79).
train_oneshot_inds_a = _batch_inds(train_IDs_ind, train_IDs, 0, minibatchsize)
train_oneshot_inds_b = _batch_inds(train_IDs_ind, train_IDs, minibatchsize, 2*minibatchsize)
train_oneshot_inds_c = _batch_inds(train_IDs_ind, train_IDs, 2*minibatchsize, 3*minibatchsize)
train_oneshot_inds_d = _batch_inds(train_IDs_ind, train_IDs, 3*minibatchsize, 4*minibatchsize)
# Small movies: three such batches, then every remaining row of each movie
# forms the testing set.
test_oneshot_inds_a = _batch_inds(test_IDs_ind, test_IDs, 0, minibatchsize)
test_oneshot_inds_b = _batch_inds(test_IDs_ind, test_IDs, minibatchsize, 2*minibatchsize)
test_oneshot_inds_c = _batch_inds(test_IDs_ind, test_IDs, 2*minibatchsize, 3*minibatchsize)
test_test_inds = _batch_inds(test_IDs_ind, test_IDs, 3*minibatchsize)

# .iloc is positional while the arrays above hold index labels; the two agree
# only because big_train / small_train carry a 0..n-1 index (reset_index after
# the split).
train_oneshot_a = big_train.iloc[train_oneshot_inds_a]
train_oneshot_b = big_train.iloc[train_oneshot_inds_b]
train_oneshot_c = big_train.iloc[train_oneshot_inds_c]
train_oneshot_d = big_train.iloc[train_oneshot_inds_d]
test_oneshot_a = small_train.iloc[test_oneshot_inds_a]
test_oneshot_b = small_train.iloc[test_oneshot_inds_b]
test_oneshot_c = small_train.iloc[test_oneshot_inds_c]
test_test = small_train.iloc[test_test_inds]

In [None]:
# Report the size of the held-out rows of the small movies.
test_shape = test_test.shape
print("shape of the 'real' testing dataset:", test_shape)

In [None]:
# Persist every few-shot batch and the testing set as ../ME_data/<name>.pkl.
# Each file name is paired with its frame in one mapping so that a file can
# never be written with another batch's data (previously test_oneshot_b.pkl
# and test_oneshot_c.pkl both received test_oneshot_a).
os.makedirs("../ME_data", exist_ok=True)  # avoid FileNotFoundError on a fresh checkout

oneshot_frames = {
    "train_oneshot_a": train_oneshot_a,
    "train_oneshot_b": train_oneshot_b,
    "train_oneshot_c": train_oneshot_c,
    "train_oneshot_d": train_oneshot_d,
    "test_oneshot_a": test_oneshot_a,
    "test_oneshot_b": test_oneshot_b,
    "test_oneshot_c": test_oneshot_c,
    "test_test": test_test,
}
for name, frame in oneshot_frames.items():
    with open("../ME_data/{}.pkl".format(name), "wb") as f:
        pickle.dump(frame, f)