### Data Preprocessing

In [61]:
import ast
import itertools
from collections import Counter

import gensim
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from MovieClass import MovieClass, MultiMovieClass

In [62]:
# Data selection: keep only positive interactions (rating >= 4.0).
df = pd.read_csv('data_raw/ratings_small.csv')
df = df.loc[df.rating >= 4.0]

# Number of positively rated movies per user.
# Built with groupby/size instead of value_counts(): from pandas 2.0 on,
# value_counts() names the result's index 'userId', so also adding a 'userId'
# column made the merge below fail with an "index level and column label"
# ambiguity error.
df_n = df.groupby('userId').size().reset_index(name='ntotal')

# Attach the per-user total to every row and drop users with fewer than 30
# positively rated movies.
df = df.merge(df_n, on='userId')
df = df.loc[df.ntotal >= 30]

# Order each user's history chronologically; ratings sharing a timestamp are
# ordered from highest to lowest rating. The original code obtained this
# tie-break from two consecutive sorts (relying on sort stability); a single
# three-key sort states it explicitly.
df = df.sort_values(['userId', 'timestamp', 'rating'], ascending=[True, True, False])

# Position of each movie inside its user's ordered history (0-based).
df = df[['userId', 'movieId']]
df['count'] = df.groupby('userId').cumcount()
# df.to_csv('data/users_data.csv')
df

Unnamed: 0,userId,movieId,count
9,2,150,0
33,2,590,1
34,2,592,2
17,2,296,3
10,2,153,4
...,...,...,...
51501,671,1035,79
51565,671,5995,80
51541,671,3271,81
51567,671,6365,82


In [63]:
# Split every user's ordered history: positions 0-19 become the model input,
# everything from position 20 onwards is held out as ground truth.
in_history = df['count'] < 20
df_X = df.loc[in_history, ['userId', 'movieId']]
df_Y = df.loc[~in_history, ['userId', 'movieId']]

# Exports are currently disabled; a later cell reads files with these names.
# df_X.to_csv('data/users_history.csv')
# df_Y.to_csv('data/ground_truth.csv')

In [64]:
# Display the input split (the first 20 history positions of each user).
df_X

Unnamed: 0,userId,movieId
9,2,150
33,2,590
34,2,592
17,2,296
10,2,153
...,...,...
51513,671,1265
51494,671,551
51491,671,356
51527,671,2396


# Step-1: recall 

### "Movie2Movie" Co-visitation Matrix - Rating Weighted

In [13]:
# Rating-weighted co-visitation table.
# Self-join the ratings on userId to enumerate every ordered pair of distinct
# movies rated by the same user.
df = pd.read_csv('data_raw/ratings_small.csv').drop(columns='timestamp')
df = df.merge(df, on='userId')
df = df.loc[df['movieId_x'] != df['movieId_y']]

# Each pair contributes the second movie's rating scaled by 1/5; contributions
# are summed per (movieId_x, movieId_y) and ranked per movieId_x by weight.
df = (
    df.assign(wgt=df['rating_y'] / 5)[['movieId_x', 'movieId_y', 'wgt']]
    .groupby(['movieId_x', 'movieId_y'])['wgt']
    .sum()
    .reset_index()
    .sort_values(['movieId_x', 'wgt'], ascending=[True, False])
    .reset_index(drop=True)
)

# Keep the 20 heaviest neighbours of every movie and persist the table.
df = df.loc[df.groupby('movieId_x')['movieId_y'].cumcount() < 20]
df.to_csv('data/Movie2Movie.csv')

### Most Popular Movies

In [63]:
# Most-rated movies in the full dataset: value_counts() orders movie ids by
# how many ratings they received, so the first 50 index entries are the top 50.
df = pd.read_csv('data_raw/ratings.csv')
rating_counts = df['movieId'].value_counts()
top_view_movies = rating_counts.index.values[:50]
top_view_movies

array([  356,   318,   296,   593,  2571,   260,   480,   527,   110,
           1,  1210,   589,  1196,  2959,  1198,    50,  2858,   150,
         780,   858,  4993,   457,  1270,   592,    32,    47,   608,
        5952,  7153,   590,  2028,   588,   380,  2762,   377,  3578,
         364,   344,   648,  4306,  1721,  1580,   595,  1197,  4226,
         165,  1240,  1193,  1291, 58559])

In [64]:
# Turn per-movie rating counts into a coarse popularity bucket:
# one bucket per 9000 ratings, truncated to an integer.
df = df['movieId'].value_counts().reset_index()
df.columns = ['movieId', 'ratingNum']
df['populairty'] = df['ratingNum'] // 9000

# Persist only the id and its bucket, then preview the first rows.
df_pop = df.loc[:, ['movieId', 'populairty']]
df_pop.to_csv('data/popularity.csv')
df_pop.head()

Unnamed: 0,movieId,populairty
0,356,10
1,318,10
2,296,9
3,593,9
4,2571,8


### Same director and cast 

In [53]:
# df3 = pd.read_csv('data_raw/credits.csv')
# directors = []
# casts = []
# for i in range(len(df3)):
#     for member in eval(df3.iloc[i].values[1]):
#         directors.append(member['id'])
    
#     list = []
#     for member in eval(df3.iloc[i].values[0]):
#         list.append(member['id'])   
#     list = list[:10] if len(list) > 10 else list+[-1]*(10-len(list))
#     casts.append(list)

# df3['director'] = pd.DataFrame(directors)
# df_cast = pd.DataFrame(casts)
# df3 = pd.concat([df3, df_cast], axis=1)
# df3.rename(columns={'id': 'tmdbid', 0: 'cast1',1: 'cast2',2: 'cast3',3: 'cast4',4: 'cast5',5: 'cast6',6: 'cast7',7: 'cast8',8: 'cast9',9: 'cast10'}, inplace=True)
# df3 = df3[['tmdbid','director', 'cast1', 'cast2', 'cast3', 'cast4', 'cast5', 'cast6', 'cast7', 'cast8', 'cast9', 'cast10']]
# df3.to_csv('data/movie_director_casts.csv')
# df3.head()

Unnamed: 0,tmdbid,director,cast1,cast2,cast3,cast4,cast5,cast6,cast7,cast8,cast9,cast10
0,862,7879,31,12898,7167,12899,12900,7907,8873,1116442,12901,12133
1,8844,12891,2157,8537,205,145151,5149,10739,58563,1276,46530,56523
2,15602,7,6837,3151,13567,16757,589,16523,7166,-1,-1,-1
3,31357,12892,8851,9780,18284,51359,66804,352,87118,34,1276777,10814
4,11862,12893,67773,3092,519,70696,59222,18793,14592,20906,54348,209


# Step-2: Rerank

### word2vec

In [142]:
# Read the keywords file.
df1 = pd.read_csv('data_raw/keywords.csv')

# Build one training "sentence" per movie: the list of its keyword names.
# The second column is expected to hold the text of a Python list of dicts
# with a 'name' key. It is parsed with ast.literal_eval instead of eval so
# that text coming from the CSV can never execute arbitrary code.
wv_train_data = []
for raw_keywords in df1.iloc[:, 1]:
    key_list = [key['name'] for key in ast.literal_eval(raw_keywords)]
    if key_list:  # movies without keywords contribute no sentence
        wv_train_data.append(key_list)

model = Word2Vec(sentences=wv_train_data, vector_size=32, min_count=0, workers=5, window=5, epochs=30)
# NOTE(review): a later cell loads "word2vec_movie.model", not this file —
# confirm which model the rerank features are meant to use.
model.save("word2vec.model")

In [90]:
# Spot check: cosine similarity between the embeddings of two keywords.
# `model` must still be the Word2Vec model here; later cells rebind `model`.
cosine_similarity(model.wv['man'].reshape(1,-1), model.wv['vampire'].reshape(1,-1))

array([[0.3885254]], dtype=float32)

### load data

### Build features

feature1: rank of movie2movie, replace None by mean  

feature2: similarity of user and the movie  

feature3: user vector covariance  

feature4: movie popularity 



In [49]:
def df_covisitation_to_dict(df):
    """Map every ``movieId_x`` to the list of its ``movieId_y`` values.

    The lists keep the row order of ``df`` within each ``movieId_x`` group.
    """
    neighbours_by_movie = df.groupby('movieId_x')['movieId_y']
    as_lists = neighbours_by_movie.apply(list)
    return as_lists.to_dict()

def df_user_to_dict(df):
    """Map every ``userId`` to the list of its ``movieId`` values.

    The lists keep the row order of ``df`` within each user.
    """
    movies_by_user = df.groupby('userId')['movieId']
    as_lists = movies_by_user.apply(list)
    return as_lists.to_dict()


# Co-visitation table -> {movieId_x: [movieId_y, ...]}
df_cov = pd.read_csv('data/Movie2Movie.csv')
top_20_movies = df_covisitation_to_dict(df_cov)

# Per-user input history -> {userId: [movieId, ...]}
df_X = pd.read_csv('data/users_history.csv')
X_dic = df_user_to_dict(df_X)

# Per-user held-out ground truth -> {userId: [movieId, ...]}
df_y = pd.read_csv('data/ground_truth.csv')
gt_dic = df_user_to_dict(df_y)

# Side tables: director/cast ids and popularity buckets.
df_dc = pd.read_csv('data/movie_director_casts.csv')
df_pop = pd.read_csv('data/popularity.csv')

# NOTE(review): the word2vec cell in this notebook saves "word2vec.model";
# confirm where "word2vec_movie.model" is produced.
model_wv = Word2Vec.load("word2vec_movie.model")

In [50]:
def get_label(df_recall, df_y):
    """Build one-hot labels for recalled (user, movie) candidates.

    Parameters
    ----------
    df_recall : DataFrame with ``userId`` and ``movieId`` columns; one row per
        recalled candidate.
    df_y : DataFrame with ``userId`` and ``movieId`` columns holding the
        ground-truth movies of each user.

    Returns
    -------
    numpy.ndarray of shape ``(len(df_recall), 2)``: ``[0, 1]`` when the
    candidate movie is in the user's ground truth, ``[1, 0]`` otherwise.
    Users with no ground-truth rows get ``[1, 0]`` for all their candidates
    (previously a ``KeyError``), and an empty ``df_recall`` yields an array
    of shape ``(0, 2)``.
    """
    # Sets make each membership test O(1) instead of scanning a list.
    groundtruth = df_y.groupby('userId').movieId.apply(set).to_dict()

    # Iterate the two columns directly rather than building a row Series per
    # candidate with iloc.
    label = [
        [0, 1] if int(movieid) in groundtruth.get(int(userid), ()) else [1, 0]
        for userid, movieid in zip(df_recall['userId'], df_recall['movieId'])
    ]
    # reshape keeps the (n, 2) layout even when there are no candidates.
    return np.array(label, dtype=int).reshape(-1, 2)



# Candidate rows with their features, and the matching one-hot labels.
df_recall = pd.read_csv('data/dataset_feature.csv')
df_x = df_recall.iloc[:, 2:]
label = get_label(df_recall, df_y)

# The feature matrix drops one further leading column of df_x
# (presumably an id column — verify against dataset_feature.csv).
features = df_x.values[:, 1:]

# Sequential split: the first 27447 rows train, the remainder tests.
x_train, x_test = features[:27447], features[27447:]
y_train, y_test = label[:27447], label[27447:]

SVC

In [40]:
from sklearn.svm import SVC

# SVC needs integer class ids, so collapse the one-hot labels with argmax.
svc_targets = np.argmax(np.array(y_train), axis=1)

model = SVC(C=0.1, gamma=0.001).fit(x_train, svc_targets)

# Predictions are taken on the training rows themselves.
preditction = model.predict(x_train)
preditction[0:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [29]:
# First 100 training labels as class ids (argmax of the one-hot rows).
np.argmax(np.array(y_train), axis=1)[0:100]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0])

MLP

In [60]:
import torch
import torch.nn as nn
import torch.utils.data as Data

class MLP_3layer(nn.Module):
    """Fully connected network: two ReLU hidden layers and a 2-unit output."""

    def __init__(self, n_input, n_hidden1, n_hidden2):
        """Create the three linear layers and the shared ReLU activation."""
        super().__init__()
        self.fc1 = nn.Linear(n_input, n_hidden1)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(n_hidden1, n_hidden2)
        self.fc3 = nn.Linear(n_hidden2, 2)

    def forward(self, x):
        """Flatten each sample, apply both hidden layers, return raw outputs."""
        hidden = x.view(x.size(0), -1)
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.fc3(hidden)

class MLP_16layer(nn.Module):
    """Fully connected network with six linear layers (despite the name).

    Layer widths: n_input -> 50 -> 50 -> 30 -> 20 -> 10 -> 2, with ReLU after
    every layer except the last.
    """

    def __init__(self, n_input):
        """Create layers fc1..fc6 and the shared ReLU activation."""
        super().__init__()
        self.fc1 = nn.Linear(n_input, 50)
        self.relu = nn.ReLU()
        # Remaining layers follow a fixed width schedule; consecutive pairs of
        # widths give the (in, out) sizes of fc2..fc6.
        widths = (50, 50, 30, 20, 10, 2)
        for idx, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=2):
            setattr(self, f"fc{idx}", nn.Linear(fan_in, fan_out))

    def forward(self, x):
        """Flatten each sample, run fc1..fc5 with ReLU, return fc6's output."""
        hidden = x.view(x.size(0), -1)
        for idx in range(1, 6):
            hidden = self.relu(getattr(self, f"fc{idx}")(hidden))
        return self.fc6(hidden)

def train(model, epoch_num, trainloader, optimizer, loss_func, device="cpu"):
    """Train ``model`` for ``epoch_num`` epochs and return the loss history.

    Parameters
    ----------
    model : the network to optimise; switched to training mode and moved to
        ``device``.
    epoch_num : number of passes over ``trainloader``.
    trainloader : iterable yielding ``(x, y)`` batches of tensors.
    optimizer : optimiser already bound to ``model``'s parameters.
    loss_func : callable ``loss_func(output, y)`` returning a scalar tensor.
    device : device name or ``torch.device``; defaults to ``"cpu"`` (the
        previously hard-coded value).

    Returns
    -------
    list of float: sample-weighted mean loss of every epoch. The history was
    previously collected but discarded.

    Raises
    ------
    ValueError: if ``trainloader`` yields no samples in an epoch.
    """
    device = torch.device(device)
    model.train()
    model.to(device)
    train_loss_all = []
    for epoch in range(epoch_num):
        train_loss = 0.0
        train_num = 0
        for x, y in trainloader:
            x, y = x.to(device), y.to(device)  # move batch to device
            optimizer.zero_grad()
            output = model(x)
            loss = loss_func(output, y)
            loss.backward()
            optimizer.step()
            # Weight each batch loss by its size so a short final batch does
            # not distort the epoch average.
            train_loss += loss.item() * x.size(0)
            train_num += x.size(0)
        if train_num == 0:
            raise ValueError("trainloader yielded no samples")
        epoch_loss = train_loss / train_num
        train_loss_all.append(epoch_loss)
        if epoch % 10 == 0:
            print(f'epoch:{epoch}, loss:{epoch_loss}')
    return train_loss_all

def test(model, testloader, loss_func):
    device = torch.device("cpu")
    model.eval()  # set model to evaluation mode
    running_loss = 0
    num = 0
    with torch.no_grad():  # no need to compute gradients for testing
        for step, (x, y) in enumerate(testloader):
            x, y = x.to(device), y.to(device)
            output = model(x)
            # if step == 1:
            #     print(output[:10], y[:10])
            loss = loss_func(output, y)  # Compute loss
            running_loss += loss.item() * x.size(0)
            num += x.size(0)
    return running_loss / num

def predict(model, testloader, loss_func):
    """Run ``model`` over ``testloader`` and return its raw outputs.

    Returns a list with one entry per sample, each the model's output row as
    a list of floats. ``loss_func`` is accepted but not used.
    """
    cpu = torch.device("cpu")
    model.eval()
    rows = []
    with torch.no_grad():
        for inputs, targets in testloader:
            inputs, targets = inputs.to(cpu), targets.to(cpu)
            rows.extend(model(inputs).cpu().numpy().tolist())
    return rows


# TensorDataset only accepts tensors, so convert the feature/label arrays to
# float32 tensors first. np.asarray also accepts values that are already CPU
# tensors, which keeps this cell re-runnable.
x_train, x_test = (torch.from_numpy(np.asarray(a, dtype=np.float32)) for a in (x_train, x_test))
y_train, y_test = (torch.from_numpy(np.asarray(a, dtype=np.float32)) for a in (y_train, y_test))

train_data = Data.TensorDataset(x_train, y_train)
test_data = Data.TensorDataset(x_test, y_test)

train_loader = Data.DataLoader(dataset=train_data, batch_size=64)
test_loader = Data.DataLoader(dataset=test_data, batch_size=64)

# Fix the weight initialisation so repeated runs are comparable.
torch.manual_seed(0)

# The six-linear-layer network is defined as MLP_16layer (MLP_6layer does not
# exist). The input width is taken from the data; the recorded run used 97.
model = MLP_16layer(x_train.shape[1])
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-4)
# Targets are one-hot float rows, scored against the two raw outputs.
criterion = nn.BCEWithLogitsLoss()
train(model, 100, train_loader, optimizer, criterion)
prediction = predict(model, test_loader, criterion)

epoch:0, loss:0.7028158435939843
epoch:10, loss:0.5569036628599136
epoch:20, loss:0.5527573629589938
epoch:30, loss:0.5492094877913084
epoch:40, loss:0.5456992537461852
epoch:50, loss:0.542209793434476
epoch:60, loss:0.5388248878589079
epoch:70, loss:0.5355566365799539
epoch:80, loss:0.5323120706975318
epoch:90, loss:0.5289868608882059


In [58]:
# First 100 raw two-value model outputs returned by predict() on the test loader.
prediction[0:100]

[[1.0922129154205322, -1.092637062072754],
 [1.092210292816162, -1.092635989189148],
 [1.0922008752822876, -1.0926252603530884],
 [1.0922061204910278, -1.092631220817566],
 [1.0922120809555054, -1.0926367044448853],
 [1.0922142267227173, -1.0926374197006226],
 [1.0922132730484009, -1.0926371812820435],
 [1.0922112464904785, -1.0926340818405151],
 [1.092215657234192, -1.0926376581192017],
 [1.0922147035598755, -1.0926387310028076],
 [1.0922081470489502, -1.0926337242126465],
 [1.0922138690948486, -1.092637300491333],
 [1.0922142267227173, -1.0926387310028076],
 [1.0922129154205322, -1.0926376581192017],
 [1.0922130346298218, -1.0926382541656494],
 [1.0922168493270874, -1.0926395654678345],
 [1.0922162532806396, -1.0926378965377808],
 [1.0922144651412964, -1.0926374197006226],
 [1.092215657234192, -1.0926384925842285],
 [1.0922150611877441, -1.0926387310028076],
 [1.092215895652771, -1.0926399230957031],
 [1.0922092199325562, -1.0926344394683838],
 [1.0922129154205322, -1.092635750770568

In [55]:
# First 100 one-hot training labels, for comparison with the model outputs.
y_train[0:100]

tensor([[1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1