In [None]:
import numpy as np
import pandas as pd
import torch
import warnings
warnings.filterwarnings('ignore')

# Dataset


In [None]:
import pickle
# Load the pickled review table (later cells iterate it with df.itertuples()).
# The context manager closes the file handle, which the bare open() call leaked.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this pickle if it comes from a trusted source.
with open('data/imdb/imdb_reviews_1000film.df', 'rb') as reviews_file:
    df = pickle.load(reviews_file)

In [None]:
# Attribute name for each position of a movies_dic record; later cells look up
# cols[c] next to movies_dic[movie][c] when serializing a movie as text.
cols = dict(enumerate([
    'actor', 'actor', 'actor',
    'color',
    'rating',
    'director',
    'genre',
    'language',
    'company',
    'country',
    'release date',
    'year',
]))

In [None]:
import datetime
import csv

# Build movies_dic: one attribute list per movie, keyed by CSV column 12 with
# underscores replaced by spaces. Only the first row seen for a key is kept.
movies_dic = {}
with open('data/imdb/imdb_movielens.csv') as csv_file:
    rows = csv.reader(csv_file, delimiter=',')
    next(rows)  # skip the first row (presumably the header - confirm)
    for row in rows:
        title = row[12].replace('_', ' ')
        if title in movies_dic:
            continue

        # Columns 0-9 are copied as text with underscores turned into spaces.
        movies_dic[title] = [value.replace('_', ' ') for value in row[0:10]]

        # Column 10: the first four characters are used as the year and the
        # remainder as a month number, which is rendered as a month name.
        month, year = '', ''
        if len(row[10]) > 0:
            month = datetime.date(1900, int(row[10][4:]), 1).strftime('%B')
            year = row[10][0:4]
        movies_dic[title].append(month.lower() + ' ' + year)

        # Column 14 is stored as an integer (parsed via float first).
        movies_dic[title].append(int(float(row[14])))

In [None]:
import networkx as nx
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import word_tokenize
import nltk
# Porter stemmer instance; no other cell visible in this notebook references it.
ps = nltk.stem.PorterStemmer()

def return_n_grams(text,k):
    """Return the set of distinct k-grams found in ``text``.

    ``text`` is split with ``word_tokenize``; every window of ``k`` consecutive
    tokens is joined with single spaces. When the text has fewer than ``k``
    tokens the result is an empty set.
    """
    words = word_tokenize(text)
    window_starts = range(len(words) - k + 1)
    return {' '.join(words[start:start + k]) for start in window_starts}


def find_all_n_grams (text,n):
    """Return a list with every distinct k-gram of ``text`` for k = 1..n.

    The k-grams of each size come from ``return_n_grams`` and are concatenated
    in order of increasing k.
    """
    collected = []
    for size in range(1, n + 1):
        collected.extend(return_n_grams(text, size))
    return collected


import re
def normalize_text(text):
    """Normalize free text into lowercase word tokens separated by single spaces.

    Steps, in order:
      1. runs of ``#`` are replaced by a space;
      2. ``@`` followed by ASCII letters/digits (a mention) is replaced by a space;
      3. every non-word character is replaced by a space;
      4. stand-alone single word characters (e.g. ``a``, ``5``) are removed;
      5. whitespace is collapsed, the ends are trimmed and the text lowercased.

    Args:
        text: input string.

    Returns:
        The normalized string (possibly empty).
    """
    text = re.sub(r'#+', ' ', text)
    text = re.sub(r'@[A-Za-z0-9]+', ' ', text)
    # Raw strings avoid the invalid-escape-sequence warning that '\W' and '\s'
    # trigger in plain string literals.
    # After this substitution only word characters and spaces remain, so an
    # explicit possessive "'s" rule can never match afterwards; the lone "s"
    # left behind is removed by the single-character rule below.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\b\w\b', ' ', text)
    # A single collapse + strip is enough: no tabs or newlines survive step 3.
    return re.sub(r'\s+', ' ', text).strip().lower()



In [None]:
from tqdm import tqdm

# Graph with one 'Row' node per movie, one 'Column' node per attribute position
# and one 'Token' node per n-gram; each token is linked to the row and the
# column it was found in.
G=nx.Graph()
K = 3  # n-grams of size 1..K are extracted from every value

i = 0
nodes_labels = {}
row_ids = {}   # row node name -> movie key
id_rows = {}   # movie key -> row node name

for i, movie in enumerate(tqdm(movies_dic, position=0), start=1):
    row_name = 'RW' + str(i)
    G.add_node(row_name, label=row_name, type='Row')
    row_ids[row_name] = movie
    id_rows[movie] = row_name

    # Column numbering counts every position, including skipped empty values.
    for j, cl in enumerate(movies_dic[movie], start=1):
        if cl == '':
            continue
        col_name = 'CL' + str(j)
        if not G.has_node(col_name):
            G.add_node(col_name, label=col_name, type='Column')

        for gram in find_all_n_grams(str(cl), K):
            tg = gram.replace(' ', '_')
            G.add_node(tg, label=tg, type='Token')
            G.add_edge(row_name, tg)
            G.add_edge(col_name, tg)
            
            
# Add one 'Review' node per review whose movie is in movies_dic and connect it
# to every token node that already exists in G.
# review_ids: review node name -> raw review text.
# id_review:  normalized review text -> review node name. Reviews that
#             normalize to the same text share one key; the later name wins.
i = 0
review_ids = {}
id_review = {}

for row in tqdm(df.itertuples()):
    if row.movie.lower() in movies_dic:
        i += 1
        text = remove_stopwords(normalize_text(row.user_review.lower()))
        review_name = 'Review' + str(i)
        G.add_node(review_name, label=review_name, type='Review')
        review_ids[review_name] = row.user_review
        id_review[text] = review_name

        # Reviews never create new tokens; they only attach to existing nodes.
        for gram in find_all_n_grams(text, K):
            tg = gram.replace(' ', '_')
            if G.has_node(tg) and not G.has_edge(review_name, tg):
                G.add_edge(review_name, tg)
    

In [None]:
# ground_truth: lowercase movie name -> list of review node names written for
# it, looked up through the normalized review text.
ground_truth = {}
for row in tqdm(df.itertuples()):
    movie_name = row.movie.lower()
    if movie_name not in movies_dic:
        continue
    review_key = remove_stopwords(normalize_text(row.user_review.lower()))
    ground_truth.setdefault(movie_name, []).append(id_review[review_key])

# Method0: BM25

In [None]:
from gensim.summarization.bm25 import get_bm25_weights
from gensim.summarization.bm25 import BM25

In [None]:
# Whitespace-tokenized reviews, in id_review iteration order, indexed with BM25.
corpus = [review_text.split() for review_text in id_review]
results = BM25(corpus)
# Identity map over the normalized review texts.
review_full = {review_text: review_text for review_text in id_review}

In [None]:
import numpy as np
# Rank every review for every ground-truth movie with BM25.
# movie_review_BM25: row node name -> [(review node name, score), ...] sorted
# by decreasing score.
movie_review_BM25 = {}
for movie in tqdm(ground_truth):
    m_id = id_rows[movie]
    # Each value is passed through str() because the record's last field is an
    # int, so joining the raw values raises TypeError.
    text = ' '.join(str(value) for value in movies_dic[movie])

    #text += ' ' + movie
    scores = results.get_scores(text.split())
    topK = np.array(scores).argsort()[::-1]
    # corpus[idx] is the token list of one normalized review; re-joining it
    # gives back the id_review key.
    movie_review_BM25[m_id] = [(id_review[' '.join(corpus[idx])], scores[idx]) for idx in topK]

In [None]:
# Report MRR / MAP / has-positive for the BM25 ranking at several cut-offs KK.
# NOTE(review): `utils` is not imported in any cell visible in this notebook;
# confirm the module is available before running this cell.
for KK in [1,2,3,5,10,20,5000000]:
    i = 0
    # The reciprocal-rank accumulator is named MR (as in the other evaluation
    # cells) so it does not rebind the notebook-level name `MRR`.
    MAP, MR, hasP = 0,0,0

    for movie in movie_review_BM25:
        #if row_ids[movie] not in ground_truth: continue
        i+=1
        preds = [f for (f,j) in movie_review_BM25[movie]][0:KK]
        golds = list(ground_truth[row_ids[movie]])

        MAP += utils.MAP_K(golds,preds)
        MR += utils.MRR(golds,preds)
        hasP += utils.HAS_POSITIVE(golds,preds)

    print('\n#################### ' + str(KK) + ' ###########################\n')
    print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)

# Method1: Unsupervised SentenceBERT

In [None]:
########################## METRICS

def HAS_POSITIVE(actual,preds):
    """Return 1 if at least one element of ``preds`` occurs in ``actual``, else 0."""
    return 1 if any(candidate in actual for candidate in preds) else 0

def MRR(actual,preds):
    """Return the reciprocal rank of the first element of ``preds`` found in ``actual``.

    Ranks start at 1; 0 is returned when no prediction is in ``actual``.
    """
    for rank, candidate in enumerate(preds, start=1):
        if candidate in actual:
            return 1/rank
    return 0


def MAP_K(actual,preds):
    """Average precision of the ranked list ``preds`` against ``actual``.

    For every position whose prediction is in ``actual`` the precision at that
    position (hits so far / rank) is accumulated; the sum is divided by
    ``len(actual)``.

    Args:
        actual: collection of relevant items.
        preds: ranked predictions (already truncated to the desired cut-off).

    Returns:
        The average precision, or 0 when ``actual`` is empty (instead of
        raising ZeroDivisionError).
    """
    if len(actual) == 0:
        return 0

    precision = 0
    hit = 0
    for rank, candidate in enumerate(preds, start=1):
        if candidate in actual:
            hit += 1
            precision += hit/rank
    return precision/len(actual)


In [None]:
from gensim.parsing.preprocessing import remove_stopwords

# Single-word vocabulary: names of 'Token' nodes of G that contain no
# underscore (multi-word n-grams were joined with '_').
vocabs = {
    node
    for node in G.nodes()
    if G.nodes()[node]['type'] == 'Token' and '_' not in node
}

def return_filtered(text):
    """Normalize ``text``, drop stopwords and keep only tokens present in ``vocabs``.

    The kept tokens are returned in order, each followed by one space (so a
    non-empty result ends with a trailing space).
    """
    cleaned = remove_stopwords(normalize_text(text))
    kept = [token for token in word_tokenize(cleaned) if token in vocabs]
    return ''.join(token + ' ' for token in kept)

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the pretrained SentenceTransformer checkpoint used by the unsupervised ranking cells.
# NOTE(review): the name `model` is rebound further down (Keras Sequential, then
# Doc2Vec); re-run this cell before re-running any cell that calls model.encode.
model = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

In [None]:
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
# Encode every review after vocabulary filtering; the embeddings follow the
# iteration order of id_review.
filtered_reviews = [return_filtered(review_text) for review_text in id_review]
review_embs_f = model.encode(filtered_reviews, show_progress_bar=True)

In [None]:
# Register '[COL]' and '[VAL]' as additional special tokens on the tokenizer of
# the model's first module, then resize that module's token-embedding matrix to
# the new tokenizer length.
special_tokens_dict = {'additional_special_tokens': ['[COL]','[VAL]']}

word_embedding_model = model._first_module()   #Your models.Transformer object
word_embedding_model.tokenizer.add_special_tokens(special_tokens_dict)
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

In [None]:
import time
# Unsupervised ranking: score every review against each ground-truth movie by
# cosine similarity between the filtered movie text and the filtered review.
# pred_reviews: movie -> [(review node name, similarity), ...] by decreasing
# similarity.
st = time.time()
pred_reviews = {}

# review_embs_f was encoded by iterating id_review, so entry k belongs to the
# k-th id_review value. Taking names from review_ids instead misaligns (or
# overruns) whenever two reviews normalize to the same text.
review_names = list(id_review.values())

for movie in tqdm(ground_truth,position=0):
    text = '[COL] title [VAL] ' + movie + ' '
    for c in range(0,len(movies_dic[movie])):
        text += ' [COL] ' + str(cols[c]) + ' [VAL] ' + str(movies_dic[movie][c])

    m_emb = model.encode(return_filtered(text))

    # One vectorized call scores the movie against all reviews at once.
    sims = cosine_similarity(m_emb.reshape(1, -1), review_embs_f)[0]
    pred_reviews[movie] = sorted(zip(review_names, sims), key=lambda pair: pair[1], reverse=True)
  
#print(time.time()-st)

In [None]:
# Report MRR / MAP / has-positive for pred_reviews at several cut-offs KK.
for KK in [1,2,3,5,10,20,50,5000]:
    i = 0
    precision,recall,fs = 0,0,0
    MAP, MR, hasP = 0,0,0

    for movie, ranked in pred_reviews.items():
        if movie in ground_truth:
            i += 1
            preds = [name for (name, score) in ranked][:KK]
            golds = list(ground_truth[movie])

            MAP += MAP_K(golds,preds)
            MR += MRR(golds,preds)
            hasP += HAS_POSITIVE(golds,preds)

    print('\n#################### ' + str(KK) + ' ###########################\n')
    print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)

# Method2: Supervised SentenceBERT

In [None]:
from sentence_transformers import SentenceTransformer
# Separate SentenceTransformer instance of the same checkpoint, used by the
# supervised cells to encode the serialized movie text.
SBmodel = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

In [None]:
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense

In [None]:
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Build the supervised training pairs: one [cosine similarity, label] row per
# (movie, review) pair, where label is 1 when the review belongs to the movie.
st = time.time()
movie_review_score = []

# Reviews are encoded with SBmodel, the same instance that encodes the movie
# text below. The name `model` is rebound to a Keras network in a later cell,
# so it must not be used for encoding here.
review_texts = list(id_review)
review_names = [id_review[review_text] for review_text in review_texts]
review_embs = SBmodel.encode(review_texts)

for movie in tqdm(ground_truth,position=0):
    text = '[COL] title [VAL] ' + movie + ' '
    for c in range(0,len(movies_dic[movie])):
        text += ' [COL] ' + str(cols[c]) + ' [VAL] ' + str(movies_dic[movie][c])

    m_emb = SBmodel.encode(text)

    # Set membership and a single vectorized similarity call replace the
    # per-review key-list rebuild and pairwise cosine calls.
    gold = set(ground_truth[movie])
    sims = cosine_similarity(m_emb.reshape(1, -1), review_embs)[0]
    for name, sim in zip(review_names, sims):
        movie_review_score.append([sim, 1 if name in gold else 0])

In [None]:
import numpy as np

# Column 0 holds the similarity feature, column 1 the binary label.
dataset = np.asarray(movie_review_score)
X = dataset[:, :1]
y = dataset[:, 1]

In [None]:
import keras
from keras import losses,optimizers

# Small feed-forward binary classifier over the single similarity feature.
# NOTE(review): this rebinds `model`, which earlier cells use for the
# SentenceTransformer encoder.
model = Sequential([
    Dense(20, input_dim=1, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss=keras.losses.binary_crossentropy,
    optimizer=keras.optimizers.Adam(lr=1e-3),
    metrics=['accuracy'],
)

In [None]:
# Train the classifier and print the elapsed wall-clock seconds.
st = time.time()
# Label 1 is weighted 50x relative to label 0.
class_weight = {0: 1., 1: 50.}

model.fit(
    X,
    y,
    epochs=150,
    batch_size=2048,
    class_weight=class_weight,
)
print(time.time() - st)

In [None]:
# Score every review for every ground-truth movie with the trained classifier.
# movie_predictions: movie -> [(review node name, predicted score), ...] by
# decreasing score.
movie_predictions = {}
st = time.time()

# review_embs follows id_review's iteration order; build the matching list of
# review names once instead of rebuilding the key list for every review.
seen = list(id_review.values())

for movie in tqdm(ground_truth,position=0):
    text = '[COL] title [VAL] ' + movie + ' '
    for c in range(0,len(movies_dic[movie])):
        text += ' [COL] ' + str(cols[c]) + ' [VAL] ' + str(movies_dic[movie][c])

    m_emb = SBmodel.encode(text)

    # All similarities in one call; reshaped to (n_reviews, 1) to match the
    # classifier's single input feature explicitly.
    data = cosine_similarity(m_emb.reshape(1, -1), review_embs)[0]
    res = model.predict(np.asarray(data).reshape(-1, 1))

    scores = [(seen[k], res[k][0]) for k in range(len(res))]
    movie_predictions[movie] = sorted(scores, key=lambda dist: dist[1],reverse = True)

In [None]:
# Report MRR / MAP / has-positive for movie_predictions at several cut-offs KK.
for KK in [1,2,3,5,10,20,50,50000]:
    i = 0
    precision,recall,fs = 0,0,0
    MAP, MR, hasP = 0,0,0

    evaluated = [m for m in movie_predictions if m in ground_truth]
    for movie in evaluated:
        i += 1
        ranked_names = [name for (name, score) in movie_predictions[movie]]
        preds = ranked_names[0:KK]
        golds = list(ground_truth[movie])

        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)

    print('\n#################### ' + str(KK) + ' ###########################\n')
    print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)

# Method3: Supervised SentenceBERT - n_sentence

In [None]:
from sentence_transformers import SentenceTransformer
# Reload the same SentenceTransformer checkpoint for Method3 (rebinds SBmodel).
SBmodel = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

In [None]:
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense

In [None]:
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# review_embeds:      normalized review text -> embedding of the whole text.
# review_embeds_sent: normalized review text -> list of per-sentence embeddings.
# NOTE(review): the id_review keys were produced by normalize_text, which
# replaces every non-word character with a space, so sent_tokenize receives
# text without punctuation - presumably it yields at most one sentence; confirm.
review_embeds = {}
review_embeds_sent = {}
for rv in tqdm(id_review,position=0):
    review_embeds[rv] = SBmodel.encode(rv)
    review_embeds_sent[rv] = [
        SBmodel.encode(sentence)
        for sentence in nltk.tokenize.sent_tokenize(rv)
    ]


In [None]:
# Build Method3 training rows. Each row holds: up to three highest sentence
# similarities, then the whole-review similarity, zero padding up to K entries,
# and finally the 0/1 label.
# NOTE(review): the whole-review similarity lands right after however many
# sentence similarities exist, so its column depends on the sentence count; the
# prediction cell builds its rows the same way.
K = 5
movie_review_score = []
for movie in tqdm(ground_truth,position=0):
    fields = movies_dic[movie]
    text = '[COL] title [VAL] ' + movie + ' ' + ''.join(
        ' [COL] ' + str(cols[c]) + ' [VAL] ' + str(fields[c])
        for c in range(len(fields))
    )

    m_emb = SBmodel.encode(text)

    for review in id_review:
        sentence_sims = [
            cosine_similarity(m_emb.reshape(1, -1), sent.reshape(1, -1))[0][0]
            for sent in review_embeds_sent[review]
        ]
        features = sorted(sentence_sims, reverse=True)[0:3]
        features.append(cosine_similarity(m_emb.reshape(1, -1), review_embeds[review].reshape(1, -1))[0][0])

        # Pad with zeros up to K entries (no-op when already long enough).
        features.extend([0] * (K - len(features)))

        features.append(1 if id_review[review] in ground_truth[movie] else 0)
        movie_review_score.append(features)



In [None]:
import numpy as np

# Columns 0-3 are the similarity features; column 5 is the label (column 4 is
# not used).
dataset = np.asarray(movie_review_score)
X = dataset[:, :4]
y = dataset[:, 5]

In [None]:
import keras
from keras import losses,optimizers

# Feed-forward binary classifier over the four similarity features.
# NOTE(review): rebinds the notebook-level name `model`.
model = Sequential([
    Dense(20, input_dim=4, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss=keras.losses.binary_crossentropy,
    optimizer=keras.optimizers.Adam(lr=1e-3),
    metrics=['accuracy'],
)

In [None]:
# Train the four-feature classifier and print the elapsed wall-clock seconds.
st = time.time()
# Label 1 is weighted 50x relative to label 0.
class_weight = {0: 1., 1: 50.}

model.fit(
    X,
    y,
    epochs=150,
    batch_size=2048,
    class_weight=class_weight,
)
print(time.time() - st)

In [None]:
# Score every review for every ground-truth movie with the four-feature
# classifier. Rows are built like the training rows, padded to K-1 entries and
# without the label.
movie_predictions = {}

movie_review_score = []
for movie in tqdm(ground_truth,position=0):
    fields = movies_dic[movie]
    text = '[COL] title [VAL] ' + movie + ' ' + ''.join(
        ' [COL] ' + str(cols[c]) + ' [VAL] ' + str(fields[c])
        for c in range(len(fields))
    )

    m_emb = SBmodel.encode(text)

    seen = []
    data = []

    for review in id_review:
        seen.append(id_review[review])

        sentence_sims = [
            cosine_similarity(m_emb.reshape(1, -1), sent.reshape(1, -1))[0][0]
            for sent in review_embeds_sent[review]
        ]
        features = sorted(sentence_sims, reverse=True)[0:3]
        features.append(cosine_similarity(m_emb.reshape(1, -1), review_embeds[review].reshape(1, -1))[0][0])

        # Zero-pad up to K-1 entries (no-op when already long enough).
        features.extend([0] * ((K - 1) - len(features)))
        data.append(features)

    res = model.predict(np.array(data))

    scores = [(seen[k], res[k][0]) for k in range(len(res))]
    movie_predictions[movie] = sorted(scores, key=lambda dist: dist[1],reverse = True)

In [None]:
for KK in [1,2,3,5,10,20,50,50000]: 
    i = 0
    precision,recall,fs = 0,0,0
    MAP, MR, hasP = 0,0,0

    for movie in movie_predictions:
        if movie not in ground_truth: continue
        i+=1
        preds = [f for (f,j) in movie_predictions[movie]][0:KK]
        golds = [f for f in ground_truth[movie]]

        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)
        
    print('\n#################### ' + str(KK) + ' ###########################\n')
    print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)

# Method4: Reranking

In [None]:
import dlib

In [None]:
# Alias the current movie_predictions (the most recently run prediction cell) for the re-ranker.
movie_predictions_SB = movie_predictions
# Self-assignment: leaves the value unchanged; it only fails (NameError) if the
# BM25 cell has not been run.
movie_review_BM25 = movie_review_BM25

In [None]:
# Empty dlib ranking_pair; the next cell appends to its .relevant and .nonrelevant lists.
data = dlib.ranking_pair()

In [None]:
# Train a dlib rank-SVM on 2-D vectors [BM25 rank, neural rank] (1-based ranks),
# split into relevant / non-relevant according to ground_truth.
st = time.time()
for movie in tqdm(movies_dic,position=0):
    if movie not in movie_predictions_SB or id_rows[movie] not in movie_review_BM25: continue

    # Precompute name -> 1-based rank once per movie. setdefault keeps the
    # first occurrence, matching list.index, without a linear scan per review.
    bm_rank, sb_rank = {}, {}
    for pos, (name, _score) in enumerate(movie_review_BM25[id_rows[movie]], start=1):
        bm_rank.setdefault(name, pos)
    for pos, (name, _score) in enumerate(movie_predictions_SB[movie], start=1):
        sb_rank.setdefault(name, pos)

    gold = set(ground_truth[movie])
    for r in review_ids:
        pair = dlib.vector([bm_rank[r], sb_rank[r]])
        if r in gold:
            data.relevant.append(pair)
        else:
            data.nonrelevant.append(pair)

trainer = dlib.svm_rank_trainer()
trainer.c = 1000

rank = trainer.train(data)
print(time.time()-st)

In [None]:
# Re-rank every review per movie with the trained ranking function.
# rerank_imdb: movie -> [(review node name, rank score), ...] by decreasing
# score. Prints the average seconds spent per re-ranked movie.
st = time.time()
i=0
rerank_imdb = {}

for movie in tqdm(movies_dic,position=0):
    # movie_predictions_SB is used throughout (condition and ranks) so the
    # filter and the lookup always refer to the same predictions.
    if movie not in movie_predictions_SB or id_rows[movie] not in movie_review_BM25: continue
    i+=1

    # name -> 1-based rank, first occurrence kept; avoids list.index per review.
    bm_rank, sb_rank = {}, {}
    for pos, (name, _score) in enumerate(movie_review_BM25[id_rows[movie]], start=1):
        bm_rank.setdefault(name, pos)
    for pos, (name, _score) in enumerate(movie_predictions_SB[movie], start=1):
        sb_rank.setdefault(name, pos)

    temp = [(r, rank(dlib.vector([bm_rank[r], sb_rank[r]]))) for r in review_ids]
    rerank_imdb[movie] = sorted(temp, key=lambda dist: dist[1],reverse = True)

# Guard against ZeroDivisionError when no movie qualified.
print((time.time()-st)/i if i else 0.0)

In [None]:
# Report MRR / MAP / has-positive for the re-ranked lists at several cut-offs KK.
for KK in [1,2,3,5,10,20,50,50000]:
    i = 0
    precision,recall,fs = 0,0,0
    MAP, MR, hasP = 0,0,0

    for movie, ranked in rerank_imdb.items():
        if movie not in ground_truth:
            continue
        i += 1
        preds = [name for (name, score) in ranked][0:KK]
        golds = list(ground_truth[movie])

        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)

    print('\n#################### ' + str(KK) + ' ###########################\n')
    print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)

# Method5: Doc2Vec

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# Doc2Vec training corpus: one document per movie (its attribute values plus
# the movie key) followed by one document per normalized review text.
data = []
for movie in movies_dic:
    # str() is required because the record's last field is an int; joining the
    # raw values raises TypeError.
    text = ' '.join(str(value) for value in movies_dic[movie])
    text += ' ' + movie
    data.append(text)

data.extend(id_review)

# Each document is tagged with its position in `data`, as a string.
tagged_data = [TaggedDocument(words=word_tokenize(_d), tags=[str(i)]) for i, _d in enumerate(data)]

In [None]:
%env PYTHONHASHSEED=0
max_epochs = 10
vec_size = 300

# NOTE(review): this rebinds the notebook-level name `model` to the Doc2Vec model.
# The constructor call was missing its closing parenthesis (SyntaxError).
model = Doc2Vec(size=vec_size, min_count=10, dm=0, workers=1, window=4, seed=0, epochs=max_epochs)

model.build_vocab(tagged_data)

model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Nothing is written to disk in this cell, so the message reports training only.
print("Model Trained")

In [None]:
import numpy as np
# Rank reviews per ground-truth movie with the Doc2Vec model.
# NOTE(review): `utils` is not imported in any cell visible in this notebook,
# and the return shape of utils.cosine_distance cannot be seen from here.
movie_review_d2v = {}

# The candidate texts do not depend on the movie, so they are computed once
# instead of once per movie.
candidate_texts = [utils.normalize_text(return_filtered(r)) for r in id_review]

for movie in tqdm(list(ground_truth)):
    m_id = id_rows[movie]
    # str() is required because the record's last field is an int.
    text = ' '.join(str(value) for value in movies_dic[movie])

    text += ' ' + movie

    # A fresh list is passed each time in case the helper mutates its argument.
    movie_review_d2v[m_id] = utils.cosine_distance(model,text,list(candidate_texts),500)

In [None]:
# Report MRR / MAP / has-positive for the Doc2Vec ranking at several cut-offs KK.
# NOTE(review): `utils` is not imported in any cell visible in this notebook;
# confirm the module is available before running this cell.
for KK in [1,2,3,5,10,20,50,500]:
    i = 0
    # The accumulator is named MR (as in the other evaluation cells) so it does
    # not overwrite the MRR() metric function defined earlier in the notebook.
    MAP, MR, hasP = 0,0,0

    for movie in movie_review_d2v:
        if row_ids[movie] not in ground_truth: continue

        i+=1
        preds = [id_review[review_full[f]] for (f,j) in movie_review_d2v[movie]][0:KK]
        golds = list(ground_truth[row_ids[movie]])

        MAP += utils.MAP_K(golds,preds)
        MR += utils.MRR(golds,preds)
        hasP += utils.HAS_POSITIVE(golds,preds)

    print('\n#################### ' + str(KK) + ' ###########################\n')
    print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)