In [None]:
import os
import io
import json
import time
import math
import string 
import pickle
import datetime
import itertools
import numpy as np
import pandas as pd
from pprint import pprint 
from tqdm.notebook import tqdm
from collections import Counter

import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix as sparse_matrix

from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import SGDClassifier, SGDRegressor, LogisticRegression, LinearRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import RegexpTokenizer, word_tokenize  

from gensim.test import utils
from gensim.models import KeyedVectors, nmf
from gensim.corpora.dictionary import Dictionary
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.parsing.preprocessing import preprocess_documents
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sentence_transformers import models, SentenceTransformer

In [None]:
import os

# Pin the CUDA-related environment variables for this kernel in one step.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "3",
})

In [None]:
# Talk table (one row per talk; later cells read its id, title and related_videos columns).
df_talks = pd.read_csv(filepath_or_buffer='../Data/talks_data.csv')
# NOTE(review): every other input lives under ../Data/ — confirm this path is intended.
df_users = pd.read_csv(filepath_or_buffer='../users_data.csv')

In [None]:
# Preview the first two rows of the talks table.
df_talks.head(n=2)

In [None]:
# Preview the first two rows of the users table.
df_users.head(n=2)

In [None]:
def _load_inverted_pickle(path):
    """Load a pickled dict from `path` and return it with keys and values swapped.

    The file is opened in a `with` block so the handle is closed after loading.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as fh:
        return {value: key for key, value in pickle.load(fh).items()}


df_train = pd.read_csv('../Data/TED_train.csv')
df_test  = pd.read_csv('../Data/TED_test.csv')
talks_ids = _load_inverted_pickle('../Data/dict_talks_idx.pickle')
users_ids = _load_inverted_pickle('../Data/dict_users_idx.pickle')

# Translate the raw id columns through the inverted lookup tables. Plain dict
# indexing is kept on purpose: an id missing from the table raises KeyError
# instead of silently producing NaN.
for df_split in (df_train, df_test):
    df_split['user'] = df_split['user_id'].apply(lambda u: users_ids[u])
    df_split['talk'] = df_split['talk_id'].apply(lambda t: talks_ids[t])

In [None]:
# Every entry under the embeddings root that is not a shell script counts as a dataset.
datasets = [
    entry
    for entry in os.listdir('/data/ted_kg_embeddings')
    if not entry.endswith('.sh')
]
datasets

In [None]:
# Embedding models to evaluate; each maps to a {dataset: matrix} dict filled in later.
representations = {'transd': {}}

In [None]:
# For each representation, read every dataset's entity embeddings and keep the
# vectors of the talks in df_talks, in df_talks row order.
for rep in representations:
    representations[rep] = {}
    for dataset in tqdm(datasets):
        try:
            emb_dir = '/data/ted_kg_embeddings/' + dataset + '/embeddings/' + rep
            if not os.path.exists(emb_dir + '/ent_embedding.tsv'):
                print('Embeddings not computed yet for', dataset)
                continue

            vectors = pd.read_csv(emb_dir + '/ent_embedding.tsv', sep='\t', header=None)
            labels = pd.read_csv(emb_dir + '/ent_labels.tsv', sep='\t', header=None)

            # Stack label column(s) in front of the vector columns so that each
            # row starts with the entity label used as the lookup key.
            labelled_rows = pd.DataFrame(np.hstack([labels.values, vectors.values])).values
            entity_vectors = {row[0]: row[1:] for row in labelled_rows}

            representations[rep][dataset] = np.array(
                [entity_vectors[talk_hash] for talk_hash in df_talks.id.values]
            )
            print(rep, representations[rep][dataset].shape)
        except Exception as e:
            # Best effort: report the failing dataset and move on to the next one.
            print('Problem with', rep, '-', dataset, ':', str(e))

In [None]:
# Number of datasets for which 'transd' embeddings were loaded.
len(representations['transd'].keys())

In [None]:
# Lookup tables between talk ids ("hashes"), titles and row positions in df_talks.
# The reverse maps are built straight from the frame rather than by inverting the
# forward maps: if two rows share a title (or id), inverting would drop row
# positions and a later idx2title[row] lookup would raise KeyError.
talk2idx = {v:i for i,v in enumerate(df_talks.id)}
idx2talk = dict(enumerate(df_talks.id))
title2idx = {v:i for i,v in enumerate(df_talks.title)}
idx2title = dict(enumerate(df_talks.title))
title2hash = {t:h for t,h in df_talks[['title', 'id']].values}
hash2title = {h:t for t,h in df_talks[['title', 'id']].values}

In [None]:
# Collect, per talk, the users that appear with it in the training split, and
# remember which talks have fewer than two such users.
talks_likes = {}
less_than_two = []
for talk_idx, e in df_train.groupby('talk'):
    likers = e['user'].values.tolist()
    talks_likes[talk_idx] = likers
    if len(likers) < 2:
        less_than_two.append(talk_idx)

# The previous filter compared an int length to '' (always True), so the value
# shown was simply the number of talks; display that directly.
len(talks_likes)

In [None]:
# Merge the test-split users into talks_likes (extending talks already present).
# NOTE(review): this cell is not idempotent — re-running it extends the lists again.
# NOTE(review): talks added to less_than_two by the training pass stay there even
# if the merge brings them to two or more users — confirm that is intended.
for talk_idx, e in df_test.groupby('talk'):
    test_likers = e['user'].values.tolist()
    if talk_idx in talks_likes:
        talks_likes[talk_idx].extend(test_likers)
    else:
        talks_likes[talk_idx] = test_likers

    if len(talks_likes[talk_idx]) < 2:
        less_than_two.append(talk_idx)

# The previous filter compared an int length to '' (always True), so the value
# shown was simply the number of talks; display that directly.
len(talks_likes)

In [None]:
# Map each talk title to its ';'-separated list of related video titles.
related_talks = {}
for t, rv in df_talks[['title', 'related_videos']].values:
    if isinstance(rv, str):
        related_talks[t] = rv.split(';')
    else:
        # Missing value (e.g. NaN read from the CSV): report the title and
        # leave the talk out, instead of hiding every error behind a bare except.
        print(t)

# Dataset 2 Evaluation:

In [None]:
# Item-to-item rankings: for every talk, all talk indices sorted by decreasing
# cosine similarity of their embeddings.
predictions = {}

for rep in representations:
    predictions[rep] = {}
    for dataset in tqdm(representations[rep]):
        talk_features = representations[rep][dataset]
        sim_matrix = cosine_similarity(talk_features, talk_features)

        ranked_by_talk = {}
        for row_idx, sim_row in enumerate(sim_matrix):
            ranked_by_talk[row_idx] = sim_row.argsort()[::-1]
        predictions[rep][dataset] = ranked_by_talk

In [None]:
# Score the item-to-item rankings against each talk's list of related talks.
# Per talk, every metric is summed over its related talks (a related talk only
# contributes when ranked within the top K) and divided by the value obtained
# if all related talks occupied the first ranks. Per-talk scores are averaged.
metrics = {}
K = 10

for rep in tqdm(representations):
    metrics[rep] = {}
    for dataset in tqdm(representations[rep]):
        metrics[rep][dataset] = {'hitrate@10': [], 'mmr@10': [], 'ndcg@10': []}
        rankings = predictions[rep][dataset]

        # Size the loop from this dataset's own rankings instead of relying on
        # the `sim_matrix` variable left over from the last iteration of the
        # cell that built `predictions`.
        for talk_idx in range(len(rankings)):
            talk_title = idx2title[talk_idx]
            if talk_title not in related_talks:
                continue

            try:
                user_recs = rankings[talk_idx][1:] # Item 0 is the talk itself, sim == 1
                related = related_talks[talk_title]

                # Best achievable totals for this talk.
                norm_hitrate = len(related)
                norm_mmr     = sum(1/(r+1) for r in range(len(related)))
                norm_ndcg    = sum(math.log(2)/math.log(r+2) for r in range(len(related)))

                hitrate = 0
                mmr = 0
                ndcg = 0

                for rec_title in related:
                    rec_idx = title2idx[rec_title]
                    # 1-based position of the related talk in the ranking.
                    rec_rank = np.where(user_recs==rec_idx)[0][0] + 1

                    if rec_rank <= K:
                        hitrate += 1
                        mmr     += 1/rec_rank
                        ndcg    += math.log(2)/math.log(1+rec_rank)

                metrics[rep][dataset]['hitrate@10'].append(hitrate/norm_hitrate)
                metrics[rep][dataset]['mmr@10'].append(mmr/norm_mmr)
                metrics[rep][dataset]['ndcg@10'].append(ndcg/norm_ndcg)
            except Exception as e:
                # Best effort: e.g. a related title that is not in title2idx.
                print(talk_idx, rep, str(e))

        # Collapse each per-talk list into its mean.
        for metric in metrics[rep][dataset]:
            metrics[rep][dataset][metric] = np.mean(metrics[rep][dataset][metric])

In [None]:
# Report every averaged metric, grouped by representation and dataset.
for rep, per_dataset in metrics.items():
    for dataset, scores in per_dataset.items():
        print(rep.upper(), '-', dataset)
        for metric, value in scores.items():
            print('    ' + metric.upper() + ':', round(value, 4))
        print('')

# Combining Representations

In [None]:
# Disabled experiment: the code below is wrapped in a string literal, so none of
# it runs; the only statement with an effect in this cell is the final print('').
# The disabled code sums cosine-similarity matrices over several representations
# and scores the combined ranking with the same hitrate/mmr/ndcg procedure.
# NOTE(review): it indexes representations[rep] and metrics[rep] directly, while
# the active cells store values under [rep][dataset] — adapt before re-enabling.
"""combo_sim = []
for rep in ['transe', 'transd', 'transh', 'transm', 'transr']:
    talk_features = representations[rep]
    if len(combo_sim) == 0:
        combo_sim = cosine_similarity(talk_features, talk_features)
    else:
        combo_sim += cosine_similarity(talk_features, talk_features)

combo_1 = {idx: combo_sim[idx].argsort()[::-1] for idx in range(len(sim_matrix))}

rep = 'combo_1'
metrics[rep] = {'hitrate@10': [], 'mmr@10': [], 'ndcg@10': []}

for talk_idx in range((len(sim_matrix))):
    talk_title = idx2title[talk_idx]
    if talk_title not in related_talks:
        continue

    try:
        user_recs = combo_1[talk_idx][1:] # Item 0 is the talk itself, sim == 1
        norm_hitrate = sum(1 for _ in related_talks[talk_title])
        norm_mmr     = sum(1/(r+1) for r, _ in enumerate(related_talks[talk_title]))
        norm_ndcg    = sum(math.log(2)/math.log(r+2) for r, _ in enumerate(related_talks[talk_title]))

        hitrate = 0
        mmr = 0
        ndcg = 0

        for rec_title in related_talks[talk_title]:
            rec_idx = title2idx[rec_title]
            rec_rank = np.where(user_recs==rec_idx)[0][0] + 1
            # print(user_recs)

            hitrate += 0 if rec_rank > K else 1
            mmr     += 0 if rec_rank > K else 1/rec_rank
            ndcg    += 0 if rec_rank > K else math.log(2)/math.log(1+rec_rank)

        metrics[rep]['hitrate@10'].append(hitrate/norm_hitrate)
        metrics[rep]['mmr@10'].append(mmr/norm_mmr)
        metrics[rep]['ndcg@10'].append(ndcg/norm_ndcg)
    except:
        print(talk_idx)

metrics[rep]['hitrate@10'] = np.mean(metrics[rep]['hitrate@10'])
metrics[rep]['mmr@10'] = np.mean(metrics[rep]['mmr@10'])
metrics[rep]['ndcg@10'] = np.mean(metrics[rep]['ndcg@10'])

print(rep.upper())
for metric in metrics[rep]:
    print('    ' + metric.upper() + ':', round(metrics[rep][metric], 4))
"""
print('')

# Dataset 1 Evaluation:

In [None]:
# Per-user talk lists for each split.
train_dataset = {
    user: rows['talk'].values.tolist()
    for user, rows in tqdm(df_train.groupby('user'))
}
test_dataset = {
    user: rows['talk'].values.tolist()
    for user, rows in tqdm(df_test.groupby('user'))
}

# Dense positions for the test users, assigned in sorted order of user key.
user2idx = {u: i for i, u in enumerate(sorted(test_dataset))}
idx2user = dict(enumerate(sorted(test_dataset)))

In [None]:
# Sanity check: shape of the talk-embedding matrix for one 'transd' dataset.
(
    representations['transd']
    ['metadata_ner_min10_none_none']
    .shape
)

In [None]:
# Represent each test user by the mean embedding of the talks in their training
# list. Vectors are appended in sorted-user order, matching user2idx positions.
user_vectors = {}
for rep in representations:
    user_vectors[rep] = {}
    for dataset in representations[rep]:
        try:
            features = representations[rep][dataset]
            user_vectors[rep][dataset] = []
            progress = tqdm(sorted(user2idx), desc=dataset + ' - ' + rep)
            for user in progress:
                liked_rows = [features[talk2idx[talk]] for talk in train_dataset[user]]
                user_vectors[rep][dataset].append(np.mean(liked_rows, axis=0))
        except Exception as e:
            # A failure leaves this dataset's list partially filled.
            print('Problem with', rep, str(e))

In [None]:
# User-to-talk rankings: for every test user, all talk indices ordered by
# decreasing cosine similarity between the user vector and the talk embeddings.
similarities = {}
for rep in representations:
    similarities[rep] = {}
    for dataset in tqdm(representations[rep]):
        simmat = cosine_similarity(user_vectors[rep][dataset], representations[rep][dataset])
        similarities[rep][dataset] = {
            user: np.argsort(simmat[user2idx[user]])[::-1]
            for user in test_dataset
        }

In [None]:
# Held-out evaluation: for each test user, drop the talks from their training
# list out of the ranking and score the position of their first test talk.
# Uses K as defined in the earlier metrics cell.
metrics_1 = {}
for rep in representations:
    metrics_1[rep] = {}
    for dataset in tqdm(representations[rep]):
        metrics_1[rep][dataset] = {'hitrate@10': [], 'mmr@10': [], 'ndcg@10': []}
        for user in test_dataset:
            already_seen = [talk2idx[t] for t in train_dataset[user]]
            one_out      = talk2idx[test_dataset[user][0]]
            ranking      = similarities[rep][dataset][user]

            # Locate the already-seen talks inside THIS user's ranking. The mask
            # was previously computed on the whole per-user dict, so nothing was
            # ever removed and seen talks kept occupying top positions.
            already_seen_idx = np.where(np.isin(ranking, already_seen))
            user_recs    = np.delete(ranking, already_seen_idx)

            # Assumes the held-out talk is not also in the user's training list;
            # otherwise it has been filtered out and this lookup raises IndexError.
            one_out_rank = np.where(user_recs==one_out)[0][0] + 1

            metrics_1[rep][dataset]['hitrate@10'].append(int(one_out_rank <= K))
            metrics_1[rep][dataset]['mmr@10'].append(0 if one_out_rank > K else 1/one_out_rank)
            metrics_1[rep][dataset]['ndcg@10'].append(0 if one_out_rank > K else math.log(2)/math.log(1+one_out_rank))

        # Collapse each per-user list into its mean.
        metrics_1[rep][dataset]['hitrate@10'] = np.mean(metrics_1[rep][dataset]['hitrate@10'])
        metrics_1[rep][dataset]['mmr@10'] = np.mean(metrics_1[rep][dataset]['mmr@10'])
        metrics_1[rep][dataset]['ndcg@10'] = np.mean(metrics_1[rep][dataset]['ndcg@10'])

In [None]:
# Report the held-out metrics, grouped by representation and dataset.
for rep, per_dataset in metrics_1.items():
    for dataset, scores in per_dataset.items():
        print(rep.upper(), '-', dataset)
        for metric, value in scores.items():
            print('    ' + metric.upper() + ':', round(value, 4))
        print('')

In [None]:
# Disabled experiment: the code below is wrapped in a string literal, so none of
# it runs; the only statement with an effect in this cell is the final print('').
# The disabled code sums user-to-talk cosine similarities over the listed
# representations and scores the held-out talk with hitrate/mmr/ndcg.
# NOTE(review): it indexes representations[rep] and user_vectors[rep] directly,
# while the active cells store values under [rep][dataset], and it writes into
# `metrics` rather than `metrics_1` — adapt before re-enabling.
"""combo = {}

combo_sim = []
for rep in [ 'transd']:
    talk_features = representations[rep]
    user_features = user_vectors[rep]
    if len(combo_sim) == 0:
        combo_sim = cosine_similarity(user_features, talk_features)
    else:
        combo_sim += cosine_similarity(user_features, talk_features)

combo_1 = {user: combo_sim[user2idx[user]].argsort()[::-1] for user in test_dataset}

rep = 'combo_2'
metrics[rep] = {'hitrate@10': [], 'mmr@10': [], 'ndcg@10': []}

for user in test_dataset:
    # user_idx  = user2idx[user]
    already_seen = [talk2idx[t] for t in train_dataset[user]]
    already_seen_idx = np.where(np.isin(combo_1[user], already_seen))
    user_recs = np.delete(combo_1[user], already_seen_idx)
    one_out   = talk2idx[test_dataset[user][0]]
    one_out_rank = np.where(user_recs==one_out)[0][0] + 1

    metrics[rep]['hitrate@10'].append(int(one_out_rank <= K))
    metrics[rep]['mmr@10'].append(0 if one_out_rank > K else 1/one_out_rank)
    metrics[rep]['ndcg@10'].append(0 if one_out_rank > K else math.log(2)/math.log(1+one_out_rank))

metrics[rep]['hitrate@10'] = np.mean(metrics[rep]['hitrate@10'])
metrics[rep]['mmr@10'] = np.mean(metrics[rep]['mmr@10'])
metrics[rep]['ndcg@10'] = np.mean(metrics[rep]['ndcg@10'])

print(rep.upper())
for metric in metrics[rep]:
    print('    ' + metric.upper() + ':', round(metrics[rep][metric], 4))
"""
print('')