In [None]:
import os
import io
import json
import time
import math
import string 
import pickle
import datetime
import itertools
import numpy as np
import pandas as pd
from pprint import pprint 
from tqdm.notebook import tqdm
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
# English stop words from NLTK, held in a set for fast membership tests.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

In [None]:
# Load the talks table from disk and preview its first rows.
talks_csv_path = '../Data/talks_data.csv'
df_talks = pd.read_csv(talks_csv_path)

df_talks.head()

In [None]:
# Build one text field per talk from its transcript, description and title.
# Missing values are replaced with '' first: adding NaN to a string yields NaN,
# which would turn the whole `text` value of that talk into NaN and break the
# string processing later applied to `df_talks.text`.
df_talks['text'] = (
    df_talks['transcript'].fillna('')
    + ' ' + df_talks['description'].fillna('')
    + ' ' + df_talks['title'].fillna('')
)

In [None]:
# Scratch check of NLTK's word_tokenize on a short sample sentence; `l` is not
# referenced by any other cell visible in this notebook.
l = word_tokenize('hello darkness my old friend')
l 

In [None]:
# Dataset
docs = [s.translate(str.maketrans('', '', string.punctuation)).split() for s in tqdm(df_talks.text)]
docs = [[w.lower() for w in doc if w.lower() not in stop_words and len(w) > 3 and not w.endswith(':') and not w.endswith(')')] for doc in docs]

len(docs)

In [None]:
# Corpus-wide token frequencies, counted over the flattened token lists.
word_counter = Counter(itertools.chain.from_iterable(docs))

In [None]:
# The 512 most common tokens across all talks, as a set for fast lookups.
most_frequent_words = {word for word, _count in word_counter.most_common(512)}

In [None]:
# Drop the most frequent tokens from every document, keeping token order.
docs_filtered = [
    list(itertools.filterfalse(most_frequent_words.__contains__, doc))
    for doc in docs
]

In [None]:
# Displays 1% of the number of distinct tokens counted in `word_counter`.
# NOTE(review): the value is only displayed, not stored — presumably a reference
# point for the 512-word cutoff used above; confirm.
len(word_counter) * 0.01

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(docs_filtered)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in docs_filtered]

In [None]:
# Number of topics; also used later when naming the exported files.
num_topics = 40

# Fit the LDA topic model on the bag-of-words corpus with a fixed seed.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=42,
    passes=50,  # 30 is too good
    alpha='auto',
    eta='auto',
    per_word_topics=True,
)

In [None]:
# Print the 8 highest-weighted words of every topic, one topic per line.
# The topic count comes from `num_topics` rather than a hard-coded 40, so the
# listing keeps covering all topics when the model size is changed.
# `formatted=False` returns (word, probability) pairs directly, which avoids
# re-parsing the '0.012*"word" + ...' display string.
for topic_id, word_probs in lda_model.show_topics(num_topics, num_words=8, formatted=False):
    print('Topic', topic_id, end=': ')
    print(', '.join(word for word, _prob in word_probs))

In [None]:
# Topic assignments for every document of the corpus, in corpus order.
topic_per_talk = lda_model.get_document_topics(bow=corpus)

In [None]:
# Peek at the topics inferred for the first document of the corpus.
topic_per_talk[0]

In [None]:
def write_topic_triples(path, talk_ids, talk_topics, cutoff):
    """Write tab-separated `talk / hasTopic / topic_<k>` triples to a text file.

    One line is written for every (talk, topic) pair whose topic weight is at
    least `cutoff`. The file at `path` is overwritten if it already exists.

    Parameters
    ----------
    path : str
        Destination file.
    talk_ids : iterable
        Talk identifiers, in the same order as `talk_topics`.
    talk_topics : iterable
        For each talk, an iterable of (topic, weight) pairs.
    cutoff : float
        Minimum weight a topic needs in order to be written for a talk.
    """
    with open(path, 'w') as f:
        for talk, topics in zip(talk_ids, talk_topics):
            for topic, perc in topics:
                if perc >= cutoff:
                    f.write(f'{talk}\thasTopic\ttopic_{topic}\n')


# Export one triple file per cutoff. The file name embeds str(cutoff_perc) and
# the number of topics, e.g. 'ted-tm-min0.03_n40.txt' for cutoff 0.03 with 40 topics.
for cutoff_perc in (0.03, 0.3):
    write_topic_triples(
        '../kg_embeddings/metadata-interactions-tm/ted-tm-min' + str(cutoff_perc) + '_n' + str(num_topics) + '.txt',
        df_talks['id'].values,
        topic_per_talk,
        cutoff_perc,
    )

In [None]:
for cutoff in ['0.3', '0.03']:
    ! cat ../kg_embeddings/metadata-interactions-tm/ted-all-data-test.txt ../kg_embeddings/metadata-interactions-tm/ted-tm-min{cutoff}_n{num_topics}.txt > ../kg_embeddings/metadata-interactions-tm/ted-all-tm-{cutoff}-test.txt
    ! cat ../kg_embeddings/metadata-interactions-tm/ted-all-data-train.txt ../kg_embeddings/metadata-interactions-tm/ted-tm-min{cutoff}_n{num_topics}.txt > ../kg_embeddings/metadata-interactions-tm/ted-all-tm-{cutoff}-train.txt
    ! cat ../kg_embeddings/metadata-interactions-tm/ted-all-data-valid.txt ../kg_embeddings/metadata-interactions-tm/ted-tm-min{cutoff}_n{num_topics}.txt > ../kg_embeddings/metadata-interactions-tm/ted-all-tm-{cutoff}-valid.txt