In [None]:
#import transformers
import os
import shutil
from dotenv import load_dotenv
load_dotenv()
import re
import string
import json
import time
import openai
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)
import seaborn as sns

import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
stoplist=stopwords.words('english')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer= WordNetLemmatizer()
from nltk.corpus import wordnet

#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
#nltk.download('omw-1.4')

import gensim
from gensim.corpora import Dictionary
from collections import Counter
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Phrases
from gensim.models import Word2Vec
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import datapath

## Importing gensim related libraries

In [None]:
#path = '../../datasets/test_dataset_model_pipeline/future_statements.csv'
path = '../stage_1_warc_dl/warc_dl_output/future_statements.csv'

# The statements file is pipe-delimited. Malformed rows are reported and skipped
# instead of aborting the load. `on_bad_lines='warn'` is the replacement for
# `error_bad_lines=False` (deprecated in pandas 1.3, removed in pandas 2.0).
df_orig = pd.read_csv(path, sep='|', on_bad_lines='warn')
df_orig.columns

In [None]:
def get_tags(tag):
    """
    Translate a POS tag string into the WordNet POS constant used for lemmatization.

    Tags starting with 'V' map to wordnet.VERB and tags starting with 'R' map to
    wordnet.ADV. Every other tag -- including those starting with 'N' (nouns) and
    'J' (adjectives) -- maps to wordnet.NOUN.
    """
    leading = tag[:1]
    if leading == 'V':
        return wordnet.VERB
    if leading == 'R':
        return wordnet.ADV
    # 'N', 'J' and any unrecognised tag are lemmatized as nouns
    return wordnet.NOUN

def preprocess(text):
    """
    Tokenize one statement and reduce it to lower-cased, lemmatized noun tokens.

    Steps:
    1. Tokenizes the text and drops tokens that are a single punctuation character
    2. Drops tokens of three characters or fewer
    3. Converts the remaining tokens to lowercase
    4. Keeps only tokens whose POS tag contains 'N' followed by at least one more
       word character (e.g. NN, NNS, NNP)
    5. Lemmatizes the kept tokens
    6. Removes stopwords

    Parameters
    ----------
    text : str
        Raw statement text. Any non-string value (for example the NaN pandas
        produces for an empty CSV cell) is treated as an empty statement.

    Returns
    -------
    list of str
        The cleaned tokens; an empty list for non-string input.
    """
    # Empty CSV cells arrive as float NaN, which the tokenizer cannot handle
    if not isinstance(text, str):
        return []

    punctuation = set(string.punctuation)
    doc_tokens = nltk.word_tokenize(text)
    word_tokens = [word.lower() for word in doc_tokens
                   if not (word in punctuation or len(word) <= 3)]

    # Tag first, then keep only the noun-tagged tokens for lemmatization
    tagged_tokens = nltk.pos_tag(word_tokens)
    noun_tokens = [pair for pair in tagged_tokens if re.search(r'(N)\w+', pair[1])]

    doc_words = [wordnet_lemmatizer.lemmatize(word, pos=get_tags(tag))
                 for word, tag in noun_tokens]
    # Stopwords are filtered after lemmatization, i.e. on the lemma
    return [word for word in doc_words if word not in stoplist]

In [None]:
# Clean every statement, then fit the phrase (bigram) detector on the token lists.
df_clean = df_orig['statement'].apply(preprocess)
docs = df_clean.tolist()
phrases = gensim.models.Phrases(
    docs,
    min_count=10,
    threshold=20,
)
# Phraser wraps the fitted Phrases object for applying it to documents
bigram_model = gensim.models.phrases.Phraser(phrases)

In [None]:
def make_bigrams(texts, model=None):
    '''
    Apply a phrase model to every document in `texts`.

    Parameters
    ----------
    texts : iterable
        Tokenized documents.
    model : optional
        Object that is indexed with a document and returns the transformed
        document. When omitted, the notebook-level `bigram_model` is used.

    Returns
    -------
    list
        One transformed document per input document, in input order.
    '''
    # Only fall back to the notebook global when no model is passed explicitly
    phraser = bigram_model if model is None else model
    return [phraser[doc] for doc in texts]

In [None]:
# Form Bigrams
data_words_bigrams = make_bigrams(docs)

# Checkout most frequent bigrams.
# The stopword list is fetched once and held in a set; previously it was
# re-read and scanned linearly for every vocabulary key.
english_stopwords = set(stopwords.words('english'))
bigram_counter1 = Counter()
for key, count in phrases.vocab.items():
    if key in english_stopwords:
        continue
    # keys containing '_' are treated as bigrams
    if len(str(key).split('_')) > 1:
        bigram_counter1[key] += count

#present most common bigrams
for key, counts in bigram_counter1.most_common(15):
    print(key,"->", counts)

In [None]:
def w2vmodel_create(bigram_model, docs, stoplist):
    '''
    Train a Word2Vec model (vector_size=100, sg=1, hs=1) on the bigram-transformed
    documents and count the bigram tokens in its vocabulary.

    A vocabulary key counts as a bigram when it contains an underscore and is
    not in `stoplist`; its value in the counter is the key's "count" attribute
    from the trained model.

    Returns the trained model and the bigram Counter.
    '''
    w2vmodel = Word2Vec(sentences=bigram_model[docs], vector_size=100, sg=1, hs= 1)
    vectors = w2vmodel.wv

    bigram_counter = Counter()
    for token in vectors.key_to_index.keys():
        is_bigram = len(str(token).split("_")) > 1
        if is_bigram and token not in stoplist:
            bigram_counter[token] += vectors.get_vecattr(token, "count")
    return w2vmodel, bigram_counter

In [None]:
# Train Word2Vec on the bigram-transformed documents and collect the bigram counts
w2vmodel, bigram_counter = w2vmodel_create(bigram_model, docs, stoplist)

In [None]:
# Print the 15 most frequent bigrams collected from the Word2Vec vocabulary
for key, counts in bigram_counter.most_common(15):
    print(key,"-> -> " ,counts)

In [None]:
def create_dict_and_corpus(data_words_bigrams, docs):
    '''
    Create the dictionary and bag-of-words corpus used as input to the LDA model.

    The dictionary is built from the bigram-transformed documents, and tokens
    that appear in fewer than 20 documents or in more than 60% of the documents
    are filtered out. The corpus is then encoded from the SAME bigram-transformed
    documents, so detected bigrams are actually represented in the corpus.
    (Previously the corpus was encoded from `docs`, which contains no bigram
    tokens, so every bigram entry in the dictionary was unreachable.)

    Parameters
    ----------
    data_words_bigrams : list of list of str
        Documents after the phrase model has been applied.
    docs : list of list of str
        Documents before the phrase model was applied. Kept so existing calls
        continue to work; it is no longer used to build the corpus.

    Returns
    -------
    (dictionary, corpus)
        The filtered gensim Dictionary and one bag-of-words vector per document.
    '''
    dictionary = Dictionary(data_words_bigrams)
    print('Number of unique tokens (before filter): %d' % len(dictionary))

    # Filter out words that occur less than x documents, or more than y% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.6)
    corpus = [dictionary.doc2bow(doc) for doc in data_words_bigrams]

    print('Number of unique tokens (after filter): %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))

    return dictionary, corpus

In [None]:
# Build the filtered dictionary and the bag-of-words corpus for the LDA model
dictionary, corpus = create_dict_and_corpus(data_words_bigrams, docs)

In [None]:
def create_ldamodel(dictionary, num_topics, passes, ldamodel_file, load_model,
                    bow_corpus=None, random_state=None):
    '''
    Create a new LDA topic model, or load a previously saved one.

    Parameters
    ----------
    dictionary : gensim Dictionary
        Passed to the model as `id2word`.
    num_topics : int
        Number of topics to train.
    passes : int
        Number of passes through the corpus during training.
    ldamodel_file : str
        Directory holding the saved model (stored inside it under the name 'lda').
        When a model is trained, this directory is DELETED and recreated first.
    load_model : bool
        If True and a saved model exists, it is loaded instead of trained.
    bow_corpus : optional
        Bag-of-words corpus to train on. When omitted, the notebook-level
        `corpus` variable is used.
    random_state : optional
        Forwarded to LdaMulticore; None keeps the library default.

    Returns
    -------
    The loaded or newly trained LDA model.
    '''
    model_path = os.path.join(ldamodel_file, 'lda')

    if load_model and os.path.exists(model_path):
        print('Load model...')
        return LdaModel.load(model_path)

    print('Create model...')
    # Resolve the training data BEFORE wiping the output directory, so a missing
    # corpus cannot destroy an existing saved model.
    training_corpus = corpus if bow_corpus is None else bow_corpus

    if os.path.exists(ldamodel_file):
        shutil.rmtree(ldamodel_file)
    # makedirs (rather than mkdir) also supports nested output paths
    os.makedirs(ldamodel_file)

    t0 = time.time()
    ldamodel = LdaMulticore(training_corpus,
                            id2word=dictionary,
                            num_topics=num_topics,
                            alpha='asymmetric',
                            chunksize=4000,
                            batch=True,
                            minimum_probability=0.001,
                            iterations=350,
                            passes=passes,
                            random_state=random_state
                            )
    ldamodel.save(model_path)
    t1 = time.time()
    # The reported time covers training and saving
    print("...time for",passes," passes: ",(t1-t0)," seconds")
    return ldamodel

In [None]:
# Hyper-parameters for the multicore LDA run
num_topics = 10
passes = 150  # number of passes through the corpus during training
np.random.seed(1)

# Directory in which the trained model is stored
ldamodel_file = "ldamodel"
ldamodel = create_ldamodel(dictionary, num_topics, passes, ldamodel_file, load_model=False)

In [None]:
def topics_subtopics_dict(model=None, n_topics=None, num_words=20):
    '''
    Build topic dictionaries with the topic index as key and its subtopics as values.

    Parameters
    ----------
    model : optional
        Topic model exposing `show_topics(num_topics, num_words, formatted)`.
        When omitted, the notebook-level `ldamodel` is used.
    n_topics : int, optional
        Number of topics to collect. When omitted, the notebook-level
        `num_topics` is used.
    num_words : int
        Number of terms requested per topic (default 20).

    Returns
    -------
    (d_topics, d_topics_clean)
        d_topics maps topic index -> the term entries returned by the model;
        d_topics_clean maps topic index -> only the first element of each
        entry (the term itself).
    '''
    lda = ldamodel if model is None else model
    topic_count = num_topics if n_topics is None else n_topics

    # Query the model once, explicitly asking for ALL topics (num_topics=-1).
    # The library default returns only 10 topics, which raised an IndexError
    # as soon as more than 10 topics were configured.
    shown = lda.show_topics(num_topics=-1, num_words=num_words, formatted=False)

    d_topics = {}
    d_topics_clean = {}
    for index in range(topic_count):
        terms = shown[index][1]
        d_topics[index] = terms
        d_topics_clean[index] = [item[0] for item in terms]
    return d_topics, d_topics_clean

In [None]:
# Both dictionaries come straight from the helper; no pre-initialisation is needed
d_topics, d_topics_clean = topics_subtopics_dict()

In [None]:
def set_major_lda_topic(ldamodel, corpus, df_orig):
    '''
    Assign every document the topic with the highest probability.

    Parameters
    ----------
    ldamodel : LDA model providing `get_document_topics` and `num_topics`
    corpus : bag-of-words corpus, one entry per row of `df_orig`
    df_orig : pandas.DataFrame
        Modified IN PLACE: the column 'major_lda_topic' is added or overwritten.

    Returns
    -------
    pandas.Series
        The 'major_lda_topic' column of `df_orig`.
    '''
    all_topics = ldamodel.get_document_topics(corpus)

    # Fix the matrix height to the model's topic count so the shape does not
    # depend on which topics happen to appear in the per-document output.
    topics_by_doc = gensim.matutils.corpus2csc(all_topics, num_terms=ldamodel.num_topics)
    # Transpose to one row per document, one column per topic
    doc_topic_matrix = topics_by_doc.T.toarray()

    df_orig['major_lda_topic'] = doc_topic_matrix.argmax(axis=1)
    return df_orig['major_lda_topic']

def plot_topics_dist(df_orig):
    '''
    Draw a bar chart of how many statements fall under each major LDA topic.

    Reads the 'major_lda_topic' column of `df_orig`; sets the seaborn figure
    size to (20, 5) and the 'darkgrid' style before plotting.
    '''
    figure_options = {'figure.figsize': (20,5)}
    sns.set(rc=figure_options)
    sns.set_style('darkgrid')

    topic_counts = df_orig['major_lda_topic'].value_counts()
    topic_counts.plot(kind='bar')

In [None]:
# set_major_lda_topic writes the 'major_lda_topic' column into df_orig and returns it;
# the assignment stores the returned column under the same name again.
df_orig['major_lda_topic'] = set_major_lda_topic(ldamodel, corpus, df_orig)
# Bar chart of the number of statements per major topic
plot_topics_dist(df_orig)

In [None]:
def create_keywords_openai(d_topics_clean):
    '''
    Create the keyword strings used in the openai prompt.

    Parameters
    ----------
    d_topics_clean : dict
        Maps a topic key to its list of subtopic strings.

    Returns
    -------
    dict
        Maps each topic key to its subtopics joined by ', ' (an empty string
        for an empty list).
    '''
    # Look the subtopics up by key rather than by position in values(), so the
    # keys do not have to be the integers 0..n-1 in insertion order.
    return {key: ', '.join(subtopics) for key, subtopics in d_topics_clean.items()}

In [None]:
# One comma-separated keyword string per topic, for use in the openai prompt
keywords_openai = create_keywords_openai(d_topics_clean)

In [None]:
def intersection(lst1, lst2):
    '''
    Return the elements of lst1 that also occur in lst2.

    The result is a list that keeps the order of lst1, including repeated elements.
    '''
    shared = []
    for candidate in lst1:
        if candidate in lst2:
            shared.append(candidate)
    return shared

def eval_intersections(d_topics_clean):
    '''
    Print the overlap between every pair of topic clusters.

    Topics are addressed by the integer keys 0..len-1. For each pair the
    function prints both topic sizes, the size of the overlap, and the
    overlapping subtopics themselves.
    '''
    topic_total = len(d_topics_clean)
    for key_one in range(topic_total - 1):
        first = d_topics_clean[key_one]
        for key_two in range(key_one + 1, topic_total):
            second = d_topics_clean[key_two]
            inter = intersection(sorted(first), sorted(second))
            summary = (key_one, len(first), key_two, len(second), len(inter))
            print('Intersect of topic %d [%d] and topic %d [%d]: %d' % summary)
            print('-> %s'%inter)

In [None]:
# Print the pairwise subtopic overlap between all topic clusters
eval_intersections(d_topics_clean)

In [None]:
def _parse_topic_candidates(raw_text):
    '''Split a raw completion text into stripped, non-empty candidate topic names.'''
    # Enumeration markers such as "1." or "12." and line breaks both act as
    # separators. The dot is escaped: the previous pattern `[\d].` matched a
    # digit followed by ANY character, and newlines were deleted so that
    # separate lines were glued together.
    separated = re.sub(r'\d+\.', ',', raw_text).replace('\n', ',')
    return [part.strip() for part in separated.split(',') if part.strip()]


def topic_name_generator(keywords_openai, d_topics_clean, oai_api_key):
    '''
    Generate topic names with openai's text-davinci-002, based on the subtopics
    that are clustered by lda.

    Parameters
    ----------
    keywords_openai : dict
        Maps a topic key to the comma-separated keyword string used in the prompt.
    d_topics_clean : dict
        Maps the same topic keys to their list of subtopics.
    oai_api_key : str
        API key assigned to `openai.api_key`.

    Returns
    -------
    dict
        Maps each generated topic name to that topic's list of subtopics.
        If the model returns no usable name, 'topic_<key>' is used. If a name
        was already generated for an earlier topic, ' (<key>)' is appended so
        the earlier topic is not overwritten.
    '''
    openai.api_key = oai_api_key

    d_topics_subtopics = {}

    # Alternative prompts that were tried:
    #   "Extract categories from this list:\n%s"
    #   "Best matching category of a maximum of two words for the following keywords %s"
    #   "One best matching headline of a maximum of two words like 'word1 word2' for the following keywords %s"
    # NOTE(review): openai.Completion looks like the pre-1.0 client interface -- confirm
    # against the installed openai version.
    for key, keywords in keywords_openai.items():
        response = openai.Completion.create(
            model = "text-davinci-002",
            prompt = "One topic of a maximum of two words for the following keywords without using the keywords: %s"%keywords,
            temperature=0.3,
            max_tokens=256,
            top_p=1.0,
            frequency_penalty=0.8,
            presence_penalty=0.0,
            stop=["\"\"\""]
        )
        candidates = _parse_topic_candidates(response['choices'][0]['text'])

        for i in candidates:
            print('--', i)
        print('(%s)'%keywords)
        print('-------------')
        print('-------------')

        # An empty completion must not abort the whole run
        topic_name = candidates[0] if candidates else 'topic_%s' % key
        # Two topics may receive the same name; keep both instead of overwriting
        if topic_name in d_topics_subtopics:
            topic_name = '%s (%s)' % (topic_name, key)
        d_topics_subtopics[topic_name] = d_topics_clean[key]
    return d_topics_subtopics

In [None]:
# The OpenAI API key is read from the OPENAI_API_KEY environment variable
# (load_dotenv() is called in the import cell). Do not paste the key into the notebook.
oai_api_key = os.getenv("OPENAI_API_KEY")
# Ask the model for one topic name per LDA topic
d_topics_subtopics = topic_name_generator(keywords_openai, d_topics_clean, oai_api_key)

In [None]:
def save_subtopics_as_list(d_topics_subtopics, path=r'subtopics.txt'):
    '''
    Write all subtopics of all topics to a text file as one comma-separated line.

    Parameters
    ----------
    d_topics_subtopics : dict
        Maps a topic name to its list of subtopic strings.
    path : str
        Output file; defaults to 'subtopics.txt' in the working directory.
        An existing file is overwritten.
    '''
    # Flatten in dictionary order; iterating values() directly avoids rebuilding
    # the value list on every loop iteration.
    merged = [subtopic
              for subtopics in d_topics_subtopics.values()
              for subtopic in subtopics]

    # Explicit encoding so the file does not depend on the platform default
    with open(path, 'w', encoding='utf-8') as fp:
        fp.write(','.join(merged))

In [None]:
# Persist all subtopics as a single comma-separated line in subtopics.txt
save_subtopics_as_list(d_topics_subtopics)