# Exc 2

Inspired by
https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [1]:
import functions_and_variables as fs
import os
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import nltk
from nltk import word_tokenize
from nltk.stem.porter import *
from nltk.corpus import stopwords

from pprint import pprint

import pyLDAvis.gensim
import pickle 
import pyLDAvis

In [2]:
def concat_categories_to_file(file_path, file_name, export_path):
    """Group the tweets of one data file by category and export the grouping as CSV.

    Reads ``<file_path>/<file_name>.txt`` as a ';'-separated table, taking the
    first column as the tweet text and the second column as its category.

    Args:
        file_path: Directory that contains the input ``.txt`` file.
        file_name: File name without extension; reused for the exported ``.csv``.
        export_path: Directory the CSV is written to; created if it is missing.

    Returns:
        dict mapping each category to the list of its tweets, in file order.
    """
    # NOTE(review): read_csv treats the first line as a header row by default.
    # If the data files carry no header line, the first tweet is silently
    # dropped -- confirm against the data files.
    data = pd.read_csv("{}/{}.txt".format(file_path, file_name), sep=';')

    category_tweets = {}
    for category, tweet in zip(data.iloc[:, 1], data.iloc[:, 0]):
        category_tweets.setdefault(category, []).append(tweet)

    # to_csv does not create missing directories, so make sure the target exists.
    if export_path:
        os.makedirs(export_path, exist_ok=True)

    result = pd.DataFrame(list(category_tweets.items()),
                          columns=['Category', 'Concatenated_Tweets'])
    result.to_csv('{}/{}.csv'.format(export_path, file_name), index=False, encoding='utf-8')
    print('Dataframes [key: sentences] for {} were successfully created and stored as .csv'.format(file_name))
    return category_tweets

# Export the per-category grouping for every data split. Only the grouping of
# the full data set ('complete') is kept in memory for the cells below.
for file in ['test', 'val', 'train', 'complete']:
    grouped = concat_categories_to_file('data', file, 'categories')
    if file == 'complete':
        dataframe_categories = grouped

# Re-key the dict in sorted category order so later loops iterate alphabetically.
dataframe_categories = dict(sorted(dataframe_categories.items()))
categories = dataframe_categories.keys()

Dataframes [key: sentences] for test were successfully created and stored as .csv
Dataframes [key: sentences] for val were successfully created and stored as .csv
Dataframes [key: sentences] for train were successfully created and stored as .csv
Dataframes [key: sentences] for complete were successfully created and stored as .csv


In [3]:
# Perform Stemming and remove Stopwords
def preprocess(sentences: list):
    """Tokenize the sentences, drop stop words and stem the remaining tokens.

    Args:
        sentences: Iterable of raw sentence strings.

    Returns:
        One flat list with the stemmed tokens of all sentences, in input order
        (sentence boundaries are not preserved).
    """
    # The stop-word collection and the stemmer do not depend on the sentence,
    # so build them once; a set keeps the per-token membership test cheap.
    stop_words = set(stopwords.words('english'))
    stop_words.update(['im', 'ive', 'dont', 'cant'])
    stemmer = PorterStemmer()

    cleaned = []
    for sentence in sentences:
        cleaned.extend(
            stemmer.stem(token)
            for token in word_tokenize(sentence)
            # Stop-word check is done on the lower-cased token.
            if token.lower() not in stop_words
        )

    return cleaned

# Returns the words of a sentence
def sent_to_words(sentences: list):
    """Tokenize every sentence with ``word_tokenize``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        List with one token list per input sentence, in input order.
    """
    tokenized = []
    for sentence in sentences:
        tokenized.append(word_tokenize(sentence))
    return tokenized

In [4]:
def perform_lda(category: str, data: list, shoud_preprocess: bool, num_topis: int, num_words: int,
                random_state=None):
    """Fit an LDA topic model on one category's sentences and store its keywords.

    Args:
        category: Label used in the console output and in the output file name.
        data: List of raw sentences belonging to the category.
        shoud_preprocess: (sic) If True the sentences go through ``preprocess``
            (stop-word removal and stemming); otherwise they are only tokenized.
        num_topis: (sic) Number of topics the LDA model is built with.
        num_words: Number of keywords reported per topic.
        random_state: Optional seed forwarded to ``LdaMulticore``. The default
            ``None`` matches gensim's own default.

    Returns:
        Tuple ``(id2word, corpus, lda_model)``.
    """
    if shoud_preprocess:
        # NOTE(review): preprocess() returns one flat token list, so this branch
        # builds a corpus with a single document, whereas the branch below
        # yields one document per sentence -- confirm this is intended.
        data_words = [preprocess(data)]
    else:
        data_words = list(sent_to_words(data))

    # Token <-> id mapping and the bag-of-words corpus derived from it.
    id2word = corpora.Dictionary(data_words)
    corpus = [id2word.doc2bow(text) for text in data_words]

    # Use the function's own argument for the topic count, so the function does
    # not depend on a notebook-level variable.
    lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word,
                                           num_topics=num_topis,
                                           random_state=random_state)

    # Report the top keywords per topic and hand them to fs.write_to_file.
    print(category + ":")
    keywords = lda_model.print_topics(num_words=num_words)
    pprint(keywords)
    file_path = fs.result_path + 'lda/lda_ana_' + category + '.txt'
    fs.write_to_file(file_path=file_path,
                     content=str(keywords), new=True)
    print('File for "' + category +'" successfully created at: ' + file_path)
    return id2word, corpus, lda_model

In [5]:
# Number of topics per LDA model and number of keywords reported per topic.
num_topics = 3
num_words = 4
pyLDAvis.enable_notebook()

# category -> (id2word, corpus, lda_model), filled one category at a time.
lda_dict = {}
for category in categories:
    # Perform LDA for the current category
    lda_dict[category] = perform_lda(category=category,
                                     data=dataframe_categories[category],
                                     shoud_preprocess=False,
                                     num_topis=num_topics, num_words=num_words)

anger:
[(0, '0.055*"i" + 0.036*"the" + 0.031*"feel" + 0.029*"to"'),
 (1, '0.094*"i" + 0.032*"feel" + 0.029*"and" + 0.024*"to"'),
 (2, '0.053*"i" + 0.028*"feel" + 0.022*"to" + 0.021*"a"')]
File for "anger" successfully created at: ./results/lda/lda_ana_anger.txt
fear:
[(0, '0.039*"i" + 0.026*"and" + 0.026*"the" + 0.022*"feel"'),
 (1, '0.076*"i" + 0.032*"feel" + 0.028*"and" + 0.027*"the"'),
 (2, '0.088*"i" + 0.036*"to" + 0.030*"feel" + 0.026*"and"')]
File for "fear" successfully created at: ./results/lda/lda_ana_fear.txt
joy:
[(0, '0.085*"i" + 0.036*"feel" + 0.028*"the" + 0.026*"and"'),
 (1, '0.071*"i" + 0.041*"to" + 0.031*"feel" + 0.028*"the"'),
 (2, '0.066*"i" + 0.042*"and" + 0.040*"feel" + 0.025*"to"')]
File for "joy" successfully created at: ./results/lda/lda_ana_joy.txt
love:
[(0, '0.052*"i" + 0.034*"to" + 0.031*"feel" + 0.022*"and"'),
 (1, '0.079*"i" + 0.040*"feel" + 0.028*"the" + 0.024*"a"'),
 (2, '0.070*"i" + 0.044*"and" + 0.027*"to" + 0.026*"the"')]
File for "love" successfully 

# Exc 3

Inspired by https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [6]:
# Use pyLDAvis to visualize LDA
def visualize_lda(category: str, id2word, corpus, lda_model, num_topics: int):
    """Prepare the pyLDAvis data for one model and export it as pickle and HTML.

    Both files are written below ``fs.result_path + 'lda/'`` and are named
    ``ldavis_prepared_<category>_<num_topics>`` (pickle, no extension) and the
    same name with ``.html`` appended.

    Args:
        category: Label used in the file names and in the console message.
        id2word: Dictionary passed on to ``pyLDAvis.gensim.prepare``.
        corpus: Corpus passed on to ``pyLDAvis.gensim.prepare``.
        lda_model: Model passed on to ``pyLDAvis.gensim.prepare``.
        num_topics: Only used as a suffix of the output file names.

    Returns:
        Tuple ``(prepared_data, html_filename)``; ``prepared_data`` is the
        object read back from the pickle file.
    """
    # Make sure the output directory exists before anything is written.
    output_dir = fs.result_path + 'lda/'
    os.makedirs(output_dir, exist_ok=True)

    pickle_path = output_dir + 'ldavis_prepared_' + category + '_' + str(num_topics)
    html_filename = pickle_path + '.html'

    # The preparation step always runs and its result is persisted
    # (n_jobs=1 is passed through to pyLDAvis).
    fresh_data = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, n_jobs=1)
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(fresh_data, pickle_file)

    # Read the prepared data back from disk; this copy is exported and returned.
    with open(pickle_path, 'rb') as pickle_file:
        LDAvis_prepared = pickle.load(pickle_file)

    pyLDAvis.save_html(LDAvis_prepared, html_filename)
    print('File for "' + category + '" successfully created at: ' + html_filename)
    return LDAvis_prepared, html_filename

In [7]:
# Build and export the pyLDAvis files for every category's fitted model.
for category in categories:
    # lda_dict holds (id2word, corpus, lda_model) per category.
    id2word, corpus, lda_model = lda_dict[category]
    prepared_data, html_filename = visualize_lda(category=category, id2word=id2word,
                                                 corpus=corpus, lda_model=lda_model,
                                                 num_topics=num_topics)

    # Optional: display the visualization directly in the notebook
    #display(pyLDAvis.display(prepared_data))

    # Optional: open the HTML file in a web browser (on Windows replace "open " with "start ")
    #os.system("open " + html_filename)

File for "anger" successfully created at: ./results/lda/ldavis_prepared_anger_3.html
File for "fear" successfully created at: ./results/lda/ldavis_prepared_fear_3.html
File for "joy" successfully created at: ./results/lda/ldavis_prepared_joy_3.html
File for "love" successfully created at: ./results/lda/ldavis_prepared_love_3.html
File for "sadness" successfully created at: ./results/lda/ldavis_prepared_sadness_3.html
File for "surprise" successfully created at: ./results/lda/ldavis_prepared_surprise_3.html
