# Exc 2

Inspired from
https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [1]:
import functions_and_variables as fs
import os
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.stem.porter import *
from nltk.corpus import stopwords, words

#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('words')

from pprint import pprint

import pyLDAvis.gensim
import pickle 
import pyLDAvis

In [2]:
def concat_categories_to_file(file_path, file_name, export_path):
    data = pd.read_csv("{}/{}.txt".format(file_path,file_name),sep=';')
    tweets = data.iloc[:, 0]
    categories = data.iloc[:, 1]
    
    category_tweets = {}
    for category, tweet in zip(categories, tweets):
        if category not in category_tweets:
            category_tweets[category] = []
        category_tweets[category].append(tweet)

    result = pd.DataFrame(category_tweets.items(),columns=['Category', 'Concatenated_Tweets'])
    result.to_csv('{}/{}.csv'.format(export_path,file_name), index=False, encoding='utf-8')
    print('Dataframes [key: sentences] for {} were successfully created and stored as .csv'.format(file_name))
    return category_tweets

for file in ['test','val','train','complete']:
    if file == 'complete':
        dataframe_categories = concat_categories_to_file('data',file,'categories')
    else:
        concat_categories_to_file('data',file,'categories')

dataframe_categories = dict(sorted(dataframe_categories.items()))
categories = dataframe_categories.keys()

Dataframes [key: sentences] for test were successfully created and stored as .csv
Dataframes [key: sentences] for val were successfully created and stored as .csv
Dataframes [key: sentences] for train were successfully created and stored as .csv
Dataframes [key: sentences] for complete were successfully created and stored as .csv


In [3]:
# Perform Stemming and remove Stopwords
def preprocess(sentences, append):
    cleaned = []
    
    for sentence in sentences:
        if not type(sentences) == list: 
            sentence = sentences
            break_p = True
        stop_words = list(set(stopwords.words('english')))
        stop_words.extend(['im', 'ive','dont','cant'])
        stemmer = PorterStemmer()
    
        word_tokens = word_tokenize(sentence)
        cleaned_words = [stemmer.stem(w) for w in word_tokens if not w.lower() in stop_words]
        if append:
            cleaned.append(cleaned_words)
        else:
            cleaned.extend(cleaned_words)
        if break_p: 
            break
    return cleaned

# Returns the words of a sentence
def sent_to_words(sentences: list):
        return [word_tokenize(sentence) for sentence in sentences]

In [4]:
def perform_lda(category: str, data: list, shoud_preprocess: bool, num_topis: int, num_words: int):
    
    data_words = []

    if not shoud_preprocess:
        data_words = list(sent_to_words(data))

    else:
        data_words.append(preprocess(data))
    
    # Create Dictionary
    id2word = corpora.Dictionary(data_words)
    
    # Create Corpus
    texts = data_words
    
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # Build LDA model
    lda_model = gensim.models.LdaMulticore(corpus=corpus,id2word=id2word,
                                           num_topics=num_topics)

    # Print the Keyword with the specified number of words
    print(category + ":")
    keywords = lda_model.print_topics(num_words=num_words)
    pprint(keywords)
    file_path = fs.result_path + 'lda/lda_ana_' + category + '.csv'
    fs.write_to_file(file_path=file_path, 
                     content=str(keywords), new=True)
    print('File for "' + category +'" successfully created at: ' + file_path)
    return id2word, corpus, lda_model

In [5]:
num_topics = 3
num_words = 4
lda_dict = dict.fromkeys(dataframe_categories.keys(), [])
pyLDAvis.enable_notebook()

index = 0
for category in categories:    
    # Perform LDA for the current topic
    lda_dict[category] = perform_lda(category=category, 
                                             data=dataframe_categories[category], shoud_preprocess=False, 
                                             num_topis=num_topics, num_words=num_words)

anger:
[(0, '0.071*"i" + 0.028*"to" + 0.027*"feel" + 0.027*"the"'),
 (1, '0.095*"i" + 0.041*"feel" + 0.032*"and" + 0.020*"to"'),
 (2, '0.069*"i" + 0.029*"the" + 0.028*"and" + 0.027*"to"')]
File for "anger" successfully created at: ./results/lda/lda_ana_anger.csv
fear:
[(0, '0.075*"i" + 0.034*"to" + 0.033*"and" + 0.031*"feel"'),
 (1, '0.078*"i" + 0.034*"feel" + 0.029*"the" + 0.024*"a"'),
 (2, '0.076*"i" + 0.029*"to" + 0.029*"and" + 0.025*"feel"')]
File for "fear" successfully created at: ./results/lda/lda_ana_fear.csv
joy:
[(0, '0.074*"i" + 0.039*"the" + 0.035*"feel" + 0.034*"to"'),
 (1, '0.076*"i" + 0.035*"feel" + 0.035*"and" + 0.023*"feeling"'),
 (2, '0.077*"i" + 0.035*"feel" + 0.032*"to" + 0.030*"and"')]
File for "joy" successfully created at: ./results/lda/lda_ana_joy.csv
love:
[(0, '0.053*"i" + 0.031*"the" + 0.030*"feel" + 0.029*"and"'),
 (1, '0.084*"i" + 0.037*"feel" + 0.027*"and" + 0.025*"to"'),
 (2, '0.061*"i" + 0.033*"and" + 0.032*"to" + 0.026*"the"')]
File for "love" successfu

### Extra function and creation of new cleaned files

In [6]:
def write_cleaned_csv(file_path, file_name, export_path):
    file = fs.read_file('{}/{}.txt'.format(file_path,file_name),';')
    cleaned_dataframe = {key:[] for key in categories}
    for line in file[:5]:
        cleaned_dataframe[line[1]].append(' '.join(preprocess(line[0], False)))

    result = pd.DataFrame(cleaned_dataframe.items(),columns=['Category', 'Concatenated_Tweets'])
    result.to_csv('{}/{}_cleaned.csv'.format(export_path,file_name), index=False, encoding='utf-8')
    print('File for "' + file_name +'" successfully created at: ' + export_path)
    
for file in ['test','val','train','complete']:
    write_cleaned_csv('data',file,'categories')

File for "test" successfully created at: categories
File for "val" successfully created at: categories
File for "train" successfully created at: categories
File for "complete" successfully created at: categories


# Exc 3

Inspired from https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [7]:
import re
import ast
from scipy.spatial import distance
import numpy as np

In [8]:
# Use pyLDAvis to visualize LDA
def visualize_lda(category: str, id2word, corpus, lda_model, num_topics: int):
    # Create the directory if it doesn't exist
    os.makedirs(fs.result_path + 'lda/', exist_ok=True)
    
    filename = fs.result_path + 'lda/ldavis_prepared_' + category + '_' +  str(num_topics)
    LDAvis_data_filepath = os.path.join(filename)
    
    ### this is a bit time consuming - make the if statement True
    ### if you want to execute visualization prep yourself
    if 1 == 1:
        LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, n_jobs=1)
        
        with open(LDAvis_data_filepath, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)
    
    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)
    
    html_filename = filename + '.html'
    pyLDAvis.save_html(LDAvis_prepared, html_filename)
    print('File for "' + category + '" successfully created at: ' + html_filename)
    return LDAvis_prepared, html_filename

In [9]:
for category in categories:    
    # Create visualisation
    prepared_data, html_filename = visualize_lda(category=category, id2word=lda_dict[category][0], 
                                                 corpus=lda_dict[category][1], 
                                                 lda_model=lda_dict[category][2], 
                                                 num_topics=num_topics)
    
    # Display the visualization directly in the notebook
    #display(pyLDAvis.display(prepared_data))
    
    # Open the HTML files in a web browser (if running on windows: replace "open " with "start ")
    #os.system("open " + html_filename)

File for "anger" successfully created at: ./results/lda/ldavis_prepared_anger_3.html
File for "fear" successfully created at: ./results/lda/ldavis_prepared_fear_3.html
File for "joy" successfully created at: ./results/lda/ldavis_prepared_joy_3.html
File for "love" successfully created at: ./results/lda/ldavis_prepared_love_3.html
File for "sadness" successfully created at: ./results/lda/ldavis_prepared_sadness_3.html
File for "surprise" successfully created at: ./results/lda/ldavis_prepared_surprise_3.html


In [10]:
def clean_keywords(keywords):
    words = []
    for word in keywords:
        word = ''.join(char for char in word if char.isalpha())
        words.append(str(word))
    return words

def calculate_closeness(topic1, topic2, model, vector_size):
    topic1_words = clean_keywords(topic1.split('+'))
    topic2_words = clean_keywords(topic2.split('+'))
    
    # I1
    measure1 = 0;
    for word in topic1_words:
        if word in topic2_words: 
            measure1 += 2
    
    #measure1 = points/0.5*len(topic1 + topic2) # get percentage
    
    # I2
    
    # calculate avg. word2vec embeddings per topic
    avg_topic_vector1 = np.empty(vector_size)
    for word in topic1_words:
        avg_topic_vector1 += model.wv[word]
    avg_topic_vector1 /= len(topic1_words)
    
    avg_topic_vector2 = np.empty(vector_size)
    for word in topic2_words:
        avg_topic_vector2 += model.wv[word]
    avg_topic_vector2 /= len(topic2_words)
    
    measure2 = 1 - distance.cosine(avg_topic_vector1, avg_topic_vector2)
    
    return measure1, measure2

In [11]:
closeness_data = pd.DataFrame(columns=['category','topics','I1','I2']) # dataframe to store values in

index = 0 # index to iterate through dataframe
vector_size = 1000 # vector size for model
for category in categories:
    # get lda_resut for the current category and transfer it to tuples
    with open('./results/lda/lda_ana_' + category + '.csv', 'r') as file:
        lda_data = file.readline()
    topics = ast.literal_eval(lda_data)
    
    # get training data for model and transform it to a list of words per sentence
    model_data = []
    for sentence in dataframe_categories[category]:
        model_data.append(sentence.split(' '))
        
    # train model
    model = gensim.models.Word2Vec(sentences=model_data, min_count=1, 
                              vector_size=vector_size, window=5)
    
    # calculate closeness of topics
    for i in range(len(topics)):
        for j in range(i + 1, len(topics)):
            closeness = calculate_closeness(topic1=topics[i][1], topic2=topics[j][1], 
                                            model=model, vector_size=vector_size)
            closeness_data.loc[index] = category, f"T{i}&T{j}", closeness[0], closeness[1]
            index += 1
display(closeness_data)

Unnamed: 0,category,topics,I1,I2
0,anger,T0&T1,6,0.999999
1,anger,T0&T2,6,0.999996
2,anger,T1&T2,6,0.999892
3,fear,T0&T1,4,0.999996
4,fear,T0&T2,8,0.999487
5,fear,T1&T2,4,0.999258
6,joy,T0&T1,4,0.999668
7,joy,T0&T2,6,0.999864
8,joy,T1&T2,6,0.98642
9,love,T0&T1,6,0.999924


# Exercise 4

In [12]:
# Function to calculate linguistic quality ratio for a given text
def calculate_linguistic_quality(text_data):
    tokens = []
    for sentence in text_data:
        tokens.extend(word_tokenize(sentence))
    tokens = list(set(tokens))
    
    # Count the total number of tokens
    total_tokens = len(tokens)

    # Initialize counters for stopwords, words not in WordNet, symbols, links, and numerals
    stopwords_count = 0
    not_in_wordnet_count = 0
    symbols_count = 0
    links_count = 0
    numerals_count = 0

    # Define the set of English stopwords
    english_stopwords = set(stopwords.words('english'))

    # Define the set of English words in WordNet
    english_words = set(words.words())

    # Check each token
    for token in tokens:
        # Check for stopwords
        if token in english_stopwords:
            stopwords_count += 1
        # Check if the token is not in WordNet
        if token not in english_words:
            not_in_wordnet_count += 1
        # Check for symbols (you can define your own criteria for symbols)
        if not token.isalnum():
            symbols_count += 1
        # Check for links
        if token.startswith("http://") or token.startswith("https://"):
            links_count += 1
        # Check for numerals
        if token.isnumeric():
            numerals_count += 1

    # Calculate the linguistic quality ratio
    cleaned_tokens = total_tokens - stopwords_count - not_in_wordnet_count - symbols_count - links_count - numerals_count
    linguistic_quality_ratio = cleaned_tokens / total_tokens if total_tokens > 0 else 0.0

    return linguistic_quality_ratio

In [13]:
# Calculate linguistic quality ratio for each category
linguistic_quality_ratios = {}
for category, text_data in dataframe_categories.items():
    linguistic_quality_ratios[category] = calculate_linguistic_quality(text_data)

display(pd.DataFrame(list(linguistic_quality_ratios.items()), columns=['category', 'Linguistic Quality Ratio']))

Unnamed: 0,category,Linguistic Quality Ratio
0,anger,0.647572
1,fear,0.659458
2,joy,0.604459
3,love,0.649909
4,sadness,0.628194
5,surprise,0.673271
