# General

The relevant analyses in these tasks were also run on preprocessed data. The corresponding files have the suffix '\_preprocessed' at the end of their name.

# Exc 2

**In this exercise we first created datafiles for future use. The data is stored in the form of DataFrames.**
**For the next step we performed the LDA Analysis to find 3 topics with 4 keywords for each category.**

In [1]:
# necessary imports for exercise 2

import functions_and_variables as fs # file with useful functions and a few variables
import os
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords, words

#nltk.download('words')
#nltk.download('punkt')

from pprint import pprint

import pyLDAvis.gensim
import pickle 
import pyLDAvis

from IPython.display import HTML

### Create csv files 

The data is read from the text files and stored in the typical DataFrame form {category: [tweets]}.

In [2]:
# read a ';'-separated text file, group its tweets by category and export the result as .csv
def concat_categories_to_file(file_path, file_name, export_path):
    """Group the tweets of ``<file_path>/<file_name>.txt`` by category and export them.

    The first column of the file is used as the tweet text and the second
    column as its category. The grouped data is written to
    ``<export_path>/<file_name>.csv`` with the columns ``Category`` and
    ``Concatenated_Tweets``.

    NOTE(review): ``pd.read_csv`` is called without ``header=None``, so the
    first line of the file is consumed as a header row - confirm that the
    input files really start with a header line.

    Returns:
        dict mapping each category to the list of its tweets, with categories
        in order of first appearance.
    """
    source = "{}/{}.txt".format(file_path, file_name)
    target = '{}/{}.csv'.format(export_path, file_name)

    data = pd.read_csv(source, sep=';')

    # setdefault creates the list the first time a category shows up
    category_tweets = {}
    for label, text in zip(data.iloc[:, 1], data.iloc[:, 0]):
        category_tweets.setdefault(label, []).append(text)

    rows = list(category_tweets.items())
    pd.DataFrame(rows, columns=['Category', 'Concatenated_Tweets']).to_csv(
        target, index=False, encoding='utf-8'
    )
    print('Dataframes [key: sentences] for {} were successfully created and stored as .csv'.format(file_name))
    return category_tweets

# Export every split; only the grouping of the complete data set is kept in memory.
for file in ['test', 'val', 'train', 'complete']:
    grouped = concat_categories_to_file('data', file, 'categories')
    if file == 'complete':
        dataframe_categories = grouped

# Re-build the dict with alphabetically ordered categories.
dataframe_categories = {key: dataframe_categories[key] for key in sorted(dataframe_categories)}
categories = dataframe_categories.keys()

Dataframes [key: sentences] for test were successfully created and stored as .csv
Dataframes [key: sentences] for val were successfully created and stored as .csv
Dataframes [key: sentences] for train were successfully created and stored as .csv
Dataframes [key: sentences] for complete were successfully created and stored as .csv


### Perform the Latent Dirichlet Allocation

The data in this code is not preprocessed, but we have also run the analysis with preprocessed data, so there are files in the same folder with '\_preprocessed' at the end of their name. This variant can be reproduced by setting _shoud_preprocess=True_.

Inspired from
https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [3]:
def perform_lda(category: str, data: list, shoud_preprocess: bool, num_topis: int, num_words: int):
    """Fit an LDA model on the tweets of one category and store its keywords.

    Args:
        category: Name of the category; used for the printed heading and the
            name of the result file.
        data: List of tweets (strings) belonging to the category.
        shoud_preprocess: If True the tweets are passed through
            ``fs.preprocess`` and the result file gets the suffix
            ``_preprocessed``; otherwise every tweet is only tokenized with
            ``word_tokenize``.
        num_topis: Number of topics the LDA model should find.
        num_words: Number of keywords printed and stored per topic.

    Returns:
        Tuple ``(id2word, corpus, lda_model)``: the gensim dictionary, the
        bag-of-words corpus and the trained model.
    """
    if shoud_preprocess:
        # NOTE(review): the whole output of fs.preprocess becomes ONE document
        # here - confirm that fs.preprocess returns a flat list of tokens.
        data_words = [fs.preprocess(data)]
        path_preprocessed = '_preprocessed'
    else:
        data_words = [word_tokenize(sentence) for sentence in data]
        path_preprocessed = ''

    # Create Dictionary (token <-> id mapping)
    id2word = corpora.Dictionary(data_words)

    # Term Document Frequency (bag-of-words per document)
    corpus = [id2word.doc2bow(text) for text in data_words]

    # Build LDA model.
    # The topic count comes from the num_topis argument; previously the
    # argument was ignored and the module-level variable num_topics was read.
    lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word,
                                           num_topics=num_topis)

    # Print the keywords with the specified number of words
    print(category + ":")
    keywords = lda_model.print_topics(num_words=num_words)
    pprint(keywords)

    # Store the string representation of the keyword list in the result folder
    file_path = fs.result_path + 'lda/lda_ana_' + category + path_preprocessed + '.csv'
    fs.write_to_file(file_path=file_path,
                     content=str(keywords), new=True)
    print('File for "' + category + '" successfully created at: ' + file_path)
    return id2word, corpus, lda_model

num_topics = 3
num_words = 4
pyLDAvis.enable_notebook()

# Map every category to its (id2word, corpus, lda_model) tuple.
# The dict is filled entry by entry instead of being pre-populated with
# dict.fromkeys(..., []), which would bind all keys to one shared list object.
lda_dict = {}
for category in categories:
    # Perform LDA for the current category
    lda_dict[category] = perform_lda(category=category,
                                     data=dataframe_categories[category],
                                     shoud_preprocess=False,
                                     num_topis=num_topics,
                                     num_words=num_words)

anger:
[(0, '0.083*"i" + 0.032*"and" + 0.029*"feel" + 0.024*"the"'),
 (1, '0.070*"i" + 0.034*"the" + 0.032*"to" + 0.026*"feel"'),
 (2, '0.083*"i" + 0.040*"feel" + 0.026*"to" + 0.026*"and"')]
File for "anger" successfully created at: ./results/lda/lda_ana_anger.csv
fear:
[(0, '0.085*"i" + 0.035*"to" + 0.034*"and" + 0.030*"feel"'),
 (1, '0.058*"i" + 0.026*"the" + 0.025*"a" + 0.024*"feel"'),
 (2, '0.071*"i" + 0.034*"feel" + 0.026*"the" + 0.024*"feeling"')]
File for "fear" successfully created at: ./results/lda/lda_ana_fear.csv
joy:
[(0, '0.081*"i" + 0.040*"feel" + 0.035*"the" + 0.032*"to"'),
 (1, '0.066*"i" + 0.034*"feel" + 0.032*"and" + 0.028*"to"'),
 (2, '0.075*"i" + 0.028*"and" + 0.026*"feel" + 0.024*"to"')]
File for "joy" successfully created at: ./results/lda/lda_ana_joy.csv
love:
[(0, '0.055*"i" + 0.034*"and" + 0.024*"feel" + 0.021*"to"'),
 (1, '0.072*"i" + 0.034*"the" + 0.033*"and" + 0.027*"feel"'),
 (2, '0.071*"i" + 0.040*"feel" + 0.031*"to" + 0.021*"and"')]
File for "love" succes

### Extra function and creation of new preprocessed files

These .csv files were created for future use and store the data in the same way as described above. However, the tweets are preprocessed beforehand.

In [4]:
def write_preprocessed_csv(file_path, file_name, export_path):
    """Export the preprocessed tweets of ``<file_path>/<file_name>.txt`` grouped by category.

    Every line delivered by ``fs.read_file`` is expected to hold the tweet at
    position 0 and its category at position 1. The tokens returned by
    ``fs.preprocess`` are joined with single spaces, and the grouped result is
    written to ``<export_path>/<file_name>_preprocessed.csv``.

    The set of categories comes from the module-level ``categories``; a line
    whose category is not contained in it raises a ``KeyError``.
    """
    source = '{}/{}.txt'.format(file_path, file_name)
    target = '{}/{}_preprocessed.csv'.format(export_path, file_name)

    # one (initially empty) list of cleaned tweets per known category
    cleaned_dataframe = {}
    for key in categories:
        cleaned_dataframe[key] = []

    for line in fs.read_file(source, ';'):
        cleaned_tweet = ' '.join(fs.preprocess(line[0], False))
        cleaned_dataframe[line[1]].append(cleaned_tweet)

    rows = list(cleaned_dataframe.items())
    pd.DataFrame(rows, columns=['Category', 'Concatenated_Tweets']).to_csv(
        target, index=False, encoding='utf-8'
    )
    print('File for "' + file_name +'" successfully created at: ' + export_path)
    
# Write the preprocessed export for every split of the data set.
for file in ('test', 'val', 'train', 'complete'):
    write_preprocessed_csv(file_path='data', file_name=file, export_path='categories')

File for "test" successfully created at: categories
File for "val" successfully created at: categories
File for "train" successfully created at: categories
File for "complete" successfully created at: categories


# Exc 3

Inspired by https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0, we created a visualization using **pyLDAvis** to understand the intertopic relationships as well as to interpret the individual topics. The visualizations can either be opened from the folder, or directly in the notebook by uncommenting the lines mentioned below.
In addition, we calculated the **intertopic distances** using the number of **common keywords** as well as the **cosine similarity** of the corresponding embedding vectors.

In [5]:
import re
import ast
from scipy.spatial import distance
import numpy as np

## pyLDAvis

For more information refer to the chapter **Analyzing LDA model results** at the above link.

In [6]:
# Use pyLDAvis to visualize LDA
def visualize_lda(category: str, id2word, corpus, lda_model, num_topics: int):
    """Create the pyLDAvis visualisation of one LDA model and save it as HTML.

    The prepared data is pickled to
    ``<fs.result_path>lda/model/ldavis_prepared_<category>_<num_topics>``,
    read back, exported to the same path with the extension ``.html`` and the
    pickle file is deleted again afterwards.

    Returns:
        Tuple ``(LDAvis_prepared, html_filename)``: the prepared pyLDAvis data
        and the path of the written HTML file.
    """
    model_dir = fs.result_path + 'lda/model/'
    # Create the directory if it doesn't exist
    os.makedirs(model_dir, exist_ok=True)

    filename = model_dir + 'ldavis_prepared_' + category + '_' + str(num_topics)
    LDAvis_data_filepath = os.path.join(filename)
    html_filename = filename + '.html'

    # The preparation is a bit time consuming. It has to run on every call,
    # because the pickle file is removed again at the end of this function.
    run_preparation = True
    if run_preparation:
        freshly_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, n_jobs=1)
        with open(LDAvis_data_filepath, 'wb') as pickle_file:
            pickle.dump(freshly_prepared, pickle_file)

    # load the prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as pickle_file:
        LDAvis_prepared = pickle.load(pickle_file)

    pyLDAvis.save_html(LDAvis_prepared, html_filename)
    print('File for "' + category + '" successfully created at: ' + html_filename)

    # only the HTML export is kept
    os.remove(LDAvis_data_filepath)
    return LDAvis_prepared, html_filename

for category in categories:
    # lda_dict holds the (id2word, corpus, lda_model) tuple of every category
    id2word, corpus, lda_model = lda_dict[category]

    # Create visualisation
    prepared_data, html_filename = visualize_lda(
        category=category,
        id2word=id2word,
        corpus=corpus,
        lda_model=lda_model,
        num_topics=num_topics,
    )

    # Display the visualization directly in the notebook
    #display(pyLDAvis.display(prepared_data))

    # Open the HTML files in a web browser (if running on windows: replace "open " with "start ")
    #os.system("open " + html_filename)

File for "anger" successfully created at: ./results/lda/model/ldavis_prepared_anger_3.html
File for "fear" successfully created at: ./results/lda/model/ldavis_prepared_fear_3.html
File for "joy" successfully created at: ./results/lda/model/ldavis_prepared_joy_3.html
File for "love" successfully created at: ./results/lda/model/ldavis_prepared_love_3.html
File for "sadness" successfully created at: ./results/lda/model/ldavis_prepared_sadness_3.html
File for "surprise" successfully created at: ./results/lda/model/ldavis_prepared_surprise_3.html


## Intertopic distances 

For **I1** (common keywords): The algorithm currently returns the total **amount**. This can easily be changed to a relative measure by removing a comment in the code below.

For **I2** (semantic similarity): The embedding vector of a topic is the **average of the word2vec embeddings** of its four keywords. These embedding vectors are then used to calculate the semantic similarity (using **cosine similarity**) between all topic pairs.

In [7]:
# clean keywords by stripping special characters and numbers
def clean_keywords(keywords):
    """Return the keywords reduced to their alphabetic characters.

    Every entry of ``keywords`` (e.g. ``' 0.032*"and" '``) is filtered
    character by character; only characters for which ``str.isalpha`` is true
    are kept. The result has one entry per input entry, in the same order.
    """
    return [''.join(filter(str.isalpha, keyword)) for keyword in keywords]

# calculate the amount of common keywords (I1) and the cosine similarity (I2) of two topics
def _average_embedding(topic_words, model, vector_size):
    """Return the mean of the ``model.wv`` vectors of ``topic_words``."""
    # The accumulator must start at zero: np.empty returns uninitialised
    # memory, which would leak arbitrary values into the sum.
    total = np.zeros(vector_size)
    for word in topic_words:
        total += model.wv[word]
    return total / len(topic_words)


def calculate_closeness(topic1, topic2, model, vector_size):
    """Compare two LDA topics by shared keywords and by embedding similarity.

    Args:
        topic1: Topic string whose keywords are separated by ``'+'``
            (e.g. ``'0.083*"i" + 0.032*"and"'``).
        topic2: Second topic string in the same format.
        model: Object that provides the word vectors via ``model.wv[word]``;
            every keyword has to be available there.
        vector_size: Length of the word vectors of ``model``.

    Returns:
        Tuple ``(i1, i2)``. ``i1`` grows by 2 for every keyword of ``topic1``
        that also occurs in ``topic2``. ``i2`` is the cosine similarity
        (``1 - cosine distance``) of the averaged keyword vectors.
    """
    topic1_words = clean_keywords(topic1.split('+'))
    topic2_words = clean_keywords(topic2.split('+'))

    #### I1 - count common keywords of topics ####
    # Each shared keyword adds 2.
    # NOTE(review): presumably because it occurs once in each topic - confirm.
    i1 = 0
    for word in topic1_words:
        if word in topic2_words:
            i1 += 2

    # transform total to relative measure (share of all keywords that are common)
    #i1 = i1 / (len(topic1_words) + len(topic2_words))

    #### I2 - semantic similarity ####
    # calculate avg. word2vec embeddings per topic
    avg_topic_vector1 = _average_embedding(topic1_words, model, vector_size)
    avg_topic_vector2 = _average_embedding(topic2_words, model, vector_size)

    # calculate cosine similarity
    i2 = 1 - distance.cosine(avg_topic_vector1, avg_topic_vector2)

    return i1, i2

# function to display DataFrames side by side, requires a list of DataFrames
def horizontal(dfs):
    """Render the given DataFrames next to each other in one flex container."""
    cells = [
        '<div style="margin-right: 32px">' + df.to_html() + '</div>'
        for df in dfs
    ]
    display(HTML('<div style="display:flex">' + ''.join(cells) + '</div>'))


# Pairwise comparison of the LDA topics of every category.
# For each category the stored LDA keywords are read back, a Word2Vec model is
# trained on the tweets of that category, and I1/I2 are computed for every
# pair of topics. The resulting tables are displayed in groups.
df_list = [] # List of DataFrames for each category
index = 0 # running row label, shared across all categories (never reset)
vector_size = 1000 # vector size for model

# loop through the categories and calculate pairwise comparison of topics
for category in categories:
    closeness_data = pd.DataFrame(columns=['category','topics','I1','I2']) # dataframe to store values in

    # get the LDA result for the current category and parse it into a list of tuples
    # (only the first line of the file is read; presumably it holds the
    # str(keywords) content written by perform_lda - verify against fs.write_to_file)
    with open('./results/lda/lda_ana_' + category + '.csv', 'r') as file:
        lda_data = file.readline()
    topics = ast.literal_eval(lda_data)
    
    # get training data for model (dependent on category) 
    # and transform it to a list of words per sentence
    model_data = []    
    for sentence in dataframe_categories[category]:
        model_data.append(sentence.split(' '))
        
    # train model (min_count=1 keeps every word of the training data in the vocabulary)
    model = gensim.models.Word2Vec(sentences=model_data, min_count=1, 
                              vector_size=vector_size, window=5)
            
    # calculate closeness of topics and identify highest values
    max_i1 = 0
    max_i2 = 0
    for i in range(len(topics)):
        for j in range(i + 1, len(topics)):
            # topics[k] is a tuple whose element 1 is the keyword string
            closeness = calculate_closeness(topic1=topics[i][1], topic2=topics[j][1], 
                                            model=model, vector_size=vector_size)
            closeness_data.loc[index] = category, f"T{i}&T{j}", closeness[0], closeness[1]
            index += 1
            max_i1 = closeness[0] if closeness[0] > max_i1 else max_i1
            max_i2 = closeness[1] if closeness[1] > max_i2 else max_i2
        # The summary row is (re)written after every outer iteration at the
        # current label; since index is not incremented here, the next pair row
        # overwrites it and only the last write survives. The first row of the
        # next category reuses this label as well.
        closeness_data.loc[index] = category, "max_values", max_i1, max_i2
    
    # add DataFrame with the topic data to the list
    df_list.append(closeness_data)
    
    # print as many dataframes in a row as there are topics
    # NOTE(review): index grows by n*(n-1)/2 per category (n = len(topics)),
    # which equals n only for n == 3 - the grouping arithmetic below looks
    # tuned to num_topics == 3; confirm before changing the topic count.
    list_index = int(index/len(topics)) - 1
    if (index) % (len(topics)**2) == 0:
        horizontal(df_list[list_index- (len(topics) - 1):list_index+1])
    elif index + 2 > len(topics) * len(categories) - 1:
        horizontal(df_list[list_index:])

Unnamed: 0,category,topics,I1,I2
0,anger,T0&T1,6,0.999999
1,anger,T0&T2,6,1.0
2,anger,T1&T2,6,0.999895
3,anger,max_values,6,1.0

Unnamed: 0,category,topics,I1,I2
3,fear,T0&T1,4,0.999995
4,fear,T0&T2,4,0.999997
5,fear,T1&T2,6,0.999843
6,fear,max_values,6,0.999997

Unnamed: 0,category,topics,I1,I2
6,joy,T0&T1,6,0.999976
7,joy,T0&T2,6,0.999986
8,joy,T1&T2,8,0.999813
9,joy,max_values,8,0.999986


Unnamed: 0,category,topics,I1,I2
9,love,T0&T1,6,1.0
10,love,T0&T2,8,1.0
11,love,T1&T2,6,0.999351
12,love,max_values,8,1.0

Unnamed: 0,category,topics,I1,I2
12,sadness,T0&T1,8,0.999969
13,sadness,T0&T2,6,0.999942
14,sadness,T1&T2,6,0.970296
15,sadness,max_values,8,0.999969

Unnamed: 0,category,topics,I1,I2
15,surprise,T0&T1,6,0.999999
16,surprise,T0&T2,8,0.999999
17,surprise,T1&T2,6,0.999867
18,surprise,max_values,8,0.999999


# Exercise 4

In this Exercise we calculated the **linguistic quality ratio** for the given text. As the data from Kaggle was already preprocessed, we expected and got 0 values for symbols, links etc. However, as the input data are tweets, the quality is pretty **low**. There are many stopwords and words that are not in wordnet.

In [9]:
# function to calculate the linguistic quality ratio for a list of tweets
def calculate_linguistic_quality(tweets):
    """Compute the linguistic quality ratio of the given tweets.

    All tweets are tokenized with ``word_tokenize``. Every token is then
    checked independently against five criteria: English stopword, not
    contained in the NLTK ``words`` corpus (labelled "WordNet" in this
    notebook), not alphanumeric (symbol), starting with ``http://`` or
    ``https://`` (link) and numeric. The ratio is the token count minus all
    five counters, divided by the token count (``0.0`` without tokens).

    NOTE: the criteria are not mutually exclusive - a token that meets several
    of them is subtracted once per criterion.

    Returns:
        Tuple ``(total_tokens, linguistic_quality_ratio, stopwords_count,
        not_in_wordnet_count, symbols_count + links_count + numerals_count)``.
    """
    tokens = [token for sentence in tweets for token in word_tokenize(sentence)]
    total_tokens = len(tokens)

    # reference vocabularies as sets for fast membership tests
    english_stopwords = set(stopwords.words('english'))
    english_words = set(words.words())

    # one independent counter per criterion
    stopwords_count = sum(1 for token in tokens if token in english_stopwords)
    not_in_wordnet_count = sum(1 for token in tokens if token not in english_words)
    symbols_count = sum(1 for token in tokens if not token.isalnum())
    links_count = sum(1 for token in tokens if token.startswith(("http://", "https://")))
    numerals_count = sum(1 for token in tokens if token.isnumeric())

    other_count = symbols_count + links_count + numerals_count
    cleaned_tokens = total_tokens - stopwords_count - not_in_wordnet_count - other_count

    if total_tokens > 0:
        linguistic_quality_ratio = cleaned_tokens / total_tokens
    else:
        linguistic_quality_ratio = 0.0

    return total_tokens, linguistic_quality_ratio, stopwords_count, not_in_wordnet_count, other_count

# evaluate every category and show the results as one table (one row per category)
linguistic_quality_ratios = {
    category: calculate_linguistic_quality(tweets)
    for category, tweets in dataframe_categories.items()
}

quality_columns = ['Total tokens', 'Linguistic Quality Ratio', 'stopwords', 'not in wordnet', 'symbols/numbers/etc']
quality_table = pd.DataFrame.from_dict(linguistic_quality_ratios, orient='index', columns=quality_columns)
display(quality_table)

Unnamed: 0,Total tokens,Linguistic Quality Ratio,stopwords,not in wordnet,symbols/numbers/etc
anger,52177,0.390057,26879,4946,0
fear,44498,0.398782,22763,3990,0
joy,131401,0.401565,67405,11230,0
love,33833,0.38959,17521,3131,0
sadness,106784,0.400547,54413,9599,0
surprise,14184,0.384729,7260,1467,0
