In [1]:
from typing import List
from dao.email import DAOEmailGenerated, DAORealEmail

from dao.attribute import DAOAttribute
from models.attribute import AttributeInDB

# One data-access object per collection; only `dao_attribute` is queried in this cell.
dao_generated_emails: DAOEmailGenerated = DAOEmailGenerated()
dao_real_emails: DAORealEmail = DAORealEmail()
dao_attribute: DAOAttribute = DAOAttribute()


# Empty query (presumably matches every stored attribute), then a split on the
# `is_generated` flag.
all_attributes: List[AttributeInDB] = dao_attribute.find_many_by_query(dict())
all_generated: List[AttributeInDB] = dao_attribute.find_many_by_query(dict(is_generated=True))
all_real: List[AttributeInDB] = dao_attribute.find_many_by_query(dict(is_generated=False))

# Attributes flagged `is_personal`, for generated and real data respectively.
generated_personal: List[AttributeInDB] = dao_attribute.find_many_by_query(
    dict(is_generated=True, is_personal=True)
)
real_personal: List[AttributeInDB] = dao_attribute.find_many_by_query(
    dict(is_generated=False, is_personal=True)
)

# Discard any `None` entries from the two personal selections.
generated_personal = list(filter(lambda attribute: attribute is not None, generated_personal))
real_personal = list(filter(lambda attribute: attribute is not None, real_personal))

# Per-language selections: Polish ('pl') ...
pl_generated: List[AttributeInDB] = dao_attribute.find_many_by_query(
    dict(is_generated=True, language='pl')
)
pl_real: List[AttributeInDB] = dao_attribute.find_many_by_query(
    dict(is_generated=False, language='pl')
)

# ... and English ('en').
en_generated: List[AttributeInDB] = dao_attribute.find_many_by_query(
    dict(is_generated=True, language='en')
)
en_real: List[AttributeInDB] = dao_attribute.find_many_by_query(
    dict(is_generated=False, language='en')
)

# Per-language selections restricted to attributes flagged `is_personal`.
pl_generated_personal: List[AttributeInDB] = dao_attribute.find_many_by_query(
    dict(is_generated=True, language='pl', is_personal=True)
)
pl_real_personal: List[AttributeInDB] = dao_attribute.find_many_by_query(
    dict(is_generated=False, language='pl', is_personal=True)
)

en_generated_personal: List[AttributeInDB] = dao_attribute.find_many_by_query(
    dict(is_generated=True, language='en', is_personal=True)
)
en_real_personal: List[AttributeInDB] = dao_attribute.find_many_by_query(
    dict(is_generated=False, language='en', is_personal=True)
)

In [2]:
from analysis.nlp_transformations import remove_stopwords_punctuation_emojis_and_splittings
from collections import Counter

def get_word_counts(attributes: List["AttributeInDB"]) -> Counter:
    """Tally cleaned tokens over the lemmatized texts of ``attributes``.

    For every attribute, its ``lemmatized_text`` and ``language`` are handed to
    ``remove_stopwords_punctuation_emojis_and_splittings``; whatever that helper
    returns (presumably an iterable of word tokens) is added to one shared
    ``Counter``.

    Args:
        attributes: Attribute records to process. ``None`` entries are skipped,
            because query results in this notebook can contain ``None`` (two of
            them are explicitly filtered for it) while most lists passed here
            are not filtered.

    Returns:
        A ``Counter`` of the accumulated tokens; empty when ``attributes`` is
        empty or holds only ``None`` entries.
    """
    word_counts: Counter = Counter()
    for attribute in attributes:
        if attribute is None:
            # Nothing to read from a missing record; avoid AttributeError below.
            continue
        text_split_clean = remove_stopwords_punctuation_emojis_and_splittings(
            attribute.lemmatized_text, attribute.language
        )
        word_counts.update(text_split_clean)
    return word_counts

In [3]:
# Token frequencies per language ('en' / 'pl') and origin (real / generated).
(
    word_counts_en_real,
    word_counts_en_generated,
    word_counts_pl_real,
    word_counts_pl_generated,
) = [
    get_word_counts(attributes)
    for attributes in (en_real, en_generated, pl_real, pl_generated)
]

# The same four combinations, restricted to attributes flagged as personal.
(
    word_counts_en_real_personal,
    word_counts_en_generated_personal,
    word_counts_pl_real_personal,
    word_counts_pl_generated_personal,
) = [
    get_word_counts(attributes)
    for attributes in (
        en_real_personal,
        en_generated_personal,
        pl_real_personal,
        pl_generated_personal,
    )
]


In [11]:
from wordcloud import WordCloud
import re
import matplotlib.pyplot as plt

def create_wordcloud(word_counts, title: str) -> None:
    """Draw a word cloud for ``word_counts`` and show it under ``title``.

    Args:
        word_counts: Mapping of word to frequency (e.g. a ``Counter``).
        title: Text displayed above the figure.

    Returns:
        None. The figure is displayed via ``plt.show()``.

    An empty (or ``None``) ``word_counts`` is reported with a printed notice and
    no figure is drawn: ``WordCloud.generate_from_frequencies`` raises
    ``ValueError`` when it is given no words, which would otherwise abort the
    cell for any empty subset.
    """
    if not word_counts:
        print(f"No words to plot for '{title}'; skipping word cloud.")
        return

    wordcloud = WordCloud(
        width=800, height=400, background_color='white'
    ).generate_from_frequencies(word_counts)

    # Explicit figure/axes instead of the implicit pyplot state machine.
    _, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(title, size=20)
    ax.axis('off')
    plt.show()

In [12]:
# Word cloud of token frequencies from English ('en'), real (is_generated=False), personal attributes.
create_wordcloud(word_counts_en_real_personal, 'English real personal')

In [14]:
# Word cloud of token frequencies from English ('en'), generated (is_generated=True), personal attributes.
create_wordcloud(word_counts_en_generated_personal, 'English generated personal')

In [15]:
# Word cloud of token frequencies from Polish ('pl'), real (is_generated=False), personal attributes.
create_wordcloud(word_counts_pl_real_personal, 'Polish real personal')

In [16]:
# Word cloud of token frequencies from Polish ('pl'), generated (is_generated=True), personal attributes.
create_wordcloud(word_counts_pl_generated_personal, 'Polish generated personal')