In [1]:
import os

from tqdm import tqdm

from dao.email import AVAILABLE_EMAIL_DAOS
from analysis.attribute_statistics import SimpleLanguageStatistics
from analysis.nlp_transformations import lemmatize_text
from config import init_spacy_english_nlp_model, init_spacy_polish_nlp_model

In [2]:
# Run the project's spaCy initialisers for the English and Polish NLP models.
# NOTE(review): presumably lemmatize_text() relies on these models having been
# initialised first — confirm against analysis.nlp_transformations.
init_spacy_english_nlp_model()
init_spacy_polish_nlp_model()

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [6]:
# One-off data fix for the 'spam_assassin' DAO: every email whose body contains
# a literal "<body>" tag but is stored with is_html=False gets is_html=True.
# NOTE(review): the regex is case-sensitive and does not match "<BODY>" or
# "<body attr=...>" — confirm whether those emails should be flagged as well.
dao = AVAILABLE_EMAIL_DAOS['spam_assassin']
query = {'body': {'$regex': "<body>"}, 'is_html': False}

# A single server-side update_many replaces the previous pattern of fetching
# every matching document and issuing one update_one round trip per document.
fix_result = dao.collection.update_many(query, {'$set': {'is_html': True}})

# Show how many emails were actually re-flagged.
fix_result.modified_count

In [7]:
# Lemmatize subject and body of every email that has no 'lemmatized_subject'
# field yet, and write both lemmatized strings back to the same document.
for dao_name, dao in AVAILABLE_EMAIL_DAOS.items():
    print(dao.collection_name)

    pending_emails = dao.find_many_by_query({'lemmatized_subject': {'$exists': False}})
    progress = tqdm(pending_emails, total=len(pending_emails), desc='Lemmatizing texts',
                    unit='emails', miniters=1)

    for email in progress:
        # Plain text wins whenever it is present. Without it, HTML emails get
        # an empty body while non-HTML emails fall back to the raw body.
        if email.text_plain:
            body = email.text_plain
        elif email.is_html:
            body = ""
        else:
            body = email.body

        # skip non-english and non-polish emails
        if email.detected_language not in ('pl', 'en'):
            continue
        lang = email.detected_language

        # An empty body is stored as-is; lemmatize_text is not called for it.
        lem_body_str = ""
        if body != "":
            lem_body_str, _ = lemmatize_text(text=body, lang_code=lang)

        lem_subject_str, _ = lemmatize_text(text=email.subject, lang_code=lang)

        dao.update_one(
            {'_id': email.id},
            {'$set': {'lemmatized_subject': lem_subject_str,
                      'lemmatized_body': lem_body_str}},
        )


email_spam_dataset


Lemmatizing texts: 0emails [00:00, ?emails/s]


email_classification_github


Lemmatizing texts: 0emails [00:00, ?emails/s]

email_spam_assassin_dataset



Lemmatizing texts: 100%|██████████| 7440/7440 [2:52:31<00:00,  1.39s/emails]   


gmail1


Lemmatizing texts: 100%|██████████| 31836/31836 [1:33:53<00:00,  5.65emails/s]   


gmail2


Lemmatizing texts: 100%|██████████| 2817/2817 [33:43<00:00,  1.39emails/s]  


gmail3


Lemmatizing texts: 100%|██████████| 4123/4123 [23:57<00:00,  2.87emails/s]  


In [8]:
# Accumulator keyed by detected language code; the processing loop stores one
# SimpleLanguageStatistics instance per language in it.
language_models = {}
# Batch size handed to the MongoDB cursor when streaming emails.
chunk_size = 100

In [10]:
# Feed the lemmatized body + subject of every English/Polish email into the
# per-language SimpleLanguageStatistics accumulators in `language_models`.

# Start from an empty accumulator so that re-running this cell (without
# re-running the cell that creates `language_models`) does not add every
# text a second time.
language_models.clear()

for dao_name in AVAILABLE_EMAIL_DAOS:
    dao = AVAILABLE_EMAIL_DAOS[dao_name]
    query = {'$or':[{'detected_language': 'en'},{'detected_language': 'pl'}]}
    # The count is only used to size the progress bar; the loop below is driven
    # by the cursor itself, so a mismatch between count and cursor is harmless.
    total_documents = dao.collection.count_documents(query)
    cursor = dao.collection.find(query).batch_size(chunk_size)
    progress_bar = tqdm(total=total_documents, desc=f"Processing email texts from {dao.collection_name}", unit="emails",
                        miniters=1)
    try:
        # Iterating the cursor directly stops cleanly on exhaustion. The former
        # `cursor.next()` inside a generator expression turned an early
        # exhaustion into a RuntimeError (PEP 479).
        for doc in cursor:
            text = doc['lemmatized_body'] +" "+ doc['lemmatized_subject']
            lang = doc['detected_language']
            if lang not in language_models:
                language_models[lang] = SimpleLanguageStatistics(lang)
            language_models[lang].add_texts([text])
            progress_bar.update(1)
    finally:
        # Release both resources even when processing fails part-way, so no
        # stale progress bar or open server-side cursor is left behind.
        cursor.close()
        progress_bar.close()



Processing email texts from email_spam_dataset:   0%|          | 0/5728 [00:36<?, ?emails/s][A

Processing email texts from email_spam_dataset:  24%|██▍       | 1400/5728 [00:00<00:00, 12769.88emails/s][A
Processing email texts from email_spam_dataset:  44%|████▎     | 2500/5728 [00:00<00:00, 11724.31emails/s][A
Processing email texts from email_spam_dataset:  63%|██████▎   | 3600/5728 [00:00<00:00, 11221.61emails/s][A
Processing email texts from email_spam_dataset: 100%|██████████| 5728/5728 [00:00<00:00, 11116.75emails/s][A
Processing email texts from email_classification_github: 100%|██████████| 1189/1189 [00:00<00:00, 32763.26emails/s]
Processing email texts from email_spam_assassin_dataset: 100%|██████████| 13239/13239 [00:01<00:00, 7818.74emails/s]
Processing email texts from gmail1: 100%|██████████| 31584/31584 [00:04<00:00, 7014.77emails/s] 
Processing email texts from gmail2: 100%|██████████| 2806/2806 [00:00<00:00, 4245.66emails/s]
Processing email texts from gmail3: 10

In [11]:
# Display the accumulator to check which languages got a statistics object.
language_models

{'en': <analysis.attribute_statistics.SimpleLanguageStatistics at 0x7f8d98132680>,
 'pl': <analysis.attribute_statistics.SimpleLanguageStatistics at 0x7f8d48b36080>}

In [17]:
# Save every per-language statistics object to
# ../data/simple_language_models/<lang>_lang_model.pkl.
output_dir = '../data/simple_language_models'

# Create the target directory up front so a missing directory cannot fail the
# save at the very end of the long-running pipeline.
# NOTE(review): save_to_file may already create parent directories — harmless
# either way because exist_ok=True makes this a no-op when the directory exists.
os.makedirs(output_dir, exist_ok=True)

for lang in language_models:
    language_models[lang].save_to_file(f'{output_dir}/{lang}_lang_model.pkl')