In [1]:
from tqdm import tqdm
from dao.email import AVAILABLE_EMAIL_DAOS
from analysis.attribute_statistics import SimpleLanguageStatistics
from analysis.nlp_transformations import lemmatize_text
from config import init_spacy_english_nlp_model, init_spacy_polish_nlp_model

In [2]:
# Run the English and Polish spaCy initialisers from `config`, in that order.
# NOTE(review): presumably required before `lemmatize_text` is used below - confirm.
for init_spacy_model in (init_spacy_english_nlp_model, init_spacy_polish_nlp_model):
    init_spacy_model()

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
# Lemmatize the subject and body of every Polish/English e-mail in each DAO's
# collection and write the results to the `lemmatized_subject` and
# `lemmatized_body` fields of the same document.
LEMMATIZED_LANGUAGES = ('pl', 'en')  # every other detected language is skipped

for dao in AVAILABLE_EMAIL_DAOS.values():
    print(dao.collection_name)
    query = {}  # empty filter - presumably matches every document; verify against the DAO
    documents = dao.find_many_by_query(query)
    for doc in tqdm(documents, total=len(documents), desc='Lemmatizing texts', unit='emails', miniters=1):
        # Filter on language first so no work is done for mails that are skipped.
        lang = doc.detected_language
        if lang not in LEMMATIZED_LANGUAGES:
            continue  # skip non-english and non-polish emails

        # Prefer the plain-text version. When it is missing/empty, HTML mails get
        # an empty body (their raw `body` is never used here), while non-HTML
        # mails fall back to the raw `body`.
        body = doc.text_plain or ("" if doc.is_html else doc.body)

        # Only the first element of the returned pair is stored.
        lem_body_str, _ = lemmatize_text(text=body, lang_code=lang)
        lem_subject_str, _ = lemmatize_text(text=doc.subject, lang_code=lang)
        dao.update_one(
            {'_id': doc.id},
            {'$set': {'lemmatized_subject': lem_subject_str, 'lemmatized_body': lem_body_str}},
        )


email_spam_dataset


Lemmatizing texts:  66%|██████▌   | 3759/5728 [27:31<09:26,  3.48emails/s]  

In [None]:
# Shared state for the statistics cell that follows.
# Maps a detected language code to its SimpleLanguageStatistics instance; entries
# are created on first use by the aggregation loop.
# NOTE(review): re-run this cell before re-running that loop, otherwise texts are
# presumably added to the existing models a second time - confirm.
language_models = dict()
# Batch size requested from the database cursor in the aggregation loop.
chunk_size = 100

In [None]:
# Feed the text of every stored e-mail into a per-language
# SimpleLanguageStatistics model kept in `language_models`, creating the model
# the first time a language is encountered.
for dao in AVAILABLE_EMAIL_DAOS.values():
    # The count is only used as the progress-bar total; the loop itself runs until
    # the cursor is exhausted, so a collection that grows or shrinks between the
    # count and the read can neither raise nor silently truncate the pass.
    total_documents = dao.collection.count_documents({})
    cursor = dao.collection.find({}).batch_size(chunk_size)
    try:
        # The context manager closes the bar even when a document raises.
        with tqdm(cursor, total=total_documents, desc=f"Processing email texts from {dao.collection_name}",
                  unit="emails", miniters=1) as progress_bar:
            for doc in progress_bar:
                # Documents without a detected language are counted as English.
                detected_language = doc.get('detected_language', 'en')

                if detected_language not in language_models:
                    language_models[detected_language] = SimpleLanguageStatistics(detected_language)

                # Same text selection as the lemmatization cell: a missing, null or
                # empty `text_plain` counts as absent. HTML mails without usable
                # plain text are skipped; other mails fall back to the raw body.
                text = doc.get('text_plain')
                if not text:
                    if doc['is_html']:
                        continue
                    text = doc['body']
                language_models[detected_language].add_texts([text])
    finally:
        cursor.close()
