In [1]:
from tqdm import tqdm
from dao.email import AVAILABLE_EMAIL_DAOS
from analysis.attribute_statistics import SimpleLanguageStatistics
from analysis.nlp_transformations import lemmatize_text
from config import init_spacy_english_nlp_model, init_spacy_polish_nlp_model

In [2]:
# Run both spaCy model initialisers (English first, then Polish) before any
# lemmatization cell is executed.
# NOTE(review): presumably these load the models into shared state used by
# lemmatize_text — confirm in config.
for init_model in (init_spacy_english_nlp_model, init_spacy_polish_nlp_model):
    init_model()

In [6]:
# One-off data repair for the 'spam_assassin' collection: every email whose
# body matches the regex "<body>" but is still flagged is_html=False gets its
# flag switched to True.
dao = AVAILABLE_EMAIL_DAOS['spam_assassin']

mislabeled_html_filter = {'body': {'$regex': "<body>"}, 'is_html': False}
mark_as_html = {'$set': {'is_html': True}}

for mislabeled_doc in dao.find_many_by_query(mislabeled_html_filter):
    dao.update_one({'_id': mislabeled_doc.id}, mark_as_html)

In [7]:
# Lemmatize the subject and body of every English/Polish email that does not
# have a 'lemmatized_subject' field yet, and write both lemmatized strings back
# to the document. Because only documents without that field are fetched,
# re-running the cell continues where a previous run stopped. Emails in other
# languages are skipped without being updated, so they are fetched again on
# every run.
for dao_name in AVAILABLE_EMAIL_DAOS:
    dao = AVAILABLE_EMAIL_DAOS[dao_name]
    print(dao.collection_name)
    query = {'lemmatized_subject': {'$exists': False}}
    documents = dao.find_many_by_query(query)
    total_documents = len(documents)
    for doc in tqdm(documents, total=total_documents, desc='Lemmatizing texts', unit='emails', miniters=1):
        # Decide on the language first so unsupported emails cost nothing.
        lang = doc.detected_language
        if lang not in ('pl', 'en'):
            continue  # skip non-english and non-polish emails

        # Prefer the plain-text part. An HTML email without one gets an empty
        # body (the raw markup is not lemmatized); a non-HTML email falls back
        # to its raw body.
        if doc.text_plain:
            body = doc.text_plain
        elif doc.is_html:
            body = ""
        else:
            body = doc.body

        # Treat missing (None) and empty texts alike: there is nothing to
        # lemmatize, and the lemmatizer is never handed a non-string value.
        if body:
            lem_body_str, _ = lemmatize_text(text=body, lang_code=lang)
        else:
            lem_body_str = ""

        subject = doc.subject
        if subject:
            lem_subject_str, _ = lemmatize_text(text=subject, lang_code=lang)
        else:
            lem_subject_str = ""

        dao.update_one(
            {'_id': doc.id},
            {'$set': {'lemmatized_subject': lem_subject_str,
                      'lemmatized_body': lem_body_str}},
        )


In [8]:
# Shared state for the statistics cells below:
#   language_models - maps a language code to its SimpleLanguageStatistics
#   chunk_size      - batch size handed to the MongoDB cursor
language_models, chunk_size = dict(), 100

In [10]:
# Build one SimpleLanguageStatistics model per detected language ('en'/'pl')
# from the lemmatized body and subject of every matching email in every
# collection.

# Start from an empty mapping so that re-running this cell does not feed the
# same emails into the already populated models a second time.
language_models = {}

for dao_name in AVAILABLE_EMAIL_DAOS:
    dao = AVAILABLE_EMAIL_DAOS[dao_name]
    query = {'detected_language': {'$in': ['en', 'pl']}}
    # The count is only used to size the progress bar; the loop below is
    # driven by the cursor itself, so a collection that changes between the
    # count and the read cannot make the iteration fail.
    total_documents = dao.collection.count_documents(query)
    progress_bar = tqdm(total=total_documents, desc=f"Processing email texts from {dao.collection_name}", unit="emails",
                        miniters=1)
    # batch_size makes the driver fetch documents in chunks, so no manual
    # chunking is needed on top of plain iteration.
    cursor = dao.collection.find(query).batch_size(chunk_size)
    try:
        for doc in cursor:
            text = doc['lemmatized_body'] + " " + doc['lemmatized_subject']
            lang = doc['detected_language']
            if lang not in language_models:
                language_models[lang] = SimpleLanguageStatistics(lang)
            language_models[lang].add_texts([text])
            progress_bar.update(1)
    finally:
        # Release the server-side cursor and the progress bar even on failure.
        cursor.close()
        progress_bar.close()


In [11]:
# Display the mapping of language code -> SimpleLanguageStatistics built above.
language_models

In [17]:
# Write every language model to its own file, named after the language code.
for lang, lang_model in language_models.items():
    lang_model.save_to_file(f'../data/simple_language_models/{lang}_lang_model.pkl')