In [1]:
from analysis.attribute_retriving import spelling_and_grammar_check
from tqdm import tqdm
from analysis.nlp_transformations import separate_previous_conversation, remove_footers
from typing import List
from dao.email import DAOEmailGenerated, DAORealEmail

from dao.attribute import DAOAttribute
from analysis.nlp_transformations import replace_links_with_text

[nltk_data] Downloading package stopwords to /home/pawel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/pawel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package pl196x to /home/pawel/nltk_data...
[nltk_data]   Package pl196x is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pawel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from config import init_language_tool_pl, init_language_tool_en

# Run both language-tool initialisers, Polish first and English second.
# NOTE(review): presumably these prepare the checkers used by
# spelling_and_grammar_check for "pl" and "en" - confirm in config.
for init_language_tool in (init_language_tool_pl, init_language_tool_en):
    init_language_tool()

In [3]:
# Polish sample text used as a smoke test for the "pl" checker. It is written
# as adjacent string literals (one per sentence group) that concatenate to a
# single string; the trailing space inside each piece separates the sentences.
text_pl = (
    "Oto kilkanaście zdań w języku polskim które zawierają różnorodne formy interpunkcyjne i struktury zdaniowe. "
    "Jak się dzisiaj czujesz? "
    "Dlaczego niebo jest niebieskie? "
    "Czy wiesz, że Ks. Jan Twardowski był znanym polskim poetą? "
    "Uwaga! Proszę nie biec po korytarzu. "
    "Gdzie jest najbliższa apteka? "
    "Ile to kosztuje? "
    "To niesamowite, że możemy rozmawiać online! "
    "Co sądzisz o najnowszych odkryciach w kosmosie? "
    "Pamiętaj, że drzwi otwierają się o godz. 8:00 rano! "
    "Czy możesz mi podać sól? "
    "Kto zostanie nowym prezydentem Stanów Zjednoczonych? "
    "Wow! To było naprawdę ekscytujące. "
    "Jak możemy efektywniej oszczędzać energię? "
    "Czy znasz jakieś dobre przepisy na ciasto? "
    "Dr M. Nowak będzie dziś wygłaszać wykład na uniwersytecie. "
    "Czy myślisz, że sztuczna inteligencja może przewyższyć ludzką kreatywność?"
)
# Last expression of the cell: the notebook displays the returned
# (errors-by-category, error-count) pair.
spelling_and_grammar_check(text_pl, "pl")

({'PUNCTUATION': 1}, 1)

In [4]:
# English sample text that is deliberately full of mistakes, used as a smoke
# test for the "en" checker. The adjacent literals concatenate to one string;
# the double space after 'an' in the last piece is part of the sample.
text = (
    "This is an text that contain several errors. "
    "Firstly, the use of incorrect articles and verb forms is common. "
    "There also mistakes in tense consistency and punctuation! "
    "Secondly, their could be better word choices. "
    "Furthermore, few sentences starts with conjunctions, which isn't always suitable. "
    "Lastly, is the use of 'an'  before words starting with consonant sounds."
)
# Last expression of the cell: the notebook displays the returned
# (errors-by-category, error-count) pair.
spelling_and_grammar_check(text, "en")

({'MISC': 1,
  'CONFUSED_WORDS': 1,
  'GRAMMAR': 1,
  'PUNCTUATION': 1,
  'TYPOGRAPHY': 1},
 5)

In [5]:
from analysis.attribute_retriving import measure_text_features
from models.attribute import AttributeInDB

# Keys copied one-to-one from the measure_text_features() result into each
# attribute document, in the order they are written to the '$set' payload.
SIMPLE_TEXT_FEATURE_KEYS = (
    'double_spaces',
    'no_space_after_punctuation',
    'emojis',
    'question_marks',
    'exclamation_marks',
    'double_question_marks',
    'double_exclamation_marks',
)

dao_generated_emails: DAOEmailGenerated = DAOEmailGenerated()
dao_real_emails: DAORealEmail = DAORealEmail()
dao_attribute: DAOAttribute = DAOAttribute()
# Not used in this cell; kept because cells outside this view may rely on them.
english_batch = []
polish_batch = []

all_attributes: List[AttributeInDB] = dao_attribute.find_many_by_query({})
# Ids of attributes whose referenced email could not be loaded; reported after the loop.
skipped_attribute_ids = []

# For every attribute document: load the email it references, clean its plain
# text, run the spelling/grammar check plus the simple text measurements, and
# write the results back onto the same attribute document.
for attribute in tqdm(all_attributes, total=len(all_attributes), desc='Calculating spelling and grammar attributes', unit='emails', miniters=1):
    # Generated emails live in one fixed collection; for real emails the
    # collection name is stored on the attribute itself.
    is_generated = attribute.referenced_db_name == 'email_generated_dataset'
    if is_generated:
        og_email = dao_generated_emails.find_one_by_query({'_id': attribute.referenced_doc_id})
    else:
        og_email = dao_real_emails.find_one(attribute.referenced_db_name, {'_id': attribute.referenced_doc_id})

    # Skip dangling references instead of aborting a multi-hour run part-way.
    # NOTE(review): assumes the DAO lookups return None when nothing matches - confirm.
    if og_email is None:
        skipped_attribute_ids.append(attribute.id)
        continue

    # The two email models expose the language under different field names.
    language = og_email.language if is_generated else og_email.detected_language

    # Cleaning pipeline: keep the second element returned by
    # separate_previous_conversation, strip footers, replace links with "[link]".
    _, og_text = separate_previous_conversation(og_email.text_plain)
    og_text = remove_footers(og_text)
    og_text = replace_links_with_text(og_text, "[link]")

    text_errors_by_category, number_of_errors = spelling_and_grammar_check(og_text, language)
    simple_text_features = measure_text_features(og_text)

    updated_fields = {
        'text_errors_by_category': text_errors_by_category,
        'number_of_errors': number_of_errors,
    }
    # A missing key still raises KeyError, exactly as the explicit lookups did.
    updated_fields.update({key: simple_text_features[key] for key in SIMPLE_TEXT_FEATURE_KEYS})
    dao_attribute.update_one({'_id': attribute.id}, {'$set': updated_fields})

if skipped_attribute_ids:
    print(f'Skipped {len(skipped_attribute_ids)} attributes whose referenced email was not found; first ids: {skipped_attribute_ids[:10]}')


Calculating spelling and grammar attributes: 100%|██████████| 32724/32724 [1:43:27<00:00,  5.27emails/s]  
