In [8]:
from typing import List
from dao.email import DAOEmailGenerated, DAORealEmail

from dao.attribute import DAOAttributePL, DAOAttributeEN

from ml.model_training import evaluate_models, k_fold_cross_validation
from ml.data_preparation import convert_db_attributes_to_input_data

[nltk_data] Downloading package stopwords to /home/pawel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/pawel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package pl196x to /home/pawel/nltk_data...
[nltk_data]   Package pl196x is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pawel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
from models.attribute import AttributeENInDB, AttributePLInDB

def _attribute_query(language: str, is_generated: bool, personal_only: bool = False) -> dict:
    """Build the filter dict handed to ``find_many_by_query`` for one subset.

    The keys are inserted in the order ``is_generated``, ``language`` and,
    only when ``personal_only`` is set, ``is_personal``.
    """
    query = {"is_generated": is_generated, 'language': language}
    if personal_only:
        query['is_personal'] = True
    return query


# Data-access objects for the e-mail collections and the per-language attributes.
dao_generated_emails: DAOEmailGenerated = DAOEmailGenerated()
dao_real_emails: DAORealEmail = DAORealEmail()

dao_attribute_pl: DAOAttributePL = DAOAttributePL()
dao_attribute_en: DAOAttributeEN = DAOAttributeEN()

# Full Polish subsets: generated vs. real.
pl_generated: List[AttributePLInDB] = dao_attribute_pl.find_many_by_query(_attribute_query('pl', True))
pl_real: List[AttributePLInDB] = dao_attribute_pl.find_many_by_query(_attribute_query('pl', False))

# Full English subsets: generated vs. real.
en_generated: List[AttributeENInDB] = dao_attribute_en.find_many_by_query(_attribute_query('en', True))
en_real: List[AttributeENInDB] = dao_attribute_en.find_many_by_query(_attribute_query('en', False))

# Same splits restricted to records flagged is_personal=True.
pl_generated_personal: List[AttributePLInDB] = dao_attribute_pl.find_many_by_query(
    _attribute_query('pl', True, personal_only=True)
)
pl_real_personal: List[AttributePLInDB] = dao_attribute_pl.find_many_by_query(
    _attribute_query('pl', False, personal_only=True)
)

en_generated_personal: List[AttributeENInDB] = dao_attribute_en.find_many_by_query(
    _attribute_query('en', True, personal_only=True)
)
en_real_personal: List[AttributeENInDB] = dao_attribute_en.find_many_by_query(
    _attribute_query('en', False, personal_only=True)
)

In [3]:
# Work on the English "personal" subset: generated vs. real attribute records.
generated, real = en_generated_personal, en_real_personal
num_of_features = 245

# Convert the DB attribute records into the input structure used by the
# evaluation cells (third argument: presumably the number of features to keep).
data = convert_db_attributes_to_input_data(generated, real, num_of_features)

In [6]:
from ml.data_preparation import find_significant_features

# Pair every flattened, normalized attribute dict with its class label:
# 1 for generated records, 0 for real ones (generated first, then real).
data_temp = [(x.to_flat_dict_normalized(), 1) for x in generated] + [
    (x.to_flat_dict_normalized(), 0) for x in real
]

# Replace missing values (None) with 0, in place, in each feature dict.
for features, _label in data_temp:
    for key in features.keys():
        if features[key] is None:
            features[key] = 0

significant_features = find_significant_features(data_temp)
# Displayed as the cell output.
significant_features

['number_of_errors',
 'standard_deviation_word_char_length',
 'variance_word_char_length',
 'no_space_after_punctuation',
 'standard_deviation_sentence_char_length',
 'variance_sentence_char_length',
 'variance_sentence_word_length',
 'standard_deviation_sentence_word_length',
 'stylometrix_metrics.statistics.ST_SENT_D_NP',
 'double_spaces',
 'punctuation_per_sentence',
 'stylometrix_metrics.statistics.ST_SENT_D_PP',
 'text_errors_by_category.TYPOS',
 'average_word_char_length',
 'stylometrix_metrics.statistics.ST_SENT_WRDSPERSENT',
 'punctuation_density',
 'stylometrix_metrics.syntactic.SY_INVERSE_PATTERNS',
 'perplexity',
 'stylometrix_metrics.statistics.ST_SENT_D_VP',
 'average_sentence_char_length',
 'burstiness2',
 'stylometrix_metrics.statistics.ST_SENT_D_ADVP',
 'stylometrix_metrics.statistics.ST_SENT_DIFFERENCE',
 'stylometrix_metrics.statistics.ST_TYPE_TOKEN_RATIO_LEMMAS',
 'average_sentence_word_length',
 'text_errors_by_category.TYPOGRAPHY',
 'stylometrix_metrics.statistics.

In [9]:
# Class sizes and their total, one number per line.
print(len(generated), len(real), len(generated) + len(real), sep="\n")
# Inspect data[0][0] as the cell output (presumably the first sample's feature dict).
data[0][0]

10618
9885
20503


{'standard_deviation_sentence_char_length': 53.85783756025363,
 'variance_sentence_char_length': 2900.6666666666665}

In [5]:
# Score the model suite on the prepared data; the result maps each model name
# to a dict of metrics.
model_valuation = evaluate_models(data)
# End the cell on the bare name instead of print(): the notebook then
# pretty-prints the nested dict over several lines rather than emitting one
# very long, unreadable line.
model_valuation



{'Decision Tree': {'accuracy': 0.9735002438627866, 'precision': 0.9825918762088974, 'recall': 0.9654735508394044, 'f1_score': 0.9739575011982745, 'roc_auc': 0.9737187393475579, 'TP': 3048, 'TN': 2940, 'FP': 54, 'FN': 109}, 'Random Forest': {'accuracy': 0.9826044545602342, 'precision': 0.9894094993581515, 'recall': 0.9765600253405131, 'f1_score': 0.9829427706041767, 'roc_auc': 0.9974628805705594, 'TP': 3083, 'TN': 2961, 'FP': 33, 'FN': 74}, 'MLP Classifier': {'accuracy': 0.9442367094781337, 'precision': 0.9461001902346227, 'recall': 0.9452011403230915, 'f1_score': 0.9456504515924576, 'roc_auc': 0.9843129401025681, 'TP': 2984, 'TN': 2824, 'FP': 170, 'FN': 173}, 'K-Nearest Neighbors': {'accuracy': 0.8253942448382376, 'precision': 0.8119197364480384, 'recall': 0.8587266392144441, 'f1_score': 0.8346674876847291, 'roc_auc': 0.9099068160605871, 'TP': 2711, 'TN': 2366, 'FP': 628, 'FN': 446}, 'Support Vector Classifier': {'accuracy': 0.666233132823931, 'precision': 0.6069767441860465, 'recall':

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [5]:


# Seed shared by every estimator that accepts one, so re-running this cell
# trains the same models.
# NOTE(review): k_fold_cross_validation may shuffle folds with its own RNG —
# confirm it is seeded as well if fully reproducible numbers are required.
RANDOM_STATE = 42

# Metric keys read from each result dict, in output-column order.
METRIC_KEYS = ('accuracy', 'precision', 'recall', 'f1_score', 'roc_auc')

models = {
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'MLP Classifier': MLPClassifier(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    # Disabled in the original notebook; the reason is not recorded here.
    # 'Support Vector Classifier': SVC(probability=True),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Gaussian Naive Bayes': GaussianNB(),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis()
}

# 10-fold cross-validation per model; one ampersand-separated row per model
# (presumably for pasting into a LaTeX table), metrics rounded to 4 decimals.
for model_name, model in models.items():
    results = k_fold_cross_validation(model, data, 10)
    row = [model_name] + [f"{round(results[key], 4)}" for key in METRIC_KEYS]
    print(" & ".join(row))


Decision Tree & 0.9757 & 0.9843 & 0.9685 & 0.9763 & 0.976
Random Forest & 0.9884 & 0.9931 & 0.9845 & 0.9888 & 0.9995
MLP Classifier & 0.8516 & 0.8617 & 0.8684 & 0.8577 & 0.9336
K-Nearest Neighbors & 0.841 & 0.8319 & 0.8685 & 0.8497 & 0.9311




AdaBoost & 0.9746 & 0.9728 & 0.9784 & 0.9756 & 0.9972
Gaussian Naive Bayes & 0.7199 & 0.6503 & 0.9934 & 0.7859 & 0.8845




Quadratic Discriminant Analysis & 0.6867 & 0.6233 & 0.9984 & 0.7674 & 0.6768




In [16]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
# Switch the working subset to the Polish "personal" records.
generated, real = pl_generated_personal, pl_real_personal

In [10]:
# Every error-derived feature name: three top-level counters followed by one
# 'text_errors_by_category.<CATEGORY>' entry per category, in the listed order.
all_error_features = ['number_of_errors', 'no_space_after_punctuation', 'double_spaces'] + [
    f'text_errors_by_category.{category}'
    for category in (
        'AMERICAN_ENGLISH_STYLE',
        'BRITISH_ENGLISH',
        'CASING',
        'COLLOCATIONS',
        'COMPOUNDING',
        'CONFUSED_WORDS',
        'GRAMMAR',
        'MISC',
        'MULTITOKEN_SPELLING',
        'NONSTANDARD_PHRASES',
        'NUMBERS',
        'PHONETICS',
        'PRAWDOPODOBNE_LITEROWKI',
        'PUNCTUATION',
        'REDUNDANCY',
        'REPETITIONS_STYLE',
        'SEMANTICS',
        'SPELLING',
        'STYLE',
        'SYNTAX',
        'TYPOGRAPHY',
        'TYPOS',
        'WORD_ORDER',
    )
]

In [11]:
# Number of error-derived feature names; shown as the cell's output.
len(all_error_features)

26

In [69]:
# Random Forest, 10-fold cross-validation, requesting 30 features with every
# error-derived feature excluded. The original wrapped this in a one-element
# loop and overwrote the loop variable inside its own body; the requested
# count and the resulting count now have separate names.
requested_features = 30
data = convert_db_attributes_to_input_data(
    generated, real, requested_features, exclude_additionally=all_error_features
)
# Fixed seed so the forest is the same on every re-run.
# NOTE(review): k_fold_cross_validation may shuffle folds with its own RNG — confirm.
results = k_fold_cross_validation(RandomForestClassifier(random_state=42), data, 10)

# Number of entries in data[0][0] (presumably the features actually used).
n = len(data[0][0])
metric_cells = [
    f"{round(results[key], 4)}"
    for key in ('accuracy', 'precision', 'recall', 'f1_score', 'roc_auc')
]
# Ampersand-separated row: feature count followed by the five metrics.
print(" & ".join([f"{n}"] + metric_cells))



30 & 0.9223 & 0.9057 & 0.9449 & 0.9242 & 0.9755


In [12]:
# Use the Polish "personal" records as the working subset.
generated, real = pl_generated_personal, pl_real_personal

In [13]:
# Sweep over the requested number of features and report Random Forest
# 10-fold cross-validation metrics for each setting. The loop variable is kept
# separate from `n`, which the original overwrote inside the loop body.
for requested_features in [1, 2, 4, 6, 8, 10, 14, 20, 30, 100, 244]:
    data = convert_db_attributes_to_input_data(
        generated, real, requested_features, exclude_additionally=[]
    )
    # Fixed seed so the forest is the same on every re-run.
    # NOTE(review): k_fold_cross_validation may shuffle folds with its own RNG — confirm.
    results = k_fold_cross_validation(RandomForestClassifier(random_state=42), data, 10)

    # Number of entries in data[0][0] (presumably the features actually used).
    n = len(data[0][0])
    metric_cells = [
        f"{round(results[key], 4)}"
        for key in ('accuracy', 'precision', 'recall', 'f1_score', 'roc_auc')
    ]
    # Ampersand-separated row: feature count followed by the five metrics.
    print(" & ".join([f"{n}"] + metric_cells))

10 & 0.9243 & 0.9245 & 0.9268 & 0.925 & 0.9769
244 & 0.944 & 0.9391 & 0.9543 & 0.9459 & 0.9865


In [12]:
# Error-derived feature names grouped by kind of mistake. The groups are
# concatenated later to build exclusion lists for the ablation runs.

# Group 1: spelling and typo features.
spelling_related_mistakes = [
    'text_errors_by_category.MULTITOKEN_SPELLING',  # Mistakes involving the incorrect spelling of multi-word phrases.
    'text_errors_by_category.PRAWDOPODOBNE_LITEROWKI',  # Probable typos or misspellings (term in Polish).
    'text_errors_by_category.SPELLING',  # General spelling mistakes.
    'text_errors_by_category.TYPOS'  # Typographical errors, often simple and easy to spot.
]

# Group 2: editing, style and formatting features, including the two
# whitespace counters that are not per-category entries.
editing_related_mistakes = [
    'no_space_after_punctuation',  # Missing spaces after punctuation marks like periods, commas, or colons.
    'double_spaces',  # Unintended multiple spaces between words or sentences.
    'text_errors_by_category.AMERICAN_ENGLISH_STYLE',  # Inconsistencies or errors related to American English conventions (e.g., spelling, punctuation, and usage).
    'text_errors_by_category.BRITISH_ENGLISH',  # Inconsistencies or errors related to British English conventions.
    'text_errors_by_category.CASING',  # Incorrect use of uppercase and lowercase letters.
    'text_errors_by_category.COLLOCATIONS',  # Incorrect or awkward combinations of words that are typically used together.
    'text_errors_by_category.COMPOUNDING',  # Errors in forming compound words (e.g., "ice cream" vs. "ice-cream").
    'text_errors_by_category.CONFUSED_WORDS',  # Misuse of words that sound similar or have similar spellings but different meanings (e.g., "affect" vs. "effect").
    'text_errors_by_category.NONSTANDARD_PHRASES',  # Use of phrases that are not standard or widely accepted.
    'text_errors_by_category.PUNCTUATION',  # Incorrect or missing punctuation marks.
    'text_errors_by_category.REDUNDANCY',  # Unnecessary repetition of words or information.
    'text_errors_by_category.REPETITIONS_STYLE',  # Unintentional repetition of words or phrases that affect the style and flow of the text.
    'text_errors_by_category.STYLE',  # Issues related to the overall writing style, including tone, clarity, and consistency.
    # NOTE(review): the features are computed from text, so "font, size" below
    # looks doubtful — confirm against the checker's category definition.
    'text_errors_by_category.TYPOGRAPHY',  # Errors related to the visual appearance of text, including font, size, spacing, and alignment.
    'text_errors_by_category.WORD_ORDER',  # Incorrect or awkward arrangement of words in a sentence.
    'text_errors_by_category.NUMBERS',  # Errors related to the use of numbers, such as formatting or incorrect numerical expressions.
    'text_errors_by_category.PHONETICS'  # Errors related to the phonetic aspects of words, such as incorrect pronunciation guides.
]

# Group 3: grammar and syntax features.
grammar_mistakes = [
    'text_errors_by_category.GRAMMAR',  # Mistakes related to the rules of grammar, such as subject-verb agreement, tense usage, and sentence structure.
    'text_errors_by_category.SYNTAX'  # Errors in the arrangement of words and phrases to create well-formed sentences.
]

# Group 4: everything not assigned to a group above. It is added to each of
# the other groups when the exclusion lists are built.
other_mistakes = [
    'number_of_errors',  # A general count of errors, not specifying the type.
    'text_errors_by_category.MISC',  # Miscellaneous errors that don't fit into other categories.
    # NOTE(review): presumably meaning-level (semantic) errors; the original
    # left this entry undocumented — confirm the category definition.
    'text_errors_by_category.SEMANTICS'
]



In [21]:
# Report how many feature names each mistake group contains.
for group_label, group in (
    ("Spelling related mistakes", spelling_related_mistakes),
    ("Editing related mistakes", editing_related_mistakes),
    ("Grammar mistakes", grammar_mistakes),
    ("Other mistakes", other_mistakes),
):
    print(f"{group_label}: {len(group)}")


Spelling related mistakes: 4
Editing related mistakes: 17
Grammar mistakes: 2
Other mistakes: 3


In [14]:
# Switch the working subset to the English "personal" records.
generated, real = en_generated_personal, en_real_personal

In [17]:
# Ablation study on the current `generated` / `real` subset: for each run a
# different group of error-derived features is excluded, then a Random Forest
# is scored with 10-fold cross-validation.
# The original repeated the same five statements in four copy-pasted,
# one-element loops; the runs are now described as data and share one body.
# Printed labels and row format are unchanged.

# (printed label, feature names to exclude), in reporting order. Every group
# except the first is combined with `other_mistakes`.
ablation_runs = [
    ("No mistakes", all_error_features),
    ("No spelling mistakes", spelling_related_mistakes + other_mistakes),
    ("No editing mistakes", editing_related_mistakes + other_mistakes),
    ("No grammar mistakes", grammar_mistakes + other_mistakes),
]

for run_label, to_exclude in ablation_runs:
    # 245 features are requested before the exclusions are applied.
    data = convert_db_attributes_to_input_data(generated, real, 245, exclude_additionally=to_exclude)
    # Number of entries in data[0][0] (presumably the features actually used).
    n = len(data[0][0])
    # Fixed seed so the forest is the same on every re-run.
    # NOTE(review): k_fold_cross_validation may shuffle folds with its own RNG — confirm.
    results = k_fold_cross_validation(RandomForestClassifier(random_state=42), data, 10)
    metric_cells = [
        f"{round(results[key], 4)}"
        for key in ('accuracy', 'precision', 'recall', 'f1_score', 'roc_auc')
    ]
    print(run_label)
    # Ampersand-separated row: feature count followed by the five metrics.
    print(" & ".join([f"{n}"] + metric_cells))

No mistakes
218 & 0.984 & 0.9881 & 0.981 & 0.9845 & 0.9987
No spelling mistakes
237 & 0.9861 & 0.9905 & 0.9827 & 0.9866 & 0.9992
No editing mistakes
224 & 0.9859 & 0.9901 & 0.9825 & 0.9863 & 0.9991
No grammar mistakes
239 & 0.9878 & 0.9916 & 0.9847 & 0.9881 & 0.9994


In [19]:
# Switch the working subset back to the Polish "personal" records.
generated, real = pl_generated_personal, pl_real_personal

In [20]:
# Ablation study on the current `generated` / `real` subset: for each run a
# different group of error-derived features is excluded, then a Random Forest
# is scored with 10-fold cross-validation.
# The original repeated the same five statements in four copy-pasted,
# one-element loops; the runs are now described as data and share one body.
# Printed labels and row format are unchanged.

# (printed label, feature names to exclude), in reporting order. Every group
# except the first is combined with `other_mistakes`.
ablation_runs = [
    ("No mistakes", all_error_features),
    ("No spelling mistakes", spelling_related_mistakes + other_mistakes),
    ("No editing mistakes", editing_related_mistakes + other_mistakes),
    ("No grammar mistakes", grammar_mistakes + other_mistakes),
]

for run_label, to_exclude in ablation_runs:
    # 245 features are requested before the exclusions are applied.
    data = convert_db_attributes_to_input_data(generated, real, 245, exclude_additionally=to_exclude)
    # Number of entries in data[0][0] (presumably the features actually used).
    n = len(data[0][0])
    # Fixed seed so the forest is the same on every re-run.
    # NOTE(review): k_fold_cross_validation may shuffle folds with its own RNG — confirm.
    results = k_fold_cross_validation(RandomForestClassifier(random_state=42), data, 10)
    metric_cells = [
        f"{round(results[key], 4)}"
        for key in ('accuracy', 'precision', 'recall', 'f1_score', 'roc_auc')
    ]
    print(run_label)
    # Ampersand-separated row: feature count followed by the five metrics.
    print(" & ".join([f"{n}"] + metric_cells))

No mistakes
194 & 0.9202 & 0.8935 & 0.9575 & 0.9235 & 0.9747
No spelling mistakes
213 & 0.9389 & 0.9278 & 0.956 & 0.9413 & 0.9857
No editing mistakes
200 & 0.9192 & 0.8942 & 0.9549 & 0.9227 & 0.9773
No grammar mistakes
215 & 0.941 & 0.9355 & 0.9524 & 0.9432 & 0.9876
