In [1]:
from typing import List
from dao.email import DAOEmailGenerated, DAORealEmail

from dao.attribute import DAOAttribute

from ml.model_training import evaluate_models, k_fold_cross_validation
from ml.data_preparation import convert_db_attributes_to_input_data

[nltk_data] Downloading package stopwords to /home/pawel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/pawel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package pl196x to /home/pawel/nltk_data...
[nltk_data]   Package pl196x is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pawel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from models.attribute import AttributeInDB

# Data-access objects, created in the same order as before: generated e-mails,
# real e-mails, then attributes. Only `dao_attribute` is queried in this cell.
dao_generated_emails: DAOEmailGenerated = DAOEmailGenerated()
dao_real_emails: DAORealEmail = DAORealEmail()
dao_attribute: DAOAttribute = DAOAttribute()


def _attributes_matching(**criteria) -> List[AttributeInDB]:
    """Query `dao_attribute` with the given field/value pairs.

    The keyword arguments are forwarded unchanged as a plain dict to
    `dao_attribute.find_many_by_query`; calling this with no arguments sends
    the empty query `{}`.
    """
    return dao_attribute.find_many_by_query(criteria)


# Empty query (presumably every stored attribute record — confirm against the
# DAO), followed by the split on the `is_generated` flag.
all_attributes: List[AttributeInDB] = _attributes_matching()
all_generated: List[AttributeInDB] = _attributes_matching(is_generated=True)
all_real: List[AttributeInDB] = _attributes_matching(is_generated=False)

# Records flagged `is_personal`, regardless of language.
generated_personal: List[AttributeInDB] = _attributes_matching(is_generated=True, is_personal=True)
real_personal: List[AttributeInDB] = _attributes_matching(is_generated=False, is_personal=True)

# Discard any None entries from the two personal lists (the other lists are
# left exactly as the DAO returned them).
generated_personal = [record for record in generated_personal if record is not None]
real_personal = [record for record in real_personal if record is not None]

# Per-language splits: Polish ('pl') ...
pl_generated: List[AttributeInDB] = _attributes_matching(is_generated=True, language='pl')
pl_real: List[AttributeInDB] = _attributes_matching(is_generated=False, language='pl')

# ... and English ('en').
en_generated: List[AttributeInDB] = _attributes_matching(is_generated=True, language='en')
en_real: List[AttributeInDB] = _attributes_matching(is_generated=False, language='en')

# Per-language splits restricted to records flagged `is_personal`.
pl_generated_personal: List[AttributeInDB] = _attributes_matching(is_generated=True, language='pl', is_personal=True)
pl_real_personal: List[AttributeInDB] = _attributes_matching(is_generated=False, language='pl', is_personal=True)

en_generated_personal: List[AttributeInDB] = _attributes_matching(is_generated=True, language='en', is_personal=True)
en_real_personal: List[AttributeInDB] = _attributes_matching(is_generated=False, language='en', is_personal=True)

In [19]:
# Experiment inputs: the English records flagged as personal, generated vs. real.
generated, real = en_generated_personal, en_real_personal

# Feature count handed to the conversion helper (exact semantics are defined in
# ml.data_preparation — presumably how many features to keep; confirm there).
num_of_features = 245

# Turn both record lists into the input structure used by the model helpers.
data = convert_db_attributes_to_input_data(generated, real, num_of_features)

In [14]:
from ml.data_preparation import find_significant_features

# Pair every record's flattened, normalized attribute dict with a class label:
# 1 for generated records, 0 for real ones (generated records come first).
data_temp = [
    (attribute.to_flat_dict_normalized(), label)
    for attributes, label in ((generated, 1), (real, 0))
    for attribute in attributes
]

# Replace missing (None) values with 0, in place. Only existing keys are
# reassigned, so the dict size never changes while it is being iterated.
for feature_dict, _label in data_temp:
    for key, feature_value in feature_dict.items():
        if feature_value is None:
            feature_dict[key] = 0

significant_features = find_significant_features(data_temp)
print(significant_features)

['number_of_errors', 'no_space_after_punctuation', 'variance_word_char_length', 'standard_deviation_word_char_length', 'variance_sentence_char_length', 'standard_deviation_sentence_char_length', 'standard_deviation_sentence_word_length', 'variance_sentence_word_length', 'double_spaces', 'text_errors_by_category.TYPOS', 'punctuation_per_sentence', 'average_word_char_length', 'punctuation_density', 'perplexity', 'average_sentence_char_length', 'burstiness2', 'average_sentence_word_length', 'text_errors_by_category.TYPOGRAPHY', 'burstiness', 'number_of_characters', 'number_of_sentences', 'text_errors_by_category.PUNCTUATION', 'text_errors_by_category.CASING', 'number_of_words', 'question_marks', 'exclamation_marks', 'text_errors_by_category.GRAMMAR', 'text_errors_by_category.MISC', 'text_errors_by_category.REPETITIONS_STYLE', 'text_errors_by_category.REDUNDANCY', 'stylometrix_metrics.syntactic.SY_SUBORD_SENT', 'text_errors_by_category.CONFUSED_WORDS', 'text_errors_by_category.AMERICAN_ENG

In [9]:
# Sample counts, one per line: generated, real, and their total.
print(len(generated), len(real), len(generated) + len(real), sep="\n")
# Show the first element of the first prepared sample as the cell output.
data[0][0]

10618
9885
20503


{'standard_deviation_sentence_char_length': 53.85783756025363,
 'variance_sentence_char_length': 2900.6666666666665}

In [5]:
# Run the project's model-comparison helper (ml.model_training.evaluate_models)
# on the prepared data and keep the result in `model_valuation`.
model_valuation = evaluate_models(data)
# Printed (not rich-displayed), so the whole result is emitted as plain text.
print(model_valuation)



{'Decision Tree': {'accuracy': 0.9735002438627866, 'precision': 0.9825918762088974, 'recall': 0.9654735508394044, 'f1_score': 0.9739575011982745, 'roc_auc': 0.9737187393475579, 'TP': 3048, 'TN': 2940, 'FP': 54, 'FN': 109}, 'Random Forest': {'accuracy': 0.9826044545602342, 'precision': 0.9894094993581515, 'recall': 0.9765600253405131, 'f1_score': 0.9829427706041767, 'roc_auc': 0.9974628805705594, 'TP': 3083, 'TN': 2961, 'FP': 33, 'FN': 74}, 'MLP Classifier': {'accuracy': 0.9442367094781337, 'precision': 0.9461001902346227, 'recall': 0.9452011403230915, 'f1_score': 0.9456504515924576, 'roc_auc': 0.9843129401025681, 'TP': 2984, 'TN': 2824, 'FP': 170, 'FN': 173}, 'K-Nearest Neighbors': {'accuracy': 0.8253942448382376, 'precision': 0.8119197364480384, 'recall': 0.8587266392144441, 'f1_score': 0.8346674876847291, 'roc_auc': 0.9099068160605871, 'TP': 2711, 'TN': 2366, 'FP': 628, 'FN': 446}, 'Support Vector Classifier': {'accuracy': 0.666233132823931, 'precision': 0.6069767441860465, 'recall':

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [21]:


# Classifiers to cross-validate. Only Random Forest is enabled; uncomment an
# entry to add another row to the printed table.
#
# The forest's random_state is pinned so that repeated runs build the same
# trees. NOTE(review): k_fold_cross_validation may still shuffle the folds with
# its own RNG — confirm in ml.model_training if exact reproducibility is needed.
models = {
    # 'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    # 'MLP Classifier': MLPClassifier(),
    # 'K-Nearest Neighbors': KNeighborsClassifier(),
    # 'Support Vector Classifier': SVC(probability=True),
    # 'AdaBoost': AdaBoostClassifier(),
    # 'Gaussian Naive Bayes': GaussianNB(),
    # 'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis()
}

# Keys read from the cross-validation result, in table-column order.
metric_names = ('accuracy', 'precision', 'recall', 'f1_score', 'roc_auc')

for model_name, model in models.items():
    # Third argument is 10 (presumably the number of folds).
    results = k_fold_cross_validation(model, data, 10)
    # One "&"-separated table row: model name, then each metric rounded to
    # four decimal places (same text as the previous hand-written f-string).
    row_cells = [f"{round(results[metric], 4)}" for metric in metric_names]
    print(" & ".join([f"{model_name}"] + row_cells))


Random Forest & 0.9848 & 0.9914 & 0.9792 & 0.9852 & 0.999


In [22]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
# Point the shared `generated` / `real` names at the English personal records
# for the cross-validation cell that follows.
generated, real = en_generated_personal, en_real_personal

In [25]:
# Feature names passed as `exclude_additionally`: the aggregate error/spacing
# counters plus every per-category `text_errors_by_category.*` count.
excluded_error_features = (
    'number_of_errors',
    'no_space_after_punctuation',
    'double_spaces',
    'text_errors_by_category.AMERICAN_ENGLISH_STYLE',
    'text_errors_by_category.BRITISH_ENGLISH',
    'text_errors_by_category.CASING',
    'text_errors_by_category.COLLOCATIONS',
    'text_errors_by_category.COMPOUNDING',
    'text_errors_by_category.CONFUSED_WORDS',
    'text_errors_by_category.GRAMMAR',
    'text_errors_by_category.MISC',
    'text_errors_by_category.MULTITOKEN_SPELLING',
    'text_errors_by_category.NONSTANDARD_PHRASES',
    'text_errors_by_category.NUMBERS',
    'text_errors_by_category.PHONETICS',
    'text_errors_by_category.PRAWDOPODOBNE_LITEROWKI',
    'text_errors_by_category.PUNCTUATION',
    'text_errors_by_category.REDUNDANCY',
    'text_errors_by_category.REPETITIONS_STYLE',
    'text_errors_by_category.SEMANTICS',
    'text_errors_by_category.SPELLING',
    'text_errors_by_category.STYLE',
    'text_errors_by_category.SYNTAX',
    'text_errors_by_category.TYPOGRAPHY',
    'text_errors_by_category.TYPOS',
    'text_errors_by_category.WORD_ORDER',
)

# Keys read from the cross-validation result, in table-column order.
metric_names = ('accuracy', 'precision', 'recall', 'f1_score', 'roc_auc')

# The loop takes a list so that several feature counts can be swept; only 245
# is evaluated here.
for n in [245]:
    # A fresh list is passed on every iteration, as the original call did.
    data = convert_db_attributes_to_input_data(
        generated, real, n, exclude_additionally=list(excluded_error_features)
    )
    # random_state is pinned so repeated runs build the same forest.
    # NOTE(review): k_fold_cross_validation may still shuffle folds with its
    # own RNG — confirm in ml.model_training.
    results = k_fold_cross_validation(RandomForestClassifier(random_state=42), data, 10)
    # One "&"-separated table row: n, then each metric rounded to 4 decimals.
    row_cells = [f"{round(results[metric], 4)}" for metric in metric_names]
    print(" & ".join([f"{n}"] + row_cells))



245 & 0.9759 & 0.9825 & 0.9706 & 0.9765 & 0.9977


In [26]:
# Point the shared `generated` / `real` names at the Polish personal records
# for the cross-validation cell that follows.
generated, real = pl_generated_personal, pl_real_personal

In [28]:
# Feature names passed as `exclude_additionally`: the aggregate error/spacing
# counters plus every per-category `text_errors_by_category.*` count.
excluded_error_features = (
    'number_of_errors',
    'no_space_after_punctuation',
    'double_spaces',
    'text_errors_by_category.AMERICAN_ENGLISH_STYLE',
    'text_errors_by_category.BRITISH_ENGLISH',
    'text_errors_by_category.CASING',
    'text_errors_by_category.COLLOCATIONS',
    'text_errors_by_category.COMPOUNDING',
    'text_errors_by_category.CONFUSED_WORDS',
    'text_errors_by_category.GRAMMAR',
    'text_errors_by_category.MISC',
    'text_errors_by_category.MULTITOKEN_SPELLING',
    'text_errors_by_category.NONSTANDARD_PHRASES',
    'text_errors_by_category.NUMBERS',
    'text_errors_by_category.PHONETICS',
    'text_errors_by_category.PRAWDOPODOBNE_LITEROWKI',
    'text_errors_by_category.PUNCTUATION',
    'text_errors_by_category.REDUNDANCY',
    'text_errors_by_category.REPETITIONS_STYLE',
    'text_errors_by_category.SEMANTICS',
    'text_errors_by_category.SPELLING',
    'text_errors_by_category.STYLE',
    'text_errors_by_category.SYNTAX',
    'text_errors_by_category.TYPOGRAPHY',
    'text_errors_by_category.TYPOS',
    'text_errors_by_category.WORD_ORDER',
)

# Keys read from the cross-validation result, in table-column order.
metric_names = ('accuracy', 'precision', 'recall', 'f1_score', 'roc_auc')

# The loop takes a list so that several feature counts can be swept; only 241
# is evaluated here.
for n in [241]:
    # A fresh list is passed on every iteration, as the original call did.
    data = convert_db_attributes_to_input_data(
        generated, real, n, exclude_additionally=list(excluded_error_features)
    )
    # random_state is pinned so repeated runs build the same forest.
    # NOTE(review): k_fold_cross_validation may still shuffle folds with its
    # own RNG — confirm in ml.model_training.
    results = k_fold_cross_validation(RandomForestClassifier(random_state=42), data, 10)
    # One "&"-separated table row: n, then each metric rounded to 4 decimals.
    row_cells = [f"{round(results[metric], 4)}" for metric in metric_names]
    print(" & ".join([f"{n}"] + row_cells))

241 & 0.914 & 0.8937 & 0.9432 & 0.9168 & 0.9761
