# Le but de ce notebook est d'identifier des caractéristiques permettant de différencier des textes écrits par Molière et Corneille

In [None]:
import functools
from typing import List,Set,Tuple,Dict
import os
import pathlib
import re
import os
import random
import math
import pandas as pd

#!pip install unidecode
#!pip install wordcloud
#!pip install pillow

# le répertoire de travail
directory = os.path.abspath('')
random.seed(42)

# Return every *.txt file located directly inside the directory 'path'
def all_txt_files_in_directory(path: str) -> List[str]:
    """List the regular files of 'path' whose name ends with '.txt'.

    Args:
        path: directory to scan (not recursive).

    Returns:
        The matching file paths (each one joined with 'path'), sorted
        alphabetically. os.listdir gives no ordering guarantee, and the
        train/validation split depends on the order in which books are
        loaded, so sorting keeps runs reproducible across machines.
    """
    return sorted(
        os.path.join(path, name)
        for name in os.listdir(path)
        if name.endswith('.txt') and os.path.isfile(os.path.join(path, name))
    )

# Tell whether 'line' is a usable line of text or must be ignored (e.g. when it is empty)
def is_valid_line(line: str) -> bool:
    """Return True when 'line' should be kept.

    A line is rejected when it starts with 'Scène ' or 'Acte ', or when it
    holds fewer than 10 characters (any trailing newline is counted).
    """
    is_heading = line.startswith(('Scène ', 'Acte '))
    return not is_heading and len(line) >= 10

def split_book_into_paragraphs(path: str, min_paragraph_length: int) -> List[str]:
    """Read a book and group its valid lines into paragraphs.

    Valid lines (see is_valid_line) are right-stripped and joined with a
    newline until the accumulated text reaches 'min_paragraph_length'
    characters; the paragraph is then emitted and a new one is started.

    Args:
        path: path of the text file to read.
        min_paragraph_length: minimal number of characters of a paragraph.

    Returns:
        The list of paragraphs. A trailing chunk shorter than
        'min_paragraph_length' is dropped, unless it would be the only
        paragraph of the book.
    """
    result = []
    current_paragraph = ""
    # The encoding is explicit: the texts contain accented characters and the
    # default encoding of open() depends on the platform locale.
    with open(path, encoding="utf-8") as file:
        for line in file:
            if not is_valid_line(line):
                continue
            if current_paragraph:
                current_paragraph += "\n"
            current_paragraph += line.rstrip()
            if len(current_paragraph) >= min_paragraph_length:
                result.append(current_paragraph)
                current_paragraph = ""
    # Never emit an empty paragraph (it could happen with min_paragraph_length <= 0)
    if current_paragraph and (len(current_paragraph) >= min_paragraph_length or not result):
        result.append(current_paragraph)
    return result

def load_all_books(path: str, min_paragraph_length:int) -> Dict[str,List[str]]:
    """Load every *.txt book of the directory 'path'.

    Returns a dictionary mapping the file name without its extension to the
    paragraphs produced by split_book_into_paragraphs for that file.
    """
    return {
        pathlib.Path(book_path).stem: split_book_into_paragraphs(book_path, min_paragraph_length)
        for book_path in all_txt_files_in_directory(path)
    }


# used to strip accents
@functools.lru_cache(maxsize=None)
def remove_diacritics(word: str) -> str:
    """Return 'word' converted by unidecode.unidecode (third-party package).

    The import is done lazily inside the function; results are memoised
    without bound by lru_cache.
    """
    from unidecode import unidecode as transliterate
    return transliterate(word)

# lower-cased words
@functools.lru_cache(maxsize=None)
def to_lowercase(word: str) -> str:
    """Return the lower-cased version of 'word' (memoised by lru_cache)."""
    lowered = str.lower(word)
    return lowered

# Stemming: a single FrenchStemmer instance (from the nltk package) is created once and shared by to_stemming
from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()
@functools.lru_cache(maxsize=None)
def to_stemming(word: str) -> str:
    """Return the stem of 'word' as computed by the module-level 'stemmer'.

    Results are memoised without bound by lru_cache, so each distinct word
    is only stemmed once.
    """
    return stemmer.stem(word)

@functools.lru_cache(maxsize=None)
def normalize_word(word: str, use_lowercase: bool, use_diacritics: bool, use_stemming: bool) -> str:
    """Apply the enabled normalisation steps to 'word'.

    The steps run in a fixed order: lower-casing, then diacritics removal
    (when 'use_diacritics' is true), then stemming. Disabled steps are
    skipped. The result is memoised per (word, flags) combination.
    """
    pipeline = (
        (use_lowercase, to_lowercase),
        (use_diacritics, remove_diacritics),
        (use_stemming, to_stemming),
    )
    for enabled, transform in pipeline:
        if enabled:
            word = transform(word)
    return word

def paragraph_count(book_to_paragraphs: Dict[str, List[str]]) -> int:
    """Return the total number of paragraphs over all books (0 for an empty or None mapping)."""
    return sum(len(paragraphs) for paragraphs in book_to_paragraphs.values()) if book_to_paragraphs else 0

def split_text(text:str) -> List[str]:
    """Split 'text' into tokens.

    A token is a run of word characters, apostrophes or carets that starts
    and ends on a word boundary. Trailing whitespace of 'text' is removed
    before matching.
    """
    token_pattern = re.compile(r"\b[\w'^\d]+\b")
    return token_pattern.findall(text.rstrip())
            
def word_count(book_to_paragraphs: Dict[str, List[str]]) -> int:
    """Return the total number of tokens (as produced by split_text) over all paragraphs of all books."""
    return sum(
        len(split_text(paragraph.rstrip()))
        for paragraph in all_paragraphs(book_to_paragraphs)
    )

def all_paragraphs(book_to_paragraphs: Dict[str, List[str]]) -> List[str]:
    """Return a new flat list holding the paragraphs of every book, in dictionary order."""
    return [
        paragraph
        for paragraphs in book_to_paragraphs.values()
        for paragraph in paragraphs
    ]

# reduce the dataset so that it contains exactly 'target_count' paragraphs
def reduce_to_paragraph_count(book_to_paragraphs: dict, target_count: int ) -> Dict[str, List[str]]:
    """Return a reduced copy of the dataset holding 'target_count' paragraphs.

    Books are visited from the smallest to the largest (by paragraph count).
    Whole books are dropped while they fit in the number of paragraphs still
    to remove; the next book is truncated at its end, and the remaining
    books are kept untouched. The input mapping is not modified.

    Args:
        book_to_paragraphs: mapping book name -> list of paragraphs.
        target_count: number of paragraphs wanted in the result.

    Returns:
        A new dictionary (book name -> paragraphs), ordered by increasing
        book size.

    Raises:
        ValueError: if the dataset holds fewer than 'target_count' paragraphs.
    """
    current_count = paragraph_count(book_to_paragraphs)
    if current_count < target_count:
        raise ValueError(f'current_count {current_count} < target_count {target_count}')
    to_remove = current_count - target_count
    result = dict()
    for book, paragraphs in sorted(book_to_paragraphs.items(), key=lambda x: len(x[1])):
        if len(paragraphs) <= to_remove:
            # the whole book is dropped
            to_remove -= len(paragraphs)
            continue
        # keep the beginning of the book, drop its last 'to_remove' paragraphs
        result[book] = paragraphs[:len(paragraphs) - to_remove]
        to_remove = 0
    return result

def compute_most_common_normalized_words(normalized_words_to_stats: dict, count: int) -> Dict[str,float]:
    """Keep the 'count' most frequent normalized words and return their relative frequency.

    Each value of 'normalized_words_to_stats' is indexed at [0] to get the
    total number of occurrences of the word. The returned frequencies are
    relative to the kept words only (they sum to 1 when at least one word is kept).
    """
    ranked = sorted(normalized_words_to_stats.items(), key=lambda item:item[1][0], reverse=True)
    kept = ranked[:count]
    occurrences_of_kept_words = sum(stats[0] for _, stats in kept)
    return {word: stats[0] / occurrences_of_kept_words for word, stats in kept}

def compute_normalized_words_to_stats(paragraphs: List[str], hyperparameters: dict) -> Dict[str, Tuple[int, Dict[str, int]]]:
    """Count the occurrences of every normalized word found in 'paragraphs'.

    Args:
        paragraphs: texts to analyse.
        hyperparameters: must hold the keys 'use_lowercase', 'use_diacritics'
            and 'use_stemming', forwarded to normalize_word.

    Returns:
        A dictionary mapping each normalized word to a tuple
        (total occurrences, {original spelling: occurrences of that spelling}).
    """
    use_lowercase = hyperparameters['use_lowercase']
    use_diacritics = hyperparameters['use_diacritics']
    use_stemming = hyperparameters['use_stemming']
    normalized_words_to_stats = dict()
    for paragraph in paragraphs:
        # split_text is the shared tokenizer: scoring functions use it too, so counting and scoring stay consistent
        for original_word in split_text(paragraph):
            normalized_word = normalize_word(original_word, use_lowercase, use_diacritics, use_stemming)
            count, original_word_count = normalized_words_to_stats.get(normalized_word, (0, dict()))
            original_word_count[original_word] = original_word_count.get(original_word, 0) + 1
            normalized_words_to_stats[normalized_word] = (count + 1, original_word_count)
    return normalized_words_to_stats

def get_max_item(dic: Dict[str, int]) -> Tuple[str,int]:
    """Return the (key, count) pair with the highest count.

    On ties the first pair in dictionary order wins; (None, None) is
    returned for an empty dictionary.
    """
    return max(dic.items(), key=lambda entry: entry[1], default=(None, None))
    
def split_train_validation_single_author(book_to_paragraphs: dict, percentage_in_train:float) :
    """Dispatch whole books between a train and a validation dataset.

    Books are visited in dictionary order. Each book goes to the dataset
    that brings the share of train paragraphs (among the paragraphs seen so
    far) closest to 'percentage_in_train'; on a tie the book goes to the
    validation dataset. A book is never split between the two datasets.

    Args:
        book_to_paragraphs: mapping book name -> list of paragraphs.
        percentage_in_train: wanted proportion of paragraphs in train.

    Returns:
        A tuple (train, validation) of dictionaries book name -> paragraphs.
    """
    train = dict()
    validation = dict()
    # running totals, so the datasets are not re-counted for every book
    train_count = 0
    validation_count = 0
    for book, paragraphs in book_to_paragraphs.items():
        size = len(paragraphs)
        total_after_adding = max(train_count + validation_count + size, 1)
        ratio_if_adding_to_train = (train_count + size) / total_after_adding
        ratio_if_adding_to_validation = train_count / total_after_adding
        if abs(ratio_if_adding_to_train - percentage_in_train) < abs(ratio_if_adding_to_validation - percentage_in_train):
            train[book] = paragraphs
            train_count += size
        else:
            validation[book] = paragraphs
            validation_count += size
    return train, validation

def split_train_validation_all_authors(book_to_paragraphs_author1: dict, book_to_paragraphs_author2: dict, percentage_in_train:float) :
    """Build balanced train/validation datasets for two authors.

    Each author is first split with split_train_validation_single_author.
    Both validation datasets, then both train datasets, are reduced to the
    same number of paragraphs. Finally the larger side (train or validation)
    is reduced again so that the train share matches 'percentage_in_train'.

    Args:
        book_to_paragraphs_author1: dataset of the first author.
        book_to_paragraphs_author2: dataset of the second author.
        percentage_in_train: wanted proportion of paragraphs in train.

    Returns:
        A tuple (train_author1, validation_author1, train_author2, validation_author2).
    """
    train_author1,validation_author1 = split_train_validation_single_author(book_to_paragraphs_author1, percentage_in_train)
    train_author2,validation_author2 = split_train_validation_single_author(book_to_paragraphs_author2, percentage_in_train)

    # same number of validation paragraphs for both authors
    target_length_validation = min(paragraph_count(validation_author1),paragraph_count(validation_author2))
    validation_author1 = reduce_to_paragraph_count(validation_author1, target_length_validation)
    validation_author2 = reduce_to_paragraph_count(validation_author2, target_length_validation)

    # same number of train paragraphs for both authors
    target_length_train = min(paragraph_count(train_author1),paragraph_count(train_author2))
    train_author1 = reduce_to_paragraph_count(train_author1, target_length_train)
    train_author2 = reduce_to_paragraph_count(train_author2, target_length_train)

    total_length = target_length_train + target_length_validation
    if total_length == 0:
        # nothing to rebalance (and the proportion below would divide by zero)
        return train_author1,validation_author1,train_author2,validation_author2

    # Floating point ratios can land just below the exact integer, e.g.
    # ((1-0.9)/0.9)*90 evaluates to 9.99999..., which int() would truncate to 9
    # instead of 10. A tiny tolerance is added before truncating, and the
    # result is capped so that it never exceeds what is available.
    tolerance = 1e-9

    proportion_in_train = target_length_train / total_length
    if proportion_in_train > percentage_in_train:
        # too many train paragraphs (here percentage_in_train < 1, so no division by zero)
        wanted = int((percentage_in_train / (1 - percentage_in_train)) * target_length_validation + tolerance)
        target_length_train = min(target_length_train, wanted)
        train_author1 = reduce_to_paragraph_count(train_author1, target_length_train)
        train_author2 = reduce_to_paragraph_count(train_author2, target_length_train)
    elif percentage_in_train > 0:
        # too many validation paragraphs
        wanted = int(((1 - percentage_in_train) / percentage_in_train) * target_length_train + tolerance)
        target_length_validation = min(target_length_validation, wanted)
        validation_author1 = reduce_to_paragraph_count(validation_author1, target_length_validation)
        validation_author2 = reduce_to_paragraph_count(validation_author2, target_length_validation)
    return train_author1,validation_author1,train_author2,validation_author2

def calcul_accuracy(TP: int, TN: int, FP: int, FN: int) -> float:
    """Return the overall accuracy (TP+TN)/(TP+TN+FP+FN), or 0 when all counters are 0."""
    return (TP+TN)/max(TP+TN+FP+FN,1)

def calcul_accuracy_corneille(TP: int, TN: int, FP: int, FN: int) -> float:
    """Return TN/(TN+FP), the share of negative-class texts that were correctly identified.

    In the two-author confusion matrix of this notebook the negative class
    is Corneille. TP and FN are accepted for signature symmetry but unused.
    Returns 0 when TN+FP is 0.
    """
    return TN/max(TN+FP,1)

def calcul_accuracy_moliere(TP: int, TN: int, FP: int, FN: int) -> float:
    """Return TP/(TP+FN), the share of positive-class texts that were correctly identified.

    In the two-author confusion matrix of this notebook the positive class
    is Molière. TN and FP are accepted for signature symmetry but unused.
    Returns 0 when TP+FN is 0.
    """
    return TP/max(TP+FN,1)

def compute_author_score_v1(text:str, most_common_words_for_author: Dict[str,float], use_lowercase: bool, use_diacritics: bool, use_stemming: bool) -> float:
    """Return the number of tokens of 'text' whose normalized form is one of the author's most common words."""
    normalized_words = (
        normalize_word(original_word, use_lowercase, use_diacritics, use_stemming)
        for original_word in split_text(text)
    )
    return sum(1 for normalized_word in normalized_words if normalized_word in most_common_words_for_author)

def compute_author_score_v2(text:str, most_common_words_for_author: Dict[str,float], use_lowercase: bool, use_diacritics: bool, use_stemming: bool) -> float:
    """Score 'text' against the author's most common words.

    Every token whose normalized form is a key of 'most_common_words_for_author'
    adds 1 plus the value stored for that word.
    """
    absent = object()  # sentinel: distinguishes a missing word from any stored value
    total = 0
    for token in split_text(text):
        weight = most_common_words_for_author.get(
            normalize_word(token, use_lowercase, use_diacritics, use_stemming), absent)
        if weight is absent:
            continue
        total += 1+weight
    return total
    
def compute_author_score_single_author(text:str, most_common_words_for_author: Dict[str,float], use_lowercase: bool, use_diacritics: bool, use_stemming: bool) -> float:
    """Return the proportion of tokens of 'text' that belong to the author's most common words.

    Args:
        text: text to score.
        most_common_words_for_author: normalized words considered typical of the author.
        use_lowercase, use_diacritics, use_stemming: normalisation flags forwarded to normalize_word.

    Returns:
        A ratio between 0 and 1; 0.0 when 'text' holds no token at all
        (instead of dividing by zero).
    """
    splitted_text = split_text(text)
    if not splitted_text:
        return 0.0
    count = sum(
        1
        for original_word in splitted_text
        if normalize_word(original_word, use_lowercase, use_diacritics, use_stemming) in most_common_words_for_author
    )
    return count/len(splitted_text)

# NOTE: this re-declares compute_author_score_v2, which is already defined above in this file with the same behaviour
def compute_author_score_v2(text:str, most_common_words_for_author: Dict[str,float], use_lowercase: bool, use_diacritics: bool, use_stemming: bool) -> float:
    """Sum, over the tokens of 'text' found among the author's most common words, 1 plus the stored value of the word."""
    recognised = (
        normalized_word
        for normalized_word in (
            normalize_word(original_word, use_lowercase, use_diacritics, use_stemming)
            for original_word in split_text(text)
        )
        if normalized_word in most_common_words_for_author
    )
    # left-to-right accumulation, starting from the integer 0
    return functools.reduce(
        lambda score, normalized_word: score + (1+most_common_words_for_author[normalized_word]),
        recognised,
        0,
    )

def compute_confusion_matrix_single_author(author_name:str, texts_from_author: List[str], text_from_other_authors: List[str], most_common_words_from_author: dict , hyperparameters: dict, verbose:bool) ->Tuple[int,int,int,int]:
    """Evaluate the "is this text from 'author_name'?" classifier.

    A text is attributed to the author when its compute_author_score_v1
    score is greater than or equal to hyperparameters['threshold_author_score'].

    Args:
        author_name: name of the author, only used in the printed examples.
        texts_from_author: texts really written by the author.
        text_from_other_authors: texts written by somebody else.
        most_common_words_from_author: most common normalized words of the author.
        hyperparameters: must hold 'threshold_author_score', 'use_lowercase',
            'use_diacritics' and 'use_stemming'.
        verbose: when true, the first example of each outcome is printed.

    Returns:
        The tuple (TP, TN, FP, FN).
    """
    TP = 0 # y_true = author_name ,  y_pred = author_name
    TN = 0 # y_true = another author, y_pred = another author
    FN = 0 # y_true = author_name,   y_pred = another author
    FP = 0 # y_true = another_author, y_pred = author_name
    threshold_author_score = hyperparameters['threshold_author_score']
    use_lowercase = hyperparameters['use_lowercase']
    use_diacritics = hyperparameters['use_diacritics']
    use_stemming = hyperparameters['use_stemming']
    for t in texts_from_author:
        score_author = compute_author_score_v1(t, most_common_words_from_author, use_lowercase, use_diacritics, use_stemming)
        if score_author>=threshold_author_score:
            if TP == 0 and verbose:
                print(f'\nExemple de TP (Texte de {author_name}, bien identifié, score {author_name}: {round(score_author,4)}):\n{t}\n')
            TP += 1
        else:
            if FN == 0 and verbose:
                print(f'\nExemple de FN (Texte de {author_name}, mal identifié, score {author_name}: {round(score_author,4)}):\n{t}\n')
            FN += 1
    for t in text_from_other_authors:
        score_author = compute_author_score_v1(t, most_common_words_from_author, use_lowercase, use_diacritics, use_stemming)
        # The score is always the one computed against 'author_name', so the messages
        # below label it with the author name, exactly like the TP/FN messages.
        if score_author>=threshold_author_score:
            if FP == 0 and verbose:
                print(f"\nExemple de FP (Texte d'un autre auteur, mal identifié, score {author_name}: {round(score_author,4)}):\n{t}\n")
            FP += 1
        else:
            if TN == 0 and verbose:
                print(f"\nExemple de TN (Texte d'un autre auteur, bien identifié, score {author_name}: {round(score_author,4)}):\n{t}\n")
            TN += 1
    return (TP,TN,FP,FN)
        
def train_single_author(hyperparameters: dict, author_directory: str, author_name, other_authors_directory: str,verbose: bool):
    """Train and evaluate the single-author classifier.

    Loads the books of the author and of the other authors, splits them into
    balanced train/validation datasets, extracts the most common normalized
    words from the author's train dataset and evaluates them on the
    validation datasets. The random seed is reset to 42 at the start.

    Returns:
        (TP, TN, FP, FN, train_author, validation_author,
        validation_other_authors, normalized_words_to_stats_author, most_common_author)
    """
    def report(label, dataset):
        # one summary line per dataset: paragraphs, words, books
        print(f'\n{label}: {paragraph_count(dataset)} paragraphes ({word_count(dataset)} mots) venant de {len(dataset)} oeuvres:\n{list(dataset.keys())}')

    random.seed(42)
    min_paragraph_length = hyperparameters['min_paragraph_length']
    author_dataset = load_all_books(author_directory, min_paragraph_length)
    other_authors_dataset = load_all_books(other_authors_directory, min_paragraph_length)
    if verbose:
        report(f'{author_name} Dataset', author_dataset)
        report('other_authors Dataset', other_authors_dataset)

    splits = split_train_validation_all_authors(author_dataset, other_authors_dataset, hyperparameters['percentage_in_train'])
    train_author, validation_author, train_other_authors, validation_other_authors = splits

    if verbose:
        report(f'{author_name} Train Dataset', train_author)
        report(f'{author_name} Validation Dataset', validation_author)
        report('other_authors Train Dataset', train_other_authors)
        report('other_authors Validation Dataset', validation_other_authors)

    normalized_words_to_stats_author = compute_normalized_words_to_stats(all_paragraphs(train_author), hyperparameters)
    # only the most common words are kept
    most_common_author = compute_most_common_normalized_words(
        normalized_words_to_stats_author,
        hyperparameters['most_common_normalized_words_count'],
    )
    confusion = compute_confusion_matrix_single_author(
        author_name,
        all_paragraphs(validation_author),
        all_paragraphs(validation_other_authors),
        most_common_author,
        hyperparameters,
        verbose,
    )
    TP, TN, FP, FN = confusion
    return (TP,TN,FP,FN,train_author,validation_author, validation_other_authors, normalized_words_to_stats_author, most_common_author)

def display_wordcloud(most_common_normalized_words: dict, normalized_words_to_stats: dict, title:str) -> None:
    """Display a word cloud of the most common words.

    Each normalized word is shown under its most frequent original spelling;
    spellings of 4 characters or fewer are left out. The figure is displayed
    with matplotlib.
    """
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    def most_frequent_spelling(normalized_word):
        return get_max_item(normalized_words_to_stats[normalized_word][1])[0]

    spelling_and_frequency = (
        (most_frequent_spelling(normalized_word), frequency)
        for normalized_word, frequency in most_common_normalized_words.items()
    )
    word_frequencies = {
        spelling: frequency
        for spelling, frequency in spelling_and_frequency
        if len(spelling) > 4
    }
    cloud = WordCloud(width=1200, height=600, background_color='white').generate_from_frequencies(word_frequencies)
    # render the generated word cloud
    plt.figure(figsize=(20, 10))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title, fontsize=20)
    plt.axis('off')
    plt.show()
    


## Création d'un fichier de statistiques

In [None]:
# Hyperparameters used to build the statistics file: every normalisation step is enabled
hyperparameters = dict()
hyperparameters['use_lowercase'] = True
hyperparameters['use_diacritics'] = True
hyperparameters['use_stemming'] = True
hyperparameters['min_paragraph_length'] = 1000
hyperparameters['most_common_normalized_words_count'] = 1000

# Load all the books of each author and compute the per-word statistics (on the full datasets, no train/validation split)
moliere_dataset = load_all_books(os.path.join(directory, 'moliere'), hyperparameters['min_paragraph_length'])
stats_moliere = compute_normalized_words_to_stats(all_paragraphs(moliere_dataset), hyperparameters)
corneille_dataset = load_all_books(os.path.join(directory, 'corneille'), hyperparameters['min_paragraph_length'])
stats_corneille = compute_normalized_words_to_stats(all_paragraphs(corneille_dataset), hyperparameters)

# Total number of word occurrences per author (denominator of the frequencies below)
moliere_total_word_count = sum([c[0] for c in stats_moliere.values()])
corneille_total_word_count = sum([c[0] for c in stats_corneille.values()])

# Sorted union of the normalized words used by at least one of the two authors
normalized_words = list((set(stats_moliere.keys())|set(stats_corneille.keys())))
normalized_words.sort()
# One list per column of the statistics file; all lists are filled in the order of 'normalized_words'
moliere_normalized_word_count_in_percentage = []
moliere_normalized_word_count = []
corneille_normalized_word_count_in_percentage = []
corneille_normalized_word_count = []
moliere_most_common_original_word = []
moliere_most_common_original_word_count = []
corneille_most_common_original_word = []
corneille_most_common_original_word_count = []


for normalized_word in normalized_words:
    if normalized_word in stats_moliere:
        stat = stats_moliere[normalized_word]
        # despite the '_in_percentage' name, the stored value is a ratio between 0 and 1
        moliere_normalized_word_count_in_percentage.append(stat[0]/moliere_total_word_count)
        moliere_normalized_word_count.append(stat[0])
        most_common_word, most_common_word_count = get_max_item(stat[1])
        moliere_most_common_original_word.append(most_common_word)
        moliere_most_common_original_word_count.append(most_common_word_count)
    else:
        # word never used by Molière
        moliere_normalized_word_count_in_percentage.append(0)
        moliere_normalized_word_count.append(0)
        moliere_most_common_original_word.append(None)
        moliere_most_common_original_word_count.append(None)
    if normalized_word in stats_corneille:
        stat = stats_corneille[normalized_word]
        corneille_normalized_word_count_in_percentage.append(stat[0]/corneille_total_word_count)
        corneille_normalized_word_count.append(stat[0])
        most_common_word, most_common_word_count = get_max_item(stat[1])
        corneille_most_common_original_word.append(most_common_word)
        corneille_most_common_original_word_count.append(most_common_word_count)
    else:
        # word never used by Corneille
        corneille_normalized_word_count_in_percentage.append(0)
        corneille_normalized_word_count.append(0)
        corneille_most_common_original_word.append(None)
        corneille_most_common_original_word_count.append(None)

# One row per normalized word, one column per list built above
fhr_stats = pd.DataFrame(
    {'normalized_words': normalized_words,
    'moliere_count_in_percentage': moliere_normalized_word_count_in_percentage,
    'moliere_count': moliere_normalized_word_count,
    'corneille_count_in_percentage' : corneille_normalized_word_count_in_percentage,
    'corneille_count' : corneille_normalized_word_count,
    'moliere_most_common_original_word' : moliere_most_common_original_word,
    'moliere_most_common_original_word_count' : moliere_most_common_original_word_count,
    'corneille_most_common_original_word' : corneille_most_common_original_word,
    'corneille_most_common_original_word_count' : corneille_most_common_original_word_count,
    })

# these statistics are saved to disk
fhr_stats.to_csv(os.path.join(directory, 'stylometrie_stats.csv'), index=False)
        

# Identification des textes de Molière

## Mots les plus communs chez Molière

In [None]:
# The 1000 most common normalized words of Molière, computed from the statistics of his whole dataset
most_common_moliere = compute_most_common_normalized_words(stats_moliere, 1000)
display_wordcloud(most_common_moliere, stats_moliere, "Molière (uniquement les mots d'au moins 5 lettres)")
# Print the first 20 entries whose normalized word has more than 4 characters (the length test applies to the normalized form)
print("Mots (d'au moins 5 lettres) les plus fréquents chez Molière\n", "\n".join([str(c) for c in most_common_moliere.items() if len(c[0])>4][:20]))


## Exemple d'analyse pour un texte de Molière

In [None]:
import matplotlib.pyplot as plt

# The text to analyse is the second paragraph of the book stored under the key 'les_fourberies_de_scapin'
texte_moliere = moliere_dataset['les_fourberies_de_scapin'][1]
print('Texte à analyser\n\n',texte_moliere)
#split_text(texte_moliere)

most_common_moliere = compute_most_common_normalized_words(stats_moliere, hyperparameters['most_common_normalized_words_count'])

# original spelling (first occurrence in the text) -> frequency of its normalized form in Molière's most common words
most_common_in_text = dict()
normalized_word_found_in_text = set()
splitted_text = split_text(texte_moliere)
# number of tokens of the text (repetitions included) that belong to the most common words
normalized_word_found_in_text_count = 0
for original_word in splitted_text:
    normalized_word = normalize_word(original_word, hyperparameters['use_lowercase'], hyperparameters['use_diacritics'], hyperparameters['use_stemming'])
    if normalized_word not in most_common_moliere:
        continue
    normalized_word_found_in_text_count += 1
    if normalized_word in normalized_word_found_in_text:
        continue # word already processed
    normalized_word_found_in_text.add(normalized_word)
    most_common_in_text[original_word] = most_common_moliere[normalized_word]

# from here on 'most_common_in_text' is a list of (word, frequency) pairs, most frequent first
most_common_in_text = sorted(most_common_in_text.items(), key = lambda x:x[1], reverse=True)


#most_common_in_text = most_common_in_text[:100]
# Create the bar plot
plt.figure(figsize=(20, 5))
keys = [c[0] for c in most_common_in_text]
values = [c[1] for c in most_common_in_text]

plt.bar(keys, values, color='blue')

# Add title and labels
plt.title(f' {normalized_word_found_in_text_count} mots de ce texte (sur {len(splitted_text)}) font partie des mots les plus courants chez Molière', fontsize=15)
plt.xlabel('Mot', fontsize=15)
plt.ylabel('Fréquence du mot chez Molière', fontsize=15)
plt.xticks(rotation=45, fontsize=11)
plt.xlim(-0.5, len(keys) - 0.5)

# Display the plot
plt.show()


## Précision avec des paramètres par défaut

In [None]:
# Author to identify (Molière); Corneille's books play the role of the "other authors"
author_directory = os.path.join(directory, 'moliere')
author_name = "Molière"
other_authors_directory = os.path.join(directory, 'corneille')

verbose = False

# Default hyperparameters: no normalisation step is enabled
hyperparameters = dict()
hyperparameters['min_paragraph_length'] = 1000
hyperparameters['percentage_in_train'] = 0.9
hyperparameters['use_lowercase'] = False
hyperparameters['use_diacritics'] = False
hyperparameters['use_stemming'] = False
hyperparameters['most_common_normalized_words_count'] = 1000

# the parameter to tune: minimal score for a text to be attributed to the author
hyperparameters['threshold_author_score'] = 130


(TP,TN,FP,FN,train_author,validation_author,validation_other_authors, normalized_words_to_stats_moliere, most_common_author) = train_single_author(hyperparameters, author_directory, author_name, other_authors_directory, verbose)
print()
print('-'*80)
# the printed value is the accuracy computed by calcul_accuracy
print(f'Précision({author_name}): {round(calcul_accuracy(TP,TN,FP,FN),4)} ')
print('-'*80)


## Grid search pour améliorer cette précision

In [None]:
# Grid search on the score threshold of the Molière classifier.
# The author context is declared explicitly, like in the Corneille grid-search cell,
# so that the result does not depend on which cell was executed last.
author_directory = os.path.join(directory, 'moliere')
author_name = "Molière"
other_authors_directory = os.path.join(directory, 'corneille')

hyperparameters = dict()
hyperparameters['min_paragraph_length'] = 1000
hyperparameters['percentage_in_train'] = 0.9
hyperparameters['use_lowercase'] = False
hyperparameters['use_diacritics'] = True
hyperparameters['use_stemming'] = False
hyperparameters['most_common_normalized_words_count'] = 1000

# the parameter to tune (overwritten by the inner loop below)
hyperparameters['threshold_author_score'] = 0

best_score = None
for most_common_normalized_words_count in [ hyperparameters['most_common_normalized_words_count'] ]:
    hyperparameters['most_common_normalized_words_count'] = most_common_normalized_words_count
    # the datasets and the most common words do not depend on the threshold: they are computed once per word count
    (TP,TN,FP,FN,train_author,validation_author,validation_other_authors, normalized_words_to_stats_moliere, most_common_author) = train_single_author(hyperparameters, author_directory, author_name, other_authors_directory, verbose)
    for threshold_author_score in range(100,200,1):
        hyperparameters['threshold_author_score'] = threshold_author_score
        (TP,TN,FP,FN) = compute_confusion_matrix_single_author(author_name,all_paragraphs(validation_author), all_paragraphs(validation_other_authors), most_common_author , hyperparameters, verbose)
        accuracy = calcul_accuracy(TP,TN,FP,FN)
        # only strict improvements are reported
        if accuracy>0.0 and (best_score is None or accuracy>best_score):
            best_score = accuracy
            print(f"threshold_author_score={hyperparameters['threshold_author_score']} => Précision({author_name})={round(accuracy,4)}")
            
            

# Identification des textes de Corneille

## Mots les plus communs chez Corneille

In [None]:
# The 1000 most common normalized words of Corneille, computed from the statistics of his whole dataset
most_common_corneille = compute_most_common_normalized_words(stats_corneille, 1000)
display_wordcloud(most_common_corneille, stats_corneille, "Corneille (uniquement les mots d'au moins 5 lettres)")
# "at least 5 letters" means len > 4, as in the word cloud filter and in the equivalent Molière cell
print("Mots (d'au moins 5 lettres) les plus fréquents chez Corneille\n", "\n".join([str(c) for c in most_common_corneille.items() if len(c[0])>4][:20]))


## Précision avec des paramètres par défaut

In [None]:
# Author to identify (Corneille); Molière's books play the role of the "other authors"
author_directory = os.path.join(directory, 'corneille')
author_name = "Corneille"
other_authors_directory = os.path.join(directory, 'moliere')

verbose = False

# Default hyperparameters: only stemming is enabled
hyperparameters = dict()
hyperparameters['min_paragraph_length'] = 1000
hyperparameters['percentage_in_train'] = 0.9
hyperparameters['use_lowercase'] = False
hyperparameters['use_diacritics'] = False
hyperparameters['use_stemming'] = True
hyperparameters['most_common_normalized_words_count'] = 1000

# the parameter to tune: minimal score for a text to be attributed to the author
hyperparameters['threshold_author_score'] = 130


# NOTE(review): 'normalized_words_to_stats_moliere' receives here the statistics of the author being
# trained, i.e. Corneille; the name is misleading and overwrites the Molière value set by an earlier cell.
(TP,TN,FP,FN,train_author,validation_author,validation_other_authors, normalized_words_to_stats_moliere, most_common_author) = train_single_author(hyperparameters, author_directory, author_name, other_authors_directory, verbose)
print()
print('-'*80)
# the printed value is the accuracy computed by calcul_accuracy
print(f'Précision({author_name}): {round(calcul_accuracy(TP,TN,FP,FN),4)} ')
print('-'*80)


## Grid search pour améliorer cette précision

In [None]:
# Grid search on the score threshold of the Corneille classifier
hyperparameters = dict()
hyperparameters['min_paragraph_length'] = 1000
hyperparameters['percentage_in_train'] = 0.9
hyperparameters['use_lowercase'] = False
hyperparameters['use_diacritics'] = False
hyperparameters['use_stemming'] = True
hyperparameters['most_common_normalized_words_count'] = 1000

# the parameter to tune (overwritten by the inner loop below)
hyperparameters['threshold_author_score'] = 0

author_directory = os.path.join(directory, 'corneille')
author_name = "Corneille"
other_authors_directory = os.path.join(directory, 'moliere')
best_score = None
for most_common_normalized_words_count in [ hyperparameters['most_common_normalized_words_count'] ]:
    hyperparameters['most_common_normalized_words_count'] = most_common_normalized_words_count
    # the datasets and the most common words do not depend on the threshold: they are computed once per word count
    (TP,TN,FP,FN,train_author,validation_author,validation_other_authors, normalized_words_to_stats_moliere, most_common_author) = train_single_author(hyperparameters, author_directory, author_name, other_authors_directory, verbose)
    for threshold_author_score in range(100,200,1):
        hyperparameters['threshold_author_score'] = threshold_author_score
        (TP,TN,FP,FN) = compute_confusion_matrix_single_author(author_name,all_paragraphs(validation_author), all_paragraphs(validation_other_authors), most_common_author , hyperparameters, verbose)
        accuracy = calcul_accuracy(TP,TN,FP,FN)
        # only strict improvements are reported
        if accuracy>0.0 and (best_score is None or accuracy>best_score):
            best_score = accuracy
            # 'author_name' is the variable defined above (the previous 'auth' did not exist and raised NameError)
            print(f"threshold_author_score={hyperparameters['threshold_author_score']} => Précision({author_name})={round(accuracy,4)}")
            
            

# Entraînement et calcul des métriques avec deux auteurs

In [None]:
def compute_author_score_v1(text:str, most_common_words_for_author: Dict[str,float], use_lowercase: bool, use_diacritics: bool, use_stemming: bool) -> float:
    """Count the tokens of 'text' whose normalized form belongs to the author's most common words."""
    recognised_tokens = [
        original_word
        for original_word in split_text(text)
        if normalize_word(original_word, use_lowercase, use_diacritics, use_stemming) in most_common_words_for_author
    ]
    return len(recognised_tokens)

def compute_author_score_v2(text:str, most_common_words_for_author: Dict[str,float], use_lowercase: bool, use_diacritics: bool, use_stemming: bool) -> float:
    """Weighted variant of the score: each recognised token adds 1 plus the value stored for its normalized word."""
    normalized_tokens = [
        normalize_word(original_word, use_lowercase, use_diacritics, use_stemming)
        for original_word in split_text(text)
    ]
    score = 0
    for normalized_word in normalized_tokens:
        if normalized_word not in most_common_words_for_author:
            continue
        score += 1+most_common_words_for_author[normalized_word]
    return score
    
def compute_confusion_matrix_all_authors(text_author1: List[str], text_author2: List[str], most_common_words_author1: dict , most_common_words_author2: dict, hyperparameters: dict, verbose:bool) ->Tuple[int,int,int,int]:
    """Evaluate the two-author classifier (author1 is Molière, author2 is Corneille in the messages).

    Each text is scored against both authors with the function stored in
    hyperparameters['compute_author_score']; it is attributed to author1 only
    when the author1 score is strictly greater than the author2 score.
    When 'verbose' is true the first example of each outcome is printed.

    Returns:
        The tuple (TP, TN, FP, FN), author1 being the positive class.
    """
    score_function = hyperparameters['compute_author_score']
    normalisation_flags = (
        hyperparameters['use_lowercase'],
        hyperparameters['use_diacritics'],
        hyperparameters['use_stemming'],
    )

    def score_against_both(text):
        # (score for author1, score for author2)
        return (
            score_function(text, most_common_words_author1, *normalisation_flags),
            score_function(text, most_common_words_author2, *normalisation_flags),
        )

    # TP: Molière predicted Molière, TN: Corneille predicted Corneille,
    # FN: Molière predicted Corneille, FP: Corneille predicted Molière
    TP = TN = FN = FP = 0

    for t in text_author1:
        score_author1, score_author2 = score_against_both(t)
        if score_author1 > score_author2:
            if verbose and TP == 0:
                print(f'\nExemple de TP (Texte de Molière, bien identifié, score Molière: {round(score_author1,4)}, score Corneille: {round(score_author2,4)}):\n{t}\n')
            TP += 1
            continue
        if verbose and FN == 0:
            print(f'\nExemple de FN (Texte de Molière, mal identifié, score Molière: {round(score_author1,4)}, score Corneille: {round(score_author2,4)}):\n{t}\n')
        FN += 1

    for t in text_author2:
        score_author1, score_author2 = score_against_both(t)
        if score_author1 > score_author2:
            if verbose and FP == 0:
                print(f'\nExemple de FP (Texte de Corneille, mal identifié, score Molière: {round(score_author1,4)}, score Corneille: {round(score_author2,4)}):\n{t}\n')
            FP += 1
            continue
        if verbose and TN == 0:
            print(f'\nExemple de TN (Texte de Corneille, bien identifié, score Molière: {round(score_author1,4)}, score Corneille: {round(score_author2,4)}):\n{t}\n')
        TN += 1

    return (TP,TN,FP,FN)
        
    
def train_all_authors(hyperparameters: dict, verbose: bool):
    """Train and evaluate the Molière-versus-Corneille classifier.

    Loads the books found in the 'moliere' and 'corneille' sub-directories of
    'directory', builds balanced train/validation datasets, extracts the most
    common normalized words of each author from the train datasets and
    evaluates them on the validation datasets. The random seed is reset to 42
    at the start.

    Returns:
        (TP, TN, FP, FN, train_moliere, validation_moliere, train_corneille,
        validation_corneille, normalized_words_to_stats_moliere,
        normalized_words_to_stats_corneille, most_common_moliere, most_common_corneille)
    """
    def report(label, dataset):
        # one summary line per dataset: paragraphs, words, books
        print(f'\n{label}: {paragraph_count(dataset)} paragraphes ({word_count(dataset)} mots) venant de {len(dataset)} oeuvres:\n{list(dataset.keys())}')

    random.seed(42)
    min_paragraph_length = hyperparameters['min_paragraph_length']
    moliere_dataset = load_all_books(os.path.join(directory, 'moliere'), min_paragraph_length)
    corneille_dataset = load_all_books(os.path.join(directory, 'corneille'), min_paragraph_length)
    if verbose:
        report('Moliere Dataset', moliere_dataset)
        report('Corneille Dataset', corneille_dataset)

    splits = split_train_validation_all_authors(moliere_dataset, corneille_dataset, hyperparameters['percentage_in_train'])
    train_moliere, validation_moliere, train_corneille, validation_corneille = splits

    if verbose:
        report('Moliere Train Dataset', train_moliere)
        report('Moliere Validation Dataset', validation_moliere)
        report('Corneille Train Dataset', train_corneille)
        report('Corneille Validation Dataset', validation_corneille)

    normalized_words_to_stats_moliere = compute_normalized_words_to_stats(all_paragraphs(train_moliere), hyperparameters)
    normalized_words_to_stats_corneille = compute_normalized_words_to_stats(all_paragraphs(train_corneille), hyperparameters)

    # only the most common words of each author are kept
    most_common_moliere = compute_most_common_normalized_words(
        normalized_words_to_stats_moliere, hyperparameters['most_common_normalized_words_count'])
    most_common_corneille = compute_most_common_normalized_words(
        normalized_words_to_stats_corneille, hyperparameters['most_common_normalized_words_count'])

    confusion = compute_confusion_matrix_all_authors(
        all_paragraphs(validation_moliere),
        all_paragraphs(validation_corneille),
        most_common_moliere,
        most_common_corneille,
        hyperparameters,
        verbose,
    )
    TP, TN, FP, FN = confusion
    return (TP,TN,FP,FN,train_moliere,validation_moliere,train_corneille,validation_corneille,normalized_words_to_stats_moliere, normalized_words_to_stats_corneille, most_common_moliere, most_common_corneille)

verbose = False

# Hyperparameters of the two-author classifier: no normalisation step, scoring function v1
hyperparameters = dict()
hyperparameters['min_paragraph_length'] = 1000
hyperparameters['percentage_in_train'] = 0.9
hyperparameters['most_common_normalized_words_count'] = 1000
hyperparameters['use_lowercase'] = False
hyperparameters['use_diacritics'] = False
hyperparameters['use_stemming'] = False
# the scoring function itself is a hyperparameter (read by compute_confusion_matrix_all_authors)
hyperparameters['compute_author_score'] = compute_author_score_v1

#if verbose:    print(hyperparameters)
# the module-level names assigned here (most_common_moliere, most_common_corneille, ...) are reused by the cells below
(TP,TN,FP,FN,train_moliere,validation_moliere,train_corneille,validation_corneille,normalized_words_to_stats_moliere, normalized_words_to_stats_corneille, most_common_moliere, most_common_corneille) = train_all_authors(hyperparameters, verbose)
print()
print('-'*80)
# the printed value is the accuracy computed by calcul_accuracy
print(f'Précision(Molière ou Corneille?)={round(calcul_accuracy(TP,TN,FP,FN),4)}')
print('-'*80)


In [None]:
# Show, for a few normalized words, how often each author uses them and under which spellings
for normalized_word in ['monsieur', 'madam', 'plus', 'esprit', 'encor']:
    # a word may be missing from the statistics of one author: report 0 occurrences instead of raising KeyError
    moliere_count, moliere_spellings = stats_moliere.get(normalized_word, (0, dict()))
    corneille_count, corneille_spellings = stats_corneille.get(normalized_word, (0, dict()))
    print(f"Le mot '{normalized_word}':")
    print(f'\test présent {moliere_count} fois chez Molière:  ', moliere_spellings)
    print(f'\test présent {corneille_count} fois chez Corneille:', corneille_spellings)


## Précision pour chaque oeuvre utilisée

In [None]:
# Accuracy of the two-author classifier for each book of Molière.
# Only the first text list is filled, so FP and TN stay at 0 and the accuracy is TP/(TP+FN).
print('-'*80+'\nPrécision pour chaque oeuvre de Moliere\n'+'-'*80)
accuracy_moliere = dict()
for book_path in all_txt_files_in_directory(os.path.join(directory, 'moliere')):
    (TP,TN,FP,FN) = compute_confusion_matrix_all_authors(split_book_into_paragraphs(book_path, hyperparameters['min_paragraph_length']), [], most_common_moliere , most_common_corneille, hyperparameters, False)
    #print(f"Accuracy '{pathlib.Path(book_path).stem}': {round(calcul_accuracy(TP,TN,FP,FN),4)}")
    accuracy_moliere[pathlib.Path(book_path).stem] = calcul_accuracy(TP,TN,FP,FN)
# books are printed from the least to the best recognised
for e in sorted(accuracy_moliere.items(), key=lambda x: x[1]):
    print(e)

print()
# Same computation for each book of Corneille.
# Only the second text list is filled, so TP and FN stay at 0 and the accuracy is TN/(TN+FP).
print('-'*80+'\nPrécision pour chaque oeuvre de Corneille\n'+'-'*80)
accuracy_corneille = dict()
for book_path in all_txt_files_in_directory(os.path.join(directory, 'corneille')):
    (TP,TN,FP,FN) = compute_confusion_matrix_all_authors([], split_book_into_paragraphs(book_path, hyperparameters['min_paragraph_length']), most_common_moliere , most_common_corneille, hyperparameters, False)
    accuracy_corneille[pathlib.Path(book_path).stem] = calcul_accuracy(TP,TN,FP,FN)
for e in sorted(accuracy_corneille.items(), key=lambda x: x[1]):
    print(e)
    
    