# Le but de ce notebook est d'identifier des caractéristiques très simples permettant de différencier des textes écrits par plusieurs auteurs
## (en l'occurrence Molière et Corneille)

## Chargement des données

In [None]:
import heapq
import functools
from typing import List,Set,Tuple,Dict
import os
import pathlib
import re
import os
import random
import math
import pandas as pd

#!pip install unidecode
#!pip install wordcloud
#!pip install pillow

# Working directory: absolute path of the current directory; the author folders are resolved against it.
directory = os.path.abspath('')
# Seed the global random generator (random.random() is used to break score ties during evaluation).
random.seed(42)

# Returns every *.txt file found directly inside the directory 'path'.
def all_txt_files_in_directory(path: str):
    """Return the paths of the regular ``*.txt`` files located directly in ``path``.

    The result is sorted by file name. ``os.listdir`` yields entries in an
    arbitrary order, and the order in which books are loaded drives the
    train/validation split, so sorting keeps runs reproducible.

    Args:
        path: directory to scan (not recursive).

    Returns:
        A list of ``os.path.join(path, name)`` strings, sorted by ``name``.
    """
    txt_candidates = (
        os.path.join(path, name)
        for name in sorted(os.listdir(path))
        if name.endswith('.txt')
    )
    # Directories whose name happens to end with '.txt' are excluded.
    return [candidate for candidate in txt_candidates if os.path.isfile(candidate)]

# Tells whether 'line' is worth keeping or must be ignored (for instance because it is empty).
def is_valid_line(line: str) -> bool:
    """Return True when ``line`` should be kept as text.

    A line is rejected when it starts with 'Scène ' or 'Acte ', or when it is
    shorter than 10 characters. The length is measured on the string as
    given, so a trailing newline counts.
    """
    is_heading = line.startswith(('Scène ', 'Acte '))
    return not is_heading and len(line) >= 10

def split_book_into_paragraphs(path: str, min_paragraph_length: int, encoding: str = 'utf-8') -> List[str]:
    """Split the text file at ``path`` into paragraphs of at least ``min_paragraph_length`` characters.

    Valid lines (see ``is_valid_line``) are right-stripped and joined with
    a newline until the accumulated text reaches ``min_paragraph_length``
    characters; the paragraph is then emitted and a new one is started.
    Leftover text at the end of the file is kept only when the book would
    otherwise yield no paragraph at all.

    Args:
        path: path of the text file to read.
        min_paragraph_length: minimal length, in characters, of an emitted paragraph.
        encoding: text encoding of the file. It is passed explicitly so that
            accented text (and the 'Scène ' prefix test) does not depend on
            the platform's default locale encoding.

    Returns:
        The list of paragraphs, in file order.
    """
    result = []
    current_paragraph = ""
    with open(path, encoding=encoding) as file:
        for line in file:
            if not is_valid_line(line):
                continue
            if current_paragraph:
                current_paragraph += "\n"
            current_paragraph += line.rstrip()
            if len(current_paragraph) >= min_paragraph_length:
                result.append(current_paragraph)
                current_paragraph = ""
    # A too-short tail is dropped, unless it is the only content of the book.
    if len(current_paragraph) >= min_paragraph_length or (current_paragraph and not result):
        result.append(current_paragraph)
    return result

def load_all_books(path: str, min_paragraph_length:int) -> Dict[str,List[str]]:
    """Load every ``*.txt`` book of the directory ``path``.

    Returns a dict mapping the file name without its extension to the
    paragraphs produced by ``split_book_into_paragraphs`` for that file.
    """
    return {
        pathlib.Path(txt_file).stem: split_book_into_paragraphs(txt_file, min_paragraph_length)
        for txt_file in all_txt_files_in_directory(path)
    }



class MaxHeap:
    """Occurrence counter for the original spellings behind one normalized word.

    Records how many times each original word was added and which one is the
    most frequent. Ties are resolved in favour of the smallest word in string
    order, i.e. by the ordering of ``(-count, word)`` tuples.

    Attributes:
        item_counts: maps each original word to its number of occurrences.
        total_count: total number of ``add_item`` calls.
        max_heap: empty until the first add, then a single ``(-count, word)``
            entry describing the current most frequent word.
    """

    def __init__(self):
        self.item_counts = dict()
        self.total_count = 0
        self.max_heap = []

    def __str__(self):
        most_common_original_word, count = self.get_max_item()
        return f'{most_common_original_word} : {count}'

    def add_item(self, original_word):
        """Count one more occurrence of ``original_word``."""
        new_count = self.item_counts.get(original_word, 0) + 1
        self.item_counts[original_word] = new_count
        self.total_count += 1
        # Counts only ever grow, so the best entry is either the previous best
        # or the word just incremented. Keeping that single entry avoids
        # storing one tuple per call, which made the heap grow with the
        # corpus size instead of with the number of distinct words.
        candidate = (-new_count, original_word)
        if not self.max_heap or candidate < self.max_heap[0]:
            self.max_heap = [candidate]

    def most_common_original_word(self) -> str:
        """Return the most frequent original word, or None when nothing was added."""
        return self.get_max_item()[0]

    def get_max_item(self):
        """Return ``(word, count)`` for the most frequent word, or ``(None, 0)`` when empty."""
        if self.max_heap:
            max_count, max_item = self.max_heap[0]
            return max_item, -max_count
        return None, 0


# Used to remove accents.
@functools.lru_cache(maxsize=None)
def remove_diacritics(word: str) -> str:
    """Return ``word`` as converted by ``unidecode.unidecode``.

    Results are memoized without bound by ``functools.lru_cache``. The
    third-party ``unidecode`` package is imported on first call only.
    """
    import unidecode  
    return unidecode.unidecode(word)

@functools.lru_cache(maxsize=None)
def to_lowercase(word: str) -> str:
    """Return the lowercase form of ``word`` (memoized without bound)."""
    lowered = word.lower()
    return lowered

# Stemming support.
from nltk.stem.snowball import FrenchStemmer
# Single module-level stemmer instance, shared by every call to to_stemming.
stemmer = FrenchStemmer()
@functools.lru_cache(maxsize=None)
def to_stemming(word: str) -> str:
    """Return the stem of ``word`` computed by the module-level ``FrenchStemmer``.

    Results are memoized without bound by ``functools.lru_cache``.
    """
    return stemmer.stem(word)

@functools.lru_cache(maxsize=None)
def normalize_word(word: str, use_lowercase: bool, use_diacritics: bool, use_stemming: bool) -> str:
    """Apply the enabled normalization steps to ``word``, in a fixed order.

    The order is: lowercasing, then accent removal, then stemming. Note that
    ``use_diacritics=True`` means the diacritics are *removed*.
    Results are memoized per (word, flags) combination.
    """
    pipeline = (
        (use_lowercase, to_lowercase),
        (use_diacritics, remove_diacritics),
        (use_stemming, to_stemming),
    )
    for enabled, transform in pipeline:
        if enabled:
            word = transform(word)
    return word


   
# Returns the set of stopwords found in both author1 and author2.
def find_stopwords(normalized_words_to_stats_author1: dict, normalized_words_to_stats_author2: dict, hyperparameters: dict) -> Set[str]:
    """Select the normalized words that both authors use in a similar way.

    Only words present for both authors are considered. For such a word,
    using the length of author1's most common original spelling:
      * length <= ``stopword_for_length_less_or_equal``: always a stopword;
      * length > ``stopword_max_length``: never a stopword;
      * otherwise it is a stopword when its highest frequency among the two
        authors reaches ``stopword_min_frequency_in_percentage`` and the
        relative gap ``1 - min/max`` between the two frequencies does not
        exceed ``stopword_max_frequency_diff_in_percentage_between_authors``.

    Frequencies are ratios (count / total count for the author), which is
    what the two "percentage" thresholds are compared against.

    Returns an empty set when ``use_stopword`` is false or when
    ``stopword_max_length`` is not positive.
    """
    shared_stopwords = set()
    if not hyperparameters['use_stopword']:
        return shared_stopwords

    min_frequency = hyperparameters['stopword_min_frequency_in_percentage']
    max_relative_gap = hyperparameters['stopword_max_frequency_diff_in_percentage_between_authors']
    max_length = hyperparameters['stopword_max_length']
    always_stopword_length = hyperparameters['stopword_for_length_less_or_equal']

    if max_length <= 0:
        return shared_stopwords

    total_author1 = sum(s.total_count for s in normalized_words_to_stats_author1.values())
    total_author2 = sum(s.total_count for s in normalized_words_to_stats_author2.values())

    for word, stats_author1 in normalized_words_to_stats_author1.items():
        if word not in normalized_words_to_stats_author2:
            continue

        original_length = len(stats_author1.most_common_original_word())
        if original_length <= always_stopword_length:
            shared_stopwords.add(word)
            continue
        if original_length > max_length:
            continue

        frequency_author1 = stats_author1.total_count / total_author1
        frequency_author2 = normalized_words_to_stats_author2[word].total_count / total_author2
        lowest = min(frequency_author1, frequency_author2)
        highest = max(frequency_author1, frequency_author2)
        if highest < min_frequency:
            continue
        relative_gap = 1 - lowest / highest
        if relative_gap > max_relative_gap:
            continue
        shared_stopwords.add(word)
    return shared_stopwords


def paragraph_count(book_to_paragraphs: Dict[str, List[str]]) -> int:
    """Return the total number of paragraphs over all books (0 for an empty or missing dataset)."""
    total = 0
    for paragraphs in (book_to_paragraphs or {}).values():
        total += len(paragraphs)
    return total

def split_text(text:str) -> List[str]:
    """Split ``text`` into its words, after stripping trailing whitespace.

    A word is a run of word characters, apostrophes, carets or digits that
    starts and ends on a word boundary. The ``^`` is not the first character
    of the class, so it is a literal caret and not a negation.
    """
    token_pattern = r"\b[\w'^\d]+\b"
    trimmed = text.rstrip()
    return re.findall(token_pattern, trimmed)
            
def word_count(book_to_paragraphs: Dict[str, List[str]]) -> int:
    """Return the total number of words (as split by ``split_text``) over all paragraphs of all books."""
    return sum(
        len(split_text(paragraph.rstrip()))
        for paragraph in all_paragraphs(book_to_paragraphs)
    )

def all_paragraphs(book_to_paragraphs: Dict[str, List[str]]) -> List[str]:
    """Return the paragraphs of every book as one flat list, in the dict's iteration order."""
    return [
        paragraph
        for paragraphs in book_to_paragraphs.values()
        for paragraph in paragraphs
    ]

    

# Reduce the dataset so that it contains exactly 'target_count' paragraphs.
def reduce_to_paragraph_count(book_to_paragraphs: dict, target_count: int ) -> dict:
    """Return a copy of the dataset trimmed down to exactly ``target_count`` paragraphs.

    Books are visited from the shortest to the longest. Whole books are
    dropped while they fit in the number of paragraphs still to remove, then
    the first book that does not fit is truncated at its end. Books without
    any paragraph are always dropped. The input dict is not modified.

    Args:
        book_to_paragraphs: maps a book name to its list of paragraphs.
        target_count: number of paragraphs wanted in the result.

    Returns:
        A new dict (book name -> list of paragraphs); the return annotation
        used to say ``int``, which did not match the returned value.

    Raises:
        ValueError: when the dataset holds fewer than ``target_count`` paragraphs.
    """
    current_count = paragraph_count(book_to_paragraphs)
    if current_count < target_count:
        raise ValueError(f'current_count {current_count} < target_count {target_count}')
    to_remove = current_count - target_count
    result = dict()
    for book, paragraphs in sorted(book_to_paragraphs.items(), key=lambda item: len(item[1])):
        if len(paragraphs) <= to_remove:
            # The whole book is removed.
            to_remove -= len(paragraphs)
            continue
        result[book] = paragraphs[:len(paragraphs) - to_remove]
        to_remove = 0
    return result


def split_train_validation_single_author(book_to_paragraphs: dict, percentage_in_train:float) :
    books = list(book_to_paragraphs.keys())
    train = dict()
    validation = dict()
    for book, paragraphs in book_to_paragraphs.items():
        paragraphs_count = len(paragraphs)
        percentage_in_train_if_adding_to_train = (paragraph_count(train)+len(paragraphs))/max(paragraph_count(train)+paragraph_count(validation)+len(paragraphs),1)
        percentage_in_train_if_adding_to_validation = paragraph_count(train)/max(paragraph_count(train)+paragraph_count(validation)+len(paragraphs),1)
        if abs(percentage_in_train_if_adding_to_train-percentage_in_train)<abs(percentage_in_train_if_adding_to_validation-percentage_in_train):
            train[book] = paragraphs
        else:
            validation[book] = paragraphs
    return train, validation
    
def split_train_validation_all_authors(book_to_paragraphs_author1: dict, book_to_paragraphs_author2: dict, percentage_in_train:float) :
    """Build balanced train/validation datasets for the two authors.

    Each author is first split book by book. Then both authors are reduced
    to the same number of paragraphs in train and in validation. Finally the
    larger of the two sides is shrunk so that train holds about
    ``percentage_in_train`` of the paragraphs.

    Args:
        book_to_paragraphs_author1: dataset of the first author.
        book_to_paragraphs_author2: dataset of the second author.
        percentage_in_train: target share of paragraphs in train, as a ratio in [0, 1].

    Returns:
        ``(train_author1, validation_author1, train_author2, validation_author2)``.
    """
    train_author1,validation_author1 = split_train_validation_single_author(book_to_paragraphs_author1, percentage_in_train)
    train_author2,validation_author2 = split_train_validation_single_author(book_to_paragraphs_author2, percentage_in_train)

    # Same number of validation paragraphs for both authors.
    target_length_validation = min(paragraph_count(validation_author1),paragraph_count(validation_author2))
    validation_author1 = reduce_to_paragraph_count(validation_author1, target_length_validation)
    validation_author2 = reduce_to_paragraph_count(validation_author2, target_length_validation)

    # Same number of train paragraphs for both authors.
    target_length_train = min(paragraph_count(train_author1),paragraph_count(train_author2))
    train_author1 = reduce_to_paragraph_count(train_author1, target_length_train)
    train_author2 = reduce_to_paragraph_count(train_author2, target_length_train)

    total_length = target_length_train + target_length_validation
    if total_length == 0:
        # Nothing left to rebalance; this also avoids dividing by zero below.
        return train_author1,validation_author1,train_author2,validation_author2

    if target_length_train / total_length > percentage_in_train:
        # Train is too large. The ratio is at most 1, so percentage_in_train < 1
        # here and the division is safe.
        wanted = int((percentage_in_train / (1 - percentage_in_train)) * target_length_validation)
        # Clamp: float rounding must never ask for more than what is available.
        target_length_train = min(wanted, target_length_train)
        train_author1 = reduce_to_paragraph_count(train_author1, target_length_train)
        train_author2 = reduce_to_paragraph_count(train_author2, target_length_train)
    elif percentage_in_train > 0:
        # Validation is too large (or already right).
        wanted = int(((1 - percentage_in_train) / percentage_in_train) * target_length_train)
        target_length_validation = min(wanted, target_length_validation)
        validation_author1 = reduce_to_paragraph_count(validation_author1, target_length_validation)
        validation_author2 = reduce_to_paragraph_count(validation_author2, target_length_validation)
    # When percentage_in_train <= 0 the train side is already empty and the
    # validation side is kept as is.
    return train_author1,validation_author1,train_author2,validation_author2


def compute_normalized_words_to_stats(paragraphs: List[str], hyperparameters: dict) -> dict:
    """Count the words of ``paragraphs``, grouped by normalized form.

    Args:
        paragraphs: texts to analyse.
        hyperparameters: must hold the ``use_lowercase``, ``use_diacritics``
            and ``use_stemming`` flags forwarded to ``normalize_word``.

    Returns:
        A dict mapping each normalized word to a ``MaxHeap`` that counts the
        original spellings seen for it.
    """
    use_lowercase = hyperparameters['use_lowercase']
    use_diacritics = hyperparameters['use_diacritics']
    use_stemming = hyperparameters['use_stemming']
    normalized_words_to_stats = dict()
    for paragraph in paragraphs:
        # split_text is the tokenizer used by the scoring functions too; calling
        # it here (instead of repeating its regex) keeps both sides in agreement.
        for original_word in split_text(paragraph):
            normalized_word = normalize_word(original_word, use_lowercase, use_diacritics, use_stemming)
            if normalized_word not in normalized_words_to_stats:
                normalized_words_to_stats[normalized_word] = MaxHeap()
            normalized_words_to_stats[normalized_word].add_item(original_word)
    return normalized_words_to_stats



## Création d'un fichier de statistiques

In [None]:

# Settings used to load and normalize the texts for the statistics file.
hyperparameters = dict()
hyperparameters['min_paragraph_length'] = 150
hyperparameters['use_lowercase'] = True
hyperparameters['use_diacritics'] = True
hyperparameters['use_stemming'] = True

# One dataset per author, read from the 'moliere' and 'corneille' sub-directories.
moliere_dataset = load_all_books(os.path.join(directory, 'moliere'), hyperparameters['min_paragraph_length'])
corneille_dataset = load_all_books(os.path.join(directory, 'corneille'), hyperparameters['min_paragraph_length'])


# Word statistics over all the paragraphs of each author (no train/validation split here).
stats_moliere = compute_normalized_words_to_stats(all_paragraphs(moliere_dataset), hyperparameters)
stats_corneille = compute_normalized_words_to_stats(all_paragraphs(corneille_dataset), hyperparameters)

moliere_total_word_count = sum([c.total_count for c in stats_moliere.values()])
corneille_total_word_count = sum([c.total_count for c in stats_corneille.values()])

# Every normalized word used by at least one author, sorted alphabetically: one row per word.
normalized_words = list((set(stats_moliere.keys())|set(stats_corneille.keys())))
normalized_words.sort()
# One list per output column; all are filled in the order of 'normalized_words'.
moliere_normalized_word_count_in_percentage = []
moliere_normalized_word_count = []
corneille_normalized_word_count_in_percentage = []
corneille_normalized_word_count = []
moliere_most_common_original_word = []
moliere_most_common_original_word_count = []
corneille_most_common_original_word = []
corneille_most_common_original_word_count = []


for normalized_word in normalized_words:
    if normalized_word in stats_moliere:
        stat = stats_moliere[normalized_word]
        # Despite the name, this is a ratio (count / total), not a value multiplied by 100.
        moliere_normalized_word_count_in_percentage.append(stat.total_count/moliere_total_word_count)
        moliere_normalized_word_count.append(stat.total_count)
        most_common_word, most_common_word_count = stat.get_max_item()
        moliere_most_common_original_word.append(most_common_word)
        moliere_most_common_original_word_count.append(most_common_word_count)
    else:
        # Word never used by Molière.
        moliere_normalized_word_count_in_percentage.append(0)
        moliere_normalized_word_count.append(0)
        moliere_most_common_original_word.append(None)
        moliere_most_common_original_word_count.append(None)
    if normalized_word in stats_corneille:
        stat = stats_corneille[normalized_word]
        corneille_normalized_word_count_in_percentage.append(stat.total_count/corneille_total_word_count)
        corneille_normalized_word_count.append(stat.total_count)
        most_common_word, most_common_word_count = stat.get_max_item()
        corneille_most_common_original_word.append(most_common_word)
        corneille_most_common_original_word_count.append(most_common_word_count)
    else:
        # Word never used by Corneille.
        corneille_normalized_word_count_in_percentage.append(0)
        corneille_normalized_word_count.append(0)
        corneille_most_common_original_word.append(None)
        corneille_most_common_original_word_count.append(None)

fhr_stats = pd.DataFrame(
    {'normalized_words': normalized_words,
    'moliere_count_in_percentage': moliere_normalized_word_count_in_percentage,
    'moliere_count': moliere_normalized_word_count,
    'corneille_count_in_percentage' : corneille_normalized_word_count_in_percentage,
    'corneille_count' : corneille_normalized_word_count,
    'moliere_most_common_original_word' : moliere_most_common_original_word,
    'moliere_most_common_original_word_count' : moliere_most_common_original_word_count,
    'corneille_most_common_original_word' : corneille_most_common_original_word,
    'corneille_most_common_original_word_count' : corneille_most_common_original_word_count,
    })

# Save these statistics to disk.
fhr_stats.to_csv(os.path.join(directory, 'stylometrie_stats.csv'), index=False)         
        

## Entraînement et calcul des métriques

In [None]:

def calcul_f1_score(TP: int, TN: int, FP: int, FN: int):
    """Return the F1 score, ``2*TP / (2*TP + FP + FN)``.

    The denominator used to be ``TP + FP + FN``, which yields values above 1
    (for instance 2.0 for a perfect classifier). ``TN`` is not used by the
    formula; it is accepted so that all metric helpers share one signature.
    Returns 0 when the denominator would be zero.
    """
    return (2*TP)/max(2*TP+FP+FN,1)
def calcul_accuracy_target_0(TP: int, TN: int, FP: int, FN: int):
    """Return the share of class-0 samples (TN + FP) that were predicted as class 0; 0 when there are none."""
    class_0_samples = TN + FP
    return TN / max(class_0_samples, 1)
def calcul_accuracy_target_1(TP: int, TN: int, FP: int, FN: int):
    """Return the share of class-1 samples (TP + FN) that were predicted as class 1; 0 when there are none."""
    class_1_samples = TP + FN
    return TP / max(class_1_samples, 1)
def calcul_accuracy(TP: int, TN: int, FP: int, FN: int):
    """Return the share of correctly classified samples among all samples; 0 when there are none."""
    correct = TP + TN
    sample_count = correct + FP + FN
    return correct / max(sample_count, 1)


def delete_keys(data:dict, keys_to_delete) -> None:
    """Remove, in place, every key of ``keys_to_delete`` that is present in ``data``; absent keys are ignored."""
    for key in keys_to_delete:
        data.pop(key, None)

def compute_author_score_v1(text:str, most_common_words_for_author: dict, use_lowercase: bool, use_diacritics: bool, use_stemming: bool) -> float:
    """Score v1: number of words of ``text`` whose normalized form belongs to the author's vocabulary.

    Every occurrence counts, so a repeated word is counted several times.
    """
    return sum(
        1
        for token in split_text(text)
        if normalize_word(token, use_lowercase, use_diacritics, use_stemming) in most_common_words_for_author
    )

def compute_author_score_v2(text:str, most_common_words_for_author: dict, use_lowercase: bool, use_diacritics: bool, use_stemming: bool) -> float:
    """Score v2: like v1, but each matching occurrence is worth 1 plus the value stored for the word.

    ``most_common_words_for_author`` maps a normalized word to that value.
    """
    total = 0
    normalized_tokens = (
        normalize_word(token, use_lowercase, use_diacritics, use_stemming)
        for token in split_text(text)
    )
    # Plain left-to-right accumulation, one addition per matching occurrence.
    for normalized in normalized_tokens:
        if normalized in most_common_words_for_author:
            total += 1+most_common_words_for_author[normalized]
    return total

def compute_author_score_v3(text:str, most_common_words_for_author: dict, use_lowercase: bool, use_diacritics: bool, use_stemming: bool) -> float:
    """Score v3: number of distinct normalized words of ``text`` that belong to the author's vocabulary.

    Unlike v1, a word repeated in the text is counted once.
    """
    matched_words = {
        normalized
        for normalized in (
            normalize_word(token, use_lowercase, use_diacritics, use_stemming)
            for token in split_text(text)
        )
        if normalized in most_common_words_for_author
    }
    return len(matched_words)

def compute_author_score_v4(text:str, most_common_words_for_author: dict, use_lowercase: bool, use_diacritics: bool, use_stemming: bool) -> float:
    """Score v4: like v2, but each distinct normalized word contributes only once.

    Each matching word is worth 1 plus the value stored for it in
    ``most_common_words_for_author``.
    """
    # dict.fromkeys removes duplicates while keeping the order of first
    # occurrence, so the additions happen in the order the words appear.
    distinct_matches = dict.fromkeys(
        normalized
        for normalized in (
            normalize_word(token, use_lowercase, use_diacritics, use_stemming)
            for token in split_text(text)
        )
        if normalized in most_common_words_for_author
    )
    total = 0
    for normalized in distinct_matches:
        total += 1+most_common_words_for_author[normalized]
    return total


def compute_confusion_matrix(text_author1: List[str], text_author2: List[str], most_common_words_author1: dict , most_common_words_author2: dict, hyperparameters: dict, verbose:bool) ->Tuple[int,int,int,int]:
    """Classify every text and count the outcomes.

    Author1 (Molière) is the positive class, author2 (Corneille) the
    negative one. A text is attributed to the author whose vocabulary gives
    it the highest score; when both scores are equal the author is picked
    with ``random.random()``.

    Args:
        text_author1: texts actually written by author1.
        text_author2: texts actually written by author2.
        most_common_words_author1: vocabulary of author1, passed to the scoring function.
        most_common_words_author2: vocabulary of author2, passed to the scoring function.
        hyperparameters: provides the scoring function (``compute_author_score``)
            and the three normalization flags.
        verbose: when true, prints the first example of each outcome and the
            share of texts with equal scores.

    Returns:
        ``(TP, TN, FP, FN)``.
    """
    scorer = hyperparameters['compute_author_score']
    normalization = (
        hyperparameters['use_lowercase'],
        hyperparameters['use_diacritics'],
        hyperparameters['use_stemming'],
    )

    def judge(text):
        """Return (score Molière, score Corneille, scores are equal, attributed to Molière)."""
        moliere_score = scorer(text, most_common_words_author1, *normalization)
        corneille_score = scorer(text, most_common_words_author2, *normalization)
        tied = moliere_score == corneille_score
        # The random draw only happens on a tie.
        picks_moliere = moliere_score > corneille_score or (tied and random.random() > 0.5)
        return moliere_score, corneille_score, tied, picks_moliere

    true_pos = 0   # written by Molière,   attributed to Molière
    true_neg = 0   # written by Corneille, attributed to Corneille
    false_neg = 0  # written by Molière,   attributed to Corneille
    false_pos = 0  # written by Corneille, attributed to Molière
    ties = 0

    for t in text_author1:
        score_author1, score_author2, tied, picks_moliere = judge(t)
        if tied:
            ties += 1
        if picks_moliere:
            if verbose and true_pos == 0:
                print(f'\nExemple de TP (Texte de Molière, bien identifié, score Molière: {round(score_author1,4)}, score Corneille: {round(score_author2,4)}):\n{t}\n')
            true_pos += 1
        else:
            if verbose and false_neg == 0:
                print(f'\nExemple de FN (Texte de Molière, mal identifié, score Molière: {round(score_author1,4)}, score Corneille: {round(score_author2,4)}):\n{t}\n')
            false_neg += 1

    for t in text_author2:
        score_author1, score_author2, tied, picks_moliere = judge(t)
        if tied:
            ties += 1
        if picks_moliere:
            if verbose and false_pos == 0:
                print(f'\nExemple de FP (Texte de Corneille, mal identifié, score Molière: {round(score_author1,4)}, score Corneille: {round(score_author2,4)}):\n{t}\n')
            false_pos += 1
        else:
            if verbose and true_neg == 0:
                print(f'\nExemple de TN (Texte de Corneille, bien identifié, score Molière: {round(score_author1,4)}, score Corneille: {round(score_author2,4)}):\n{t}\n')
            true_neg += 1

    if verbose:
        print(f'Non différentiables: {ties/max(len(text_author1)+len(text_author2),1)}')
    return (true_pos, true_neg, false_pos, false_neg)
        
    
def train(hyperparameters: dict, verbose: bool):
    """Load both authors, build their vocabularies on the train split and evaluate on validation.

    Steps: reseed the random generator, load the 'moliere' and 'corneille'
    folders, split them into balanced train/validation datasets, compute the
    word statistics of each train dataset, optionally remove the stopwords,
    keep the most frequent normalized words of each author, then classify
    the validation paragraphs.

    Args:
        hyperparameters: reads 'min_paragraph_length', 'percentage_in_train',
            'use_stopword' and 'most_common_normalized_words_count' here; the
            helpers called read the normalization flags, the scoring function
            and, when stopwords are enabled, the 'stopword_*' settings.
        verbose: when true, prints a description of each dataset and example
            classifications.

    Returns:
        A 12-tuple: (TP, TN, FP, FN, train_moliere, validation_moliere,
        train_corneille, validation_corneille, normalized_words_to_stats_moliere,
        normalized_words_to_stats_corneille, most_common_moliere, most_common_corneille).
    """
    # Reseed so that the random tie-breaks of the evaluation are the same for every call.
    random.seed(42)
    min_paragraph_length = hyperparameters['min_paragraph_length']
    moliere_dataset = load_all_books(os.path.join(directory, 'moliere'), min_paragraph_length)
    corneille_dataset = load_all_books(os.path.join(directory, 'corneille'), min_paragraph_length)
    if verbose: 
        print(f'\nMoliere Dataset: {paragraph_count(moliere_dataset)} paragraphes ({word_count(moliere_dataset)} mots) venant de {len(moliere_dataset)} oeuvres:\n{list(moliere_dataset.keys())}')
        print(f'\nCorneille Dataset: {paragraph_count(corneille_dataset)} paragraphes ({word_count(corneille_dataset)} mots) venant de {len(corneille_dataset)} oeuvres:\n{list(corneille_dataset.keys())}')

    percentage_in_train = hyperparameters['percentage_in_train']
    train_moliere,validation_moliere,train_corneille,validation_corneille = split_train_validation_all_authors(moliere_dataset, corneille_dataset, percentage_in_train)

    if verbose: 
        print(f'\nMoliere Train Dataset: {paragraph_count(train_moliere)} paragraphes ({word_count(train_moliere)} mots) venant de {len(train_moliere)} oeuvres:\n{list(train_moliere.keys())}')
        print(f'\nMoliere Validation Dataset: {paragraph_count(validation_moliere)} paragraphes ({word_count(validation_moliere)} mots) venant de {len(validation_moliere)} oeuvres:\n{list(validation_moliere.keys())}')
        print(f'\nCorneille Train Dataset: {paragraph_count(train_corneille)} paragraphes ({word_count(train_corneille)} mots) venant de {len(train_corneille)} oeuvres:\n{list(train_corneille.keys())}')
        print(f'\nCorneille Validation Dataset: {paragraph_count(validation_corneille)} paragraphes ({word_count(validation_corneille)} mots) venant de {len(validation_corneille)} oeuvres:\n{list(validation_corneille.keys())}')

    # The vocabularies are learnt on the train datasets only.
    normalized_words_to_stats_moliere = compute_normalized_words_to_stats(all_paragraphs(train_moliere), hyperparameters)
    normalized_words_to_stats_corneille = compute_normalized_words_to_stats(all_paragraphs(train_corneille), hyperparameters)

    if hyperparameters['use_stopword']:
        stopwords = find_stopwords(normalized_words_to_stats_moliere, normalized_words_to_stats_corneille, hyperparameters)
        # NOTE(review): this print is not guarded by 'verbose', unlike the other prints of this function.
        print(f'{len(stopwords)} mots vides : {list(stopwords)[:10]}')
        # we remove stopwords
        delete_keys(normalized_words_to_stats_moliere, stopwords)
        delete_keys(normalized_words_to_stats_corneille, stopwords)

    def most_common_normalized_words(normalized_words_to_stats: dict, count: int) -> Dict[str,float]:
        """Return the 'count' most frequent normalized words, each mapped to its share of their combined count."""
        sorted_by_total_count = sorted(normalized_words_to_stats.items(), key=lambda item:item[1].total_count, reverse=True)[:count]
        total_count = sum([c[1].total_count for c in sorted_by_total_count])
        result  = dict()
        for normalied_word,stats in sorted_by_total_count:
            result[normalied_word] = stats.total_count/total_count
        return result


    most_common_normalized_words_count = hyperparameters['most_common_normalized_words_count']

    # we only keep the most common words
    most_common_moliere = most_common_normalized_words(normalized_words_to_stats_moliere, most_common_normalized_words_count)
    most_common_corneille = most_common_normalized_words(normalized_words_to_stats_corneille, most_common_normalized_words_count)
    # Evaluation on the validation paragraphs (Molière is the positive class).
    (TP,TN,FP,FN) = compute_confusion_matrix(all_paragraphs(validation_moliere), all_paragraphs(validation_corneille), most_common_moliere , most_common_corneille, hyperparameters, verbose)
    return (TP,TN,FP,FN,train_moliere,validation_moliere,train_corneille,validation_corneille,normalized_words_to_stats_moliere, normalized_words_to_stats_corneille, most_common_moliere, most_common_corneille)

# Set to True to print the dataset descriptions and example classifications.
verbose = False

# Settings of the run; this replaces the 'hyperparameters' dict of the statistics cell.
hyperparameters = dict()
hyperparameters['min_paragraph_length'] = 150
hyperparameters['percentage_in_train'] = 0.8
hyperparameters['most_common_normalized_words_count'] = 300
hyperparameters['use_lowercase'] = True
hyperparameters['use_diacritics'] = True
hyperparameters['use_stemming'] = True
# Scoring function used to compare a text with an author's vocabulary (v1 to v4 are available).
hyperparameters['compute_author_score'] = compute_author_score_v1
hyperparameters['use_stopword'] = False
# Stopword settings, only read when 'use_stopword' is True.
#hyperparameters['stopword_min_frequency_in_percentage'] = 0.001
#hyperparameters['stopword_max_frequency_diff_in_percentage_between_authors'] = 0.2
#hyperparameters['stopword_max_length'] = 2
#hyperparameters['stopword_for_length_less_or_equal'] = 0

#if verbose:    print(hyperparameters)
# The datasets, statistics and vocabularies returned here are reused by the cells below.
(TP,TN,FP,FN,train_moliere,validation_moliere,train_corneille,validation_corneille,normalized_words_to_stats_moliere, normalized_words_to_stats_corneille, most_common_moliere, most_common_corneille) = train(hyperparameters, verbose)
print()
print('-'*80)
# Overall accuracy, then the accuracy on each author's own validation paragraphs.
print(f'Validation Accuracy: {round(calcul_accuracy(TP,TN,FP,FN),4)}  (Accuracy(Moliere): {round(calcul_accuracy_target_1(TP,TN,FP,FN),4)} / Accuracy(Corneille): {round(calcul_accuracy_target_0(TP,TN,FP,FN),4)})')
print('-'*80)


## Affichage des données sous la forme d'un nuage de mots-clés

In [None]:
def display_wordcloud(most_common_normalized_words: dict, normalized_words_to_stats_moliere: dict, title:str) -> None:
    """Display a word cloud of an author's most common words.

    Each normalized word is shown under its most common original spelling,
    and only when that spelling has more than 4 characters.

    Args:
        most_common_normalized_words: maps a normalized word to the frequency used for its size.
        normalized_words_to_stats_moliere: word statistics of the author being
            displayed. Despite its name it is not specific to Molière: the
            notebook also passes Corneille's statistics.
        title: title of the figure.
    """
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    word_frequencies = dict()
    for normalized_word, frequency in most_common_normalized_words.items():
        original_word = normalized_words_to_stats_moliere[normalized_word].most_common_original_word()
        if len(original_word) > 4:
            # If two normalized words share the same original spelling, the last one wins.
            word_frequencies[original_word] = frequency
    wordcloud = WordCloud(width=1200, height=600, background_color='white').generate_from_frequencies(word_frequencies)
    # Display the generated word cloud
    plt.figure(figsize=(20, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title, fontsize=20)
    plt.axis('off')
    plt.show()
    
# One word cloud per author, built from the vocabularies returned by train().
display_wordcloud(most_common_moliere, normalized_words_to_stats_moliere, "Moliere (uniquement les mots d'au moins 5 lettres)")
display_wordcloud(most_common_corneille, normalized_words_to_stats_corneille, "Corneille (uniquement les mots d'au moins 5 lettres)")

In [None]:
# First 20 (normalized word, frequency) pairs of Molière's vocabulary whose normalized form has more than 4 characters.
print("Mots (d'au moins 5 lettres) les plus fréquents chez Moliere\n", "\n".join([str(c) for c in most_common_moliere.items() if len(c[0])>4][:20]))

In [None]:
# First 20 (normalized word, frequency) pairs of Corneille's vocabulary whose normalized form has at least
# 5 characters. The filter used to be len > 5, which dropped the 5-letter words announced by the title
# and did not match the filter applied to Molière.
print("Mots (d'au moins 5 lettres) les plus fréquents chez Corneille\n", "\n".join([str(c) for c in most_common_corneille.items() if len(c[0])>4][:20]))

In [None]:
# For a few normalized words, show how often each author uses them in the train
# datasets, together with the count of every original spelling.
# Raises KeyError if one of these words is missing from an author's statistics.
for normalized_word in ['monsieur', 'madam', 'plus', 'esprit', 'encor']:
    print(f"Le mot '{normalized_word}':")
    print(f'\test présent {normalized_words_to_stats_moliere[normalized_word].total_count} fois chez Moliere:  ', normalized_words_to_stats_moliere[normalized_word].item_counts)
    print(f'\test présent {normalized_words_to_stats_corneille[normalized_word].total_count} fois chez Corneille:', normalized_words_to_stats_corneille[normalized_word].item_counts)


In [None]:
# First 5 paragraphs of one book of Molière's train dataset (raises KeyError if that book is not in the train split).
train_moliere['les_fourberies_de_scapin'][:5]

## Précision pour chaque oeuvre utilisée

In [None]:
# Accuracy per book. Every book of the folder is evaluated, including the books
# whose paragraphs were used to build the vocabularies in train().
print('-'*80+'\nPrécision pour chaque oeuvre de Moliere\n'+'-'*80)
accuracy_moliere = dict()
for book_path in all_txt_files_in_directory(os.path.join(directory, 'moliere')):
    # Only Molière texts are given, so only TP and FN can be non-zero.
    (TP,TN,FP,FN) = compute_confusion_matrix(split_book_into_paragraphs(book_path, hyperparameters['min_paragraph_length']), [], most_common_moliere , most_common_corneille, hyperparameters, False)
    #print(f"Accuracy '{pathlib.Path(book_path).stem}': {round(calcul_accuracy(TP,TN,FP,FN),4)}")
    accuracy_moliere[pathlib.Path(book_path).stem] = calcul_accuracy(TP,TN,FP,FN)
# Books are listed from the least to the most accurately identified.
for e in sorted(accuracy_moliere.items(), key=lambda x: x[1]): print(e)


    
print()
print('-'*80+'\nPrécision pour chaque oeuvre de Corneille\n'+'-'*80)
accuracy_corneille = dict()
for book_path in all_txt_files_in_directory(os.path.join(directory, 'corneille')):
    # Only Corneille texts are given, so only TN and FP can be non-zero.
    (TP,TN,FP,FN) = compute_confusion_matrix([], split_book_into_paragraphs(book_path, hyperparameters['min_paragraph_length']), most_common_moliere , most_common_corneille, hyperparameters, False)
    accuracy_corneille[pathlib.Path(book_path).stem] = calcul_accuracy(TP,TN,FP,FN)
for e in sorted(accuracy_corneille.items(), key=lambda x: x[1]): print(e)
    
    