# Le but de ce notebook est d'identifier des caractéristiques permettant de différencier des textes écrits par Molière et Corneille

# PRIVE: Hyperparamètres fixés par MathAData

In [None]:
# Minimum number of characters in each paragraph whose author we want to identify.
# 1000 characters is roughly one page of text (about 180 words).
min_paragraph_length = 1000

# The training data is split into 90% train and 10% validation;
# a given work must be entirely in train or entirely in validation.
percentage_in_train = 0.9

# Ignore upper/lower case when comparing words: Monsieur == monsieur
use_lowercase = True

# Ignore accents: être == etre
# NOTE: despite its name, True means diacritics are stripped (see compute_normalized_word).
use_diacritics = True

# Only keep the stem of each word.
# Disabled because it clearly degrades performance.
use_stemming = False

# Number of signature words per author: only the
# 'most_common_normalized_words_count' most common words of each author are considered.
most_common_normalized_words_count = 50

# Stop words.
# They are ignored by the tool because they are very common in both Molière and Corneille.
stop_words = {
    "de", "et", "que", "je", "a", "la", "le", "ne", "ce", "il",
    "pour", "un", "qui", "me", "est", "mais", "des", "moi", "votre", "qu'il",
    "lui", "du", "fait", "par", "se", "au", "cette", "sur", "j'ai", "avec",
    "tous", "vos", "ces", "n'est", "peu", "peut", "quelque", "dont", "quoi", "aux",
    "donc", "d'une", "s'il", "notre", "sais", "donne", "vois", "m'en", "cet", "autre",
    "puis", "assez", "quel", "veut", "va", "ils", "doit", "ont", "vu", "en",
    "les", "vous", "mon", "pas", "si", "plus", "tout", "nous", "ma", "sans",
    "ou", "c'est", "bien", "dans", "une", "son", "tu", "point", "mes", "d'un",
    "elle", "ses", "meme", "comme",
}


# PRIVE: Méthodes utilisées dans le Notebook

In [None]:
import functools
from typing import List,Set,Tuple,Dict
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import numpy as np
from scipy.interpolate import make_interp_spline
import os
import pathlib
import re
import os
import random
import math
import pandas as pd

#!pip install unidecode
#!pip install wordcloud
#!pip install pillow


# Working directory: absolute path of the current directory
# (the 'moliere' and 'corneille' corpus folders are looked up below it)
directory = os.path.abspath('')
# Fixed seed for the `random` module
random.seed(42)

# Returns every *.txt file located directly inside the directory 'path'
def all_txt_files_in_directory(path: str) -> List[str]:
    """Return the paths of the regular ``*.txt`` files directly inside *path*.

    The result is sorted by file name: ``os.listdir`` returns entries in
    arbitrary order, and the order of the books drives the greedy
    train/validation split, so sorting keeps the results reproducible
    across machines and file systems.
    """
    candidates = (
        os.path.join(path, name)
        for name in sorted(os.listdir(path))
        if name.endswith('.txt')
    )
    # Sub-directories whose name happens to end with '.txt' are skipped
    return [candidate for candidate in candidates if os.path.isfile(candidate)]

# Tells whether 'line' is a valid line or must be ignored (for instance when it is empty)
def is_valid_line(line: str) -> bool:
    """Return True when *line* should be kept as part of a paragraph.

    A line is rejected when it starts with 'Scène ' or 'Acte ' or when it
    holds fewer than 10 characters (any trailing newline is counted).
    """
    is_heading = line.startswith(('Scène ', 'Acte '))
    is_too_short = len(line) < 10
    return not (is_heading or is_too_short)

def split_book_into_paragraphs(path: str) -> List[str]:
    """Split the book stored at *path* into paragraphs of at least
    ``min_paragraph_length`` characters.

    Valid lines (see ``is_valid_line``) are right-stripped and joined with
    a newline until the running paragraph reaches ``min_paragraph_length``
    characters. A trailing paragraph that is too short is dropped, unless
    it is the only content of the book.

    The file is decoded explicitly as UTF-8 (a leading BOM, if any, is
    skipped) instead of relying on the platform default encoding, which
    would garble accented characters on systems whose default is not UTF-8.
    """
    result = []
    current_paragraph = ""
    with open(path, encoding="utf-8-sig") as file:
        for line in file:
            if not is_valid_line(line):
                continue
            if current_paragraph:
                current_paragraph += "\n"
            current_paragraph += line.rstrip()
            if len(current_paragraph) >= min_paragraph_length:
                result.append(current_paragraph)
                current_paragraph = ""
    # Keep the leftover text only if it is long enough, or if the book would otherwise be empty
    if len(current_paragraph) >= min_paragraph_length or (current_paragraph and len(result) == 0):
        result.append(current_paragraph)
    return result

def load_all_books(path: str) -> Dict[str,List[str]]:
    """Load every *.txt book found in *path*.

    Returns a dictionary mapping the file name without extension to the
    list of paragraphs produced by ``split_book_into_paragraphs``.
    """
    return {
        pathlib.Path(txt_file).stem: split_book_into_paragraphs(txt_file)
        for txt_file in all_txt_files_in_directory(path)
    }

# Removes accents from a word
@functools.lru_cache(maxsize=None)
def remove_diacritics(word: str) -> str:
    """Return *word* as transliterated by the third-party ``unidecode`` package.

    The import is done lazily inside the function; results are memoised
    without any size limit.
    """
    from unidecode import unidecode as transliterate
    return transliterate(word)

# Lower-cased form of a word
@functools.lru_cache(maxsize=None)
def to_lowercase(word: str) -> str:
    """Return *word* converted to lowercase (memoised without size limit)."""
    lowered = word.lower()
    return lowered

# Stemming support (third-party NLTK Snowball stemmer for French)
from nltk.stem.snowball import FrenchStemmer
# Single module-level stemmer instance, shared by every call to to_stemming
stemmer = FrenchStemmer()
@functools.lru_cache(maxsize=None)
def to_stemming(word: str) -> str:
    """Return ``stemmer.stem(word)``; results are memoised without size limit."""
    return stemmer.stem(word)

@functools.lru_cache(maxsize=None)
def compute_normalized_word(word: str) -> str:
    """Return the normalized form of *word*.

    Applies, in this order and only when the matching global flag is set:
    lowercasing (``use_lowercase``), accent removal (``use_diacritics``)
    and stemming (``use_stemming``).

    NOTE: the cache is keyed on the word only, so results computed before
    one of the flags is changed stay in the cache.
    """
    normalized = to_lowercase(word) if use_lowercase else word
    normalized = remove_diacritics(normalized) if use_diacritics else normalized
    normalized = to_stemming(normalized) if use_stemming else normalized
    return normalized

def paragraph_count(book_to_paragraphs: Dict[str, List[str]]) -> int:
    """Return the total number of paragraphs over all books (0 for an empty or None dataset)."""
    total = 0
    for paragraphs in (book_to_paragraphs or {}).values():
        total += len(paragraphs)
    return total

def split_text(text:str) -> List[str]:
    """Split *text* into words, in order of appearance.

    A word is a run of word characters, apostrophes or carets delimited by
    word boundaries, so "j'ai" stays a single token.

    NOTE(review): inside the character class, `^` is not in first position
    and is therefore a literal caret, and `\\d` is already covered by `\\w`;
    digits are NOT excluded. Confirm whether that was the intent.
    """
    stripped = text.rstrip()
    word_pattern = re.compile(r"\b[\w'^\d]+\b")
    return word_pattern.findall(stripped)
            
def word_count(book_to_paragraphs: Dict[str, List[str]]) -> int:
    """Return the total number of words (as split by ``split_text``) over all paragraphs of all books."""
    return sum(
        len(split_text(paragraph.rstrip()))
        for paragraph in all_paragraphs(book_to_paragraphs)
    )

def all_paragraphs(book_to_paragraphs: Dict[str, List[str]]) -> List[str]:
    """Return a new flat list holding the paragraphs of every book, in dictionary order."""
    return [
        paragraph
        for paragraphs in book_to_paragraphs.values()
        for paragraph in paragraphs
    ]

# Reduce the dataset so that it contains exactly 'target_count' paragraphs
def reduce_to_paragraph_count(book_to_paragraphs: dict, target_count: int ) -> Dict[str, List[str]]:
    """Return a reduced copy of the dataset holding exactly *target_count* paragraphs.

    Books are visited from the smallest to the largest: whole books are
    dropped while they fit in the number of paragraphs left to remove, then
    the first book that does not fit is truncated. The remaining books are
    kept untouched. The input dictionary is not modified; the result is
    ordered by increasing book size.

    Raises:
        ValueError: if the dataset holds fewer than *target_count* paragraphs.
    """
    # Same total as paragraph_count(), computed inline
    current_count = sum(len(paragraphs) for paragraphs in book_to_paragraphs.values())
    if current_count < target_count:
        raise ValueError(f'current_count {current_count} < target_count {target_count}')
    to_remove = current_count - target_count
    result = dict()
    for book, paragraphs in sorted(book_to_paragraphs.items(), key=lambda item: len(item[1])):
        if len(paragraphs) <= to_remove:
            # The whole book is dropped (this also drops empty books)
            to_remove -= len(paragraphs)
            continue
        result[book] = paragraphs[:len(paragraphs) - to_remove]
        to_remove = 0
    return result


def compute_most_common_normalized_words(normalized_words_to_stats: Dict[str, Tuple[int,Dict[str,int]] ], most_common_count:int) -> Dict[str,float]:
    """Return the most frequent non stop-words together with their frequency.

    Words are ranked by decreasing total count. Each frequency is the count
    of the word divided by the total count of all words that are not in
    ``stop_words``. The scan stops as soon as *most_common_count* words
    have been collected.
    """
    ranked = sorted(
        normalized_words_to_stats.items(),
        key=lambda entry: entry[1][0],
        reverse=True,
    )
    # Denominator: occurrences of every word that is not a stop word
    total_count = 0
    for word, word_stats in normalized_words_to_stats.items():
        if word not in stop_words:
            total_count += word_stats[0]
    frequencies = {}
    for word, word_stats in ranked:
        if word not in stop_words:
            frequencies[word] = word_stats[0]/total_count
        if len(frequencies) >= most_common_count:
            break
    return frequencies

def compute_normalized_words_to_stats(paragraphs: List[str]) -> Dict[str, Tuple[int,Dict[str,int]] ]:
    """Count the words of *paragraphs*, grouped by normalized form.

    Returns a dictionary mapping each normalized word to a tuple
    ``(total_count, {original_spelling: count})``.
    """
    stats = {}
    for paragraph in paragraphs:
        for spelling in split_text(paragraph):
            key = compute_normalized_word(spelling)
            total, spelling_counts = stats.get(key, (0, {}))
            spelling_counts[spelling] = spelling_counts.get(spelling, 0) + 1
            stats[key] = (total + 1, spelling_counts)
    return stats

def compute_normalized_word_to_original_word(text: str) -> Dict[str,str]:
    """Map each normalized non stop-word of *text* to one of its original spellings.

    When several spellings share the same normalized form, the last one
    met in the text is kept.
    """
    mapping = {}
    for spelling in split_text(text):
        key = compute_normalized_word(spelling)
        if key not in stop_words:
            mapping[key] = spelling
    return mapping

def get_max_item(dic: Dict[str, int]) -> Tuple[str,int]:
    """Return the ``(key, count)`` pair with the highest count.

    On ties the first key in dictionary order wins; an empty dictionary
    yields ``(None, None)``.
    """
    if not dic:
        return None, None
    best_key = max(dic, key=dic.get)
    return best_key, dic[best_key]
    
def split_train_validation_single_author(book_to_paragraphs: dict, percentage_in_train:float) -> Tuple[dict, dict]:
    """Greedily split the books of one author into a train and a validation set.

    Books are visited in dictionary order. Each book goes entirely to the
    side that brings the proportion of paragraphs in train closest to
    *percentage_in_train*; on a tie it goes to validation.

    Returns:
        A ``(train, validation)`` tuple of dictionaries mapping book name
        to paragraphs.
    """
    train = dict()
    validation = dict()
    # Running paragraph counts, so the sets are not recounted for every book
    train_count = 0
    validation_count = 0
    for book, paragraphs in book_to_paragraphs.items():
        book_size = len(paragraphs)
        total_after_adding = max(train_count + validation_count + book_size, 1)
        percentage_in_train_if_adding_to_train = (train_count + book_size) / total_after_adding
        percentage_in_train_if_adding_to_validation = train_count / total_after_adding
        if abs(percentage_in_train_if_adding_to_train - percentage_in_train) < abs(percentage_in_train_if_adding_to_validation - percentage_in_train):
            train[book] = paragraphs
            train_count += book_size
        else:
            validation[book] = paragraphs
            validation_count += book_size
    return train, validation

def split_train_validation_all_authors(book_to_paragraphs_author1: dict, book_to_paragraphs_author2: dict, percentage_in_train:float) :
    """Build balanced train/validation sets for two authors.

    Each author is first split on its own. Both validation sets, then both
    train sets, are reduced to the size of the smaller one so that the two
    authors are equally represented. Finally the larger side (train or
    validation) is reduced so that the proportion of paragraphs in train
    does not exceed *percentage_in_train*.

    Returns:
        ``(train_author1, validation_author1, train_author2, validation_author2)``.
    """
    train_author1,validation_author1 = split_train_validation_single_author(book_to_paragraphs_author1, percentage_in_train)
    train_author2,validation_author2 = split_train_validation_single_author(book_to_paragraphs_author2, percentage_in_train)

    # Same number of validation paragraphs for both authors
    target_length_validation = min(paragraph_count(validation_author1),paragraph_count(validation_author2))
    validation_author1 = reduce_to_paragraph_count(validation_author1, target_length_validation)
    validation_author2 = reduce_to_paragraph_count(validation_author2, target_length_validation)

    # Same number of train paragraphs for both authors
    target_length_train = min(paragraph_count(train_author1),paragraph_count(train_author2))
    train_author1 = reduce_to_paragraph_count(train_author1, target_length_train)
    train_author2 = reduce_to_paragraph_count(train_author2, target_length_train)

    total_length = target_length_train + target_length_validation
    if total_length == 0:
        # Nothing left to rebalance (e.g. an empty corpus): avoid dividing by zero
        return train_author1,validation_author1,train_author2,validation_author2

    proportion_in_train = target_length_train/total_length
    # NOTE(review): int() truncates, and the floating-point product may land
    # just below the intended integer, giving one paragraph fewer than expected.
    if proportion_in_train>percentage_in_train:
        # Too much train data: shrink train to match the validation size
        target_length_train = int( (percentage_in_train/(1-percentage_in_train)) *target_length_validation )
        train_author1 = reduce_to_paragraph_count(train_author1, target_length_train)
        train_author2 = reduce_to_paragraph_count(train_author2, target_length_train)
    else:
        # Too much validation data: shrink validation to match the train size
        target_length_validation = int( ((1-percentage_in_train)/percentage_in_train) *target_length_train )
        validation_author1 = reduce_to_paragraph_count(validation_author1, target_length_validation)
        validation_author2 = reduce_to_paragraph_count(validation_author2, target_length_validation)
    return train_author1,validation_author1,train_author2,validation_author2

def calcul_accuracy(TP: int, TN: int, FP: int, FN: int) -> float:
    """Return the overall accuracy ``(TP+TN) / (TP+TN+FP+FN)``, or 0 when there is no sample."""
    return (TP+TN)/max(TP+TN+FP+FN,1)

def calcul_accuracy_corneille(TP: int, TN: int, FP: int, FN: int) -> float:
    """Return ``TN / (TN+FP)``, or 0 when ``TN+FP`` is 0. TP and FN are not used."""
    return TN/max(TN+FP,1)

def calcul_accuracy_moliere(TP: int, TN: int, FP: int, FN: int) -> float:
    """Return ``TP / (TP+FN)``, or 0 when ``TP+FN`` is 0. TN and FP are not used."""
    return TP/max(TP+FN,1)

def compute_author_score(text:str, most_common_words_for_author: Dict[str,float]) -> float:
    """Return the number of words of *text* whose normalized form is one of the author's most common words.

    Every occurrence counts: a signature word repeated three times adds 3.
    """
    return sum(
        1
        for spelling in split_text(text)
        if compute_normalized_word(spelling) in most_common_words_for_author
    )

def compute_confusion_matrix_single_author(author_name:str, texts_from_author: List[str], text_from_other_authors: List[str], most_common_words_from_author: dict , threshold_author_score: int, verbose:bool) ->Tuple[int,int,int,int]:
    """Classify every text with the "score >= threshold" rule and count the outcomes.

    A text is predicted as written by *author_name* when its score (see
    ``compute_author_score``) is at least *threshold_author_score*. When
    *verbose* is set, the first example met for each outcome is printed.

    Returns:
        The tuple ``(TP, TN, FP, FN)``.
    """
    TP = 0 # y_true = author_name ,  y_pred = author_name
    TN = 0 # y_true = another author, y_pred = another author
    FN = 0 # y_true = author_name,   y_pred = another author
    FP = 0 # y_true = another_author, y_pred = author_name
    for t in texts_from_author:
        score_author = compute_author_score(t, most_common_words_from_author)
        if score_author>=threshold_author_score:
            if TP == 0 and verbose:
                print(f'\nExemple de TP (Texte de {author_name}, bien identifié, score {author_name}: {round(score_author,4)}):\n{t}\n')
            TP += 1
        else:
            if FN == 0 and verbose:
                print(f'\nExemple de FN (Texte de {author_name}, mal identifié, score {author_name}: {round(score_author,4)}):\n{t}\n')
            FN += 1
    for t in text_from_other_authors:
        score_author = compute_author_score(t, most_common_words_from_author)
        if score_author>=threshold_author_score:
            if FP == 0 and verbose:
                # The label after "score" is the author name, as in the TP/FN messages
                print(f"\nExemple de FP (Texte d'un autre auteur, mal identifié, score {author_name}: {round(score_author,4)}):\n{t}\n")
            FP += 1
        else:
            if TN == 0 and verbose:
                print(f"\nExemple de TN (Texte d'un autre auteur, bien identifié, score {author_name}: {round(score_author,4)}):\n{t}\n")
            TN += 1
    return (TP,TN,FP,FN)
        
def train_single_author(author_dataset, author_name, other_authors_dataset, threshold_author_score: int, verbose: bool = False) -> Tuple[int,int,int,int]:
    """Evaluate the "signature words" rule for one author.

    The datasets are split into balanced train/validation sets, the most
    common words of the author are computed on the train set, and the
    validation texts are then classified with *threshold_author_score*.

    Returns:
        The confusion matrix ``(TP, TN, FP, FN)`` measured on the validation sets.
    """
    random.seed(42)
    if verbose:
        print(f'\n{author_name} Dataset: {paragraph_count(author_dataset)} paragraphes ({word_count(author_dataset)} mots) venant de {len(author_dataset)} oeuvres:\n{list(author_dataset.keys())}')
        print(f'\nother_authors Dataset: {paragraph_count(other_authors_dataset)} paragraphes ({word_count(other_authors_dataset)} mots) venant de {len(other_authors_dataset)} oeuvres:\n{list(other_authors_dataset.keys())}')

    splits = split_train_validation_all_authors(author_dataset, other_authors_dataset, percentage_in_train)
    train_author, validation_author, train_other_authors, validation_other_authors = splits

    if verbose:
        print(f'\n{author_name} Train Dataset: {paragraph_count(train_author)} paragraphes ({word_count(train_author)} mots) venant de {len(train_author)} oeuvres:\n{list(train_author.keys())}')
        print(f'\n{author_name} Validation Dataset: {paragraph_count(validation_author)} paragraphes ({word_count(validation_author)} mots) venant de {len(validation_author)} oeuvres:\n{list(validation_author.keys())}')
        print(f'\nother_authors Train Dataset: {paragraph_count(train_other_authors)} paragraphes ({word_count(train_other_authors)} mots) venant de {len(train_other_authors)} oeuvres:\n{list(train_other_authors.keys())}')
        print(f'\nother_authors Validation Dataset: {paragraph_count(validation_other_authors)} paragraphes ({word_count(validation_other_authors)} mots) venant de {len(validation_other_authors)} oeuvres:\n{list(validation_other_authors.keys())}')

    # Only the most common words of the author's train set are kept as signature words
    train_stats = compute_normalized_words_to_stats(all_paragraphs(train_author))
    signature_words = compute_most_common_normalized_words(train_stats, most_common_normalized_words_count)
    return compute_confusion_matrix_single_author(
        author_name,
        all_paragraphs(validation_author),
        all_paragraphs(validation_other_authors),
        signature_words,
        threshold_author_score,
        verbose,
    )


def create_table_with_occurences_corneille_moliere(original_words: List[str]) -> pd.DataFrame:
    """Build a table with the number of occurrences of each word in Molière and in Corneille.

    The words are listed in sorted order and indexed by the 'Mot' column.
    Counts are read from the global ``stats_moliere`` / ``stats_corneille``
    dictionaries using the normalized form of each word; a word that is
    missing from an author's statistics counts 0.

    The caller's list is left untouched: a sorted copy is used instead of
    sorting *original_words* in place.
    """
    sorted_words = sorted(original_words)
    occurences_moliere = []
    occurences_corneille = []
    for original_word in sorted_words:
        normalized_word = compute_normalized_word(original_word)
        occurences_moliere.append(stats_moliere[normalized_word][0] if normalized_word in stats_moliere else 0)
        occurences_corneille.append(stats_corneille[normalized_word][0] if normalized_word in stats_corneille else 0)
    df = pd.DataFrame(
        {'Mot': sorted_words,
        'Molière': occurences_moliere,
        'Corneille': occurences_corneille}    )
    df.set_index(['Mot'],inplace=True)
    return df

def display_wordcloud(most_common_normalized_words: dict, normalized_words_to_stats: Dict[str, Tuple[int,Dict[str,int]] ], title:str) -> None:
    """Display a word cloud of the most common words.

    Each normalized word is shown under its most frequent original
    spelling; stop words are left out.
    """
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    frequencies_by_spelling = {}
    for normalized_word, frequency in most_common_normalized_words.items():
        spelling_counts = normalized_words_to_stats[normalized_word][1]
        displayed_word, _ = get_max_item(spelling_counts)
        if normalized_word in stop_words:
            continue
        frequencies_by_spelling[displayed_word] = frequency
    cloud = WordCloud(width=1200, height=600, background_color='white')
    image = cloud.generate_from_frequencies(frequencies_by_spelling)
    # Render the generated word cloud
    plt.figure(figsize=(20, 10))
    plt.imshow(image, interpolation='bilinear')
    plt.title(title, fontsize=20)
    plt.axis('off')
    plt.show()
    
def display_plot(most_common_normalized_words: Dict[str,float], normalized_words_to_stats: Dict[str, Tuple[int,Dict[str,int]] ], author:str, top_k: int, display_in_percentage: bool):
    """Display a bar plot of the *top_k* most common words of *author*.

    Each word is labelled with its most frequent original spelling. The
    bar height is the frequency when *display_in_percentage* is set, the
    raw number of occurrences otherwise. Stop words are skipped.
    """
    labels = []
    heights = []
    for normalized_word, frequency in most_common_normalized_words.items():
        word_stats = normalized_words_to_stats[normalized_word]
        label = get_max_item(word_stats[1])[0]
        if normalized_word in stop_words:
            continue
        labels.append(label)
        heights.append(frequency if display_in_percentage else word_stats[0])
        if len(labels) >= top_k:
            break
    display_key_values_plot(
        labels,
        heights,
        f'Les {top_k} mots les plus communs chez {author}',
        display_in_percentage,
    )

def display_dict_plot(dico, title: str, sort_dictionary:bool, display_in_percentage: bool):
    """Display a bar plot of the ``key -> value`` pairs of *dico*.

    Args:
        dico: a mapping (or, when *sort_dictionary* is False, an iterable of
            ``(key, value)`` pairs).
        title: title of the plot.
        sort_dictionary: when True, bars are sorted by decreasing value.
        display_in_percentage: forwarded to ``display_key_values_plot``.
    """
    # Always work on (key, value) pairs: iterating a dict directly yields its
    # keys only, which previously made the unsorted case plot the first two
    # characters of each key instead of the key and its value.
    pairs = list(dico.items()) if hasattr(dico, 'items') else list(dico)
    if sort_dictionary:
        pairs = sorted(pairs, key = lambda x:x[1], reverse=True)
    keys = [pair[0] for pair in pairs]
    frequencies = [pair[1] for pair in pairs]
    display_key_values_plot(keys, frequencies, title, display_in_percentage)
    
def display_key_values_plot(keys: List[str], values, title:str, display_in_percentage: bool):
    """Display a bar plot with one bar per key.

    When *display_in_percentage* is set, the y axis is formatted as a
    percentage (1 == 100%) and labelled as a frequency; otherwise it is
    labelled as a number of occurrences.
    """
    plt.figure(figsize=(20, 5))
    bar_positions = range(len(keys))
    plt.bar(bar_positions, values, color='blue', tick_label=keys)
    plt.title(title, fontsize=25)
    axes = plt.gca()
    if display_in_percentage:
        axes.yaxis.set_major_formatter(PercentFormatter(1))
        y_label = "Fréquence d'occurences"
    else:
        y_label = "Nombre d'occurences"
    plt.ylabel(y_label, fontsize=25)
    axes.tick_params(axis='y', which='major', labelsize=15)
    plt.xticks(rotation=90, fontsize=20)
    # Fit the x axis tightly around the bars
    plt.xlim(-0.5, len(keys) - 0.5)
    plt.show()


## PRIVE: Chargement des données et création d'un fichier de statistiques

In [None]:
# Molière: load the books, count the words and keep the most common ones
moliere_dataset = load_all_books(os.path.join(directory, 'moliere'))
stats_moliere = compute_normalized_words_to_stats(all_paragraphs(moliere_dataset))
most_common_moliere = compute_most_common_normalized_words(stats_moliere, most_common_normalized_words_count)
# Total number of words (stop words included)
moliere_total_word_count = sum([c[0] for c in stats_moliere.values()])

# Corneille: same processing
corneille_dataset = load_all_books(os.path.join(directory, 'corneille'))
stats_corneille = compute_normalized_words_to_stats(all_paragraphs(corneille_dataset))
most_common_corneille = compute_most_common_normalized_words(stats_corneille, most_common_normalized_words_count)
corneille_total_word_count = sum([c[0] for c in stats_corneille.values()])

# Sorted union of the normalized words used by either author
normalized_words = list((set(stats_moliere.keys())|set(stats_corneille.keys())))
normalized_words.sort()
# Columns of the statistics table: one entry per normalized word, in the same order
moliere_normalized_word_count_in_percentage = []
moliere_normalized_word_count = []
corneille_normalized_word_count_in_percentage = []
corneille_normalized_word_count = []
moliere_most_common_original_word = []
moliere_most_common_original_word_count = []
corneille_most_common_original_word = []
corneille_most_common_original_word_count = []


for normalized_word in normalized_words:
    if normalized_word in stats_moliere:
        stat = stats_moliere[normalized_word]
        moliere_normalized_word_count_in_percentage.append(stat[0]/moliere_total_word_count)
        moliere_normalized_word_count.append(stat[0])
        most_common_word, most_common_word_count = get_max_item(stat[1])
        moliere_most_common_original_word.append(most_common_word)
        moliere_most_common_original_word_count.append(most_common_word_count)
    else:
        # Word never used by Molière
        moliere_normalized_word_count_in_percentage.append(0)
        moliere_normalized_word_count.append(0)
        moliere_most_common_original_word.append(None)
        moliere_most_common_original_word_count.append(None)
    if normalized_word in stats_corneille:
        stat = stats_corneille[normalized_word]
        corneille_normalized_word_count_in_percentage.append(stat[0]/corneille_total_word_count)
        corneille_normalized_word_count.append(stat[0])
        most_common_word, most_common_word_count = get_max_item(stat[1])
        corneille_most_common_original_word.append(most_common_word)
        corneille_most_common_original_word_count.append(most_common_word_count)
    else:
        # Word never used by Corneille
        corneille_normalized_word_count_in_percentage.append(0)
        corneille_normalized_word_count.append(0)
        corneille_most_common_original_word.append(None)
        corneille_most_common_original_word_count.append(None)

# One row per normalized word, with the statistics of both authors side by side
fhr_stats = pd.DataFrame(
    {'normalized_words': normalized_words,
    'moliere_count_in_percentage': moliere_normalized_word_count_in_percentage,
    'moliere_count': moliere_normalized_word_count,
    'corneille_count_in_percentage' : corneille_normalized_word_count_in_percentage,
    'corneille_count' : corneille_normalized_word_count,
    'moliere_most_common_original_word' : moliere_most_common_original_word,
    'moliere_most_common_original_word_count' : moliere_most_common_original_word_count,
    'corneille_most_common_original_word' : corneille_most_common_original_word,
    'corneille_most_common_original_word_count' : corneille_most_common_original_word_count,
    })

# Save these statistics to disk
fhr_stats.to_csv(os.path.join(directory, 'stylometrie_stats.csv'), index=False, encoding='utf-8-sig')



<hr style="height:2px; border-width:0; color:black; background-color:black">
<span style="font-size: 48px;">1ère partie</span>
<hr style="height:2px; border-width:0; color:black; background-color:black">

---
# Identification de textes de Molière
---

## Affichage des mots les plus communs chez Molière

### Nuage de mots-clés

In [None]:
# Word cloud of Molière's most common words (stop words excluded)
display_wordcloud(most_common_moliere, stats_moliere, "Molière")

### Affichage d'un graphique par nombre d'occurrences

In [None]:
# Bar plot of the 50 most common words of Molière, as raw numbers of occurrences
display_plot(most_common_moliere, stats_moliere, "Molière", 50, False)

### Affichage d'un graphique par fréquence d'occurrence

In [None]:
# Bar plot of the 50 most common words of Molière, as frequencies (percentages)
display_plot(most_common_moliere, stats_moliere, "Molière", 50, True)

### PRIVE: Affichage des données brutes

In [None]:
# Raw dump: one "(normalized_word, frequency)" line per most common word of Molière
print("Mots les plus fréquents chez Molière\n", "\n".join([f'({normalized_word}, {frequency})'for normalized_word,frequency in most_common_moliere.items() if normalized_word not in stop_words ]))

## Exemple d'analyse pour un texte de Molière (1ère proposition)

In [None]:
# Text to analyse: second paragraph of 'les_fourberies_de_scapin'
texte_a_analyser = moliere_dataset['les_fourberies_de_scapin'][1]
print('Texte à analyser')
print('')
print('-'*50)
print(texte_a_analyser)
print('-'*50)

# Distinct (non stop-word) normalized words of the text, each mapped to one original spelling
normalized_word_to_original_word = compute_normalized_word_to_original_word(texte_a_analyser)
# Original spelling -> frequency in Molière, for the words that are among Molière's most common words
most_common_words_found_in_text = dict()
for normalized_word,original_word in normalized_word_to_original_word.items():
    if normalized_word in most_common_moliere:
        normalized_word_frequency = most_common_moliere[normalized_word]
        most_common_words_found_in_text[original_word] = normalized_word_frequency

title = f"{len(most_common_words_found_in_text)} mots différents de ce texte (contenant {len(normalized_word_to_original_word)} mots différents)\nfont partie des {most_common_normalized_words_count} mots les plus courants chez Molière"
# Bars sorted by decreasing frequency, y axis in percentage
display_dict_plot(most_common_words_found_in_text, title, True, True)        


## Exemple d'analyse pour un texte de Molière (2ème proposition)

In [None]:
# Text to analyse: second paragraph of 'les_fourberies_de_scapin'
texte_a_analyser = moliere_dataset['les_fourberies_de_scapin'][1]
print('Texte à analyser')
print('')
print('-'*50)
print(texte_a_analyser)
print('-'*50)

# One (original_word, frequency) entry per OCCURRENCE of a signature word:
# a word repeated in the text appears several times in the plot
results = []
splitted = split_text(texte_a_analyser)
for original_word in splitted:
    normalized_word = compute_normalized_word(original_word)
    if normalized_word in stop_words:
        continue
    if normalized_word in most_common_moliere:
        normalized_word_frequency = most_common_moliere[normalized_word]
        results.append( (original_word,normalized_word_frequency) )
        
# Sort by decreasing frequency, then split into labels and values
results = sorted(results, key=lambda x:x[1], reverse=True)        
keys = [ c[0] for c in results]
values = [ c[1] for c in results]

title = f"{len(results)} mots de ce texte (contenant {len(splitted)} mots)\nfont partie des {most_common_normalized_words_count} mots les plus courants chez Molière"
display_key_values_plot(keys, values, title, True)        


## Exemple d'analyse pour un texte de Molière (3ème proposition)

In [None]:
# Text to analyse: second paragraph of 'les_fourberies_de_scapin'
texte_a_analyser = moliere_dataset['les_fourberies_de_scapin'][1]
print('Texte à analyser')
print('')
print('-'*50)
print(texte_a_analyser)
print('-'*50)

        
# Original spelling -> frequency in Molière (one bar per distinct spelling)
most_common_words_found_in_text = dict()
# Number of non stop-word occurrences in the text (computed but not displayed)
texte_a_analyser_total_count = 0
# Number of occurrences of signature words in the text (repetitions included)
texte_a_analyser_most_common_count = 0
splitted = split_text(texte_a_analyser)
for original_word in splitted:
    normalized_word = compute_normalized_word(original_word)
    if normalized_word in stop_words:
        continue
    texte_a_analyser_total_count += 1
    if normalized_word in most_common_moliere:
        normalized_word_frequency = most_common_moliere[normalized_word]
        most_common_words_found_in_text[original_word] = normalized_word_frequency
        texte_a_analyser_most_common_count += 1

title = f"{texte_a_analyser_most_common_count} mots de ce texte (contenant {len(splitted)} mots)\nfont partie des {most_common_normalized_words_count} mots les plus courants chez Molière\n(Certains mots peuvent apparaître plusieurs fois)"
display_dict_plot(most_common_words_found_in_text, title, True, True)     

## Affichage d'une courbe permettant de choisir la valeur optimale

In [None]:
# The string literal below holds the (slow) code that produced the hard-coded
# values; it is kept as a string so that it is not executed.
'''
# commenté car long à tourner
valeurs_caracteristique = []
precision_caracteristique = []
for threshold_author in range(0,50+1,1):
    (TP,TN,FP,FN) = train_single_author(moliere_dataset, "Molière", corneille_dataset, threshold_author)
    accuracy = calcul_accuracy(TP,TN,FP,FN)
    valeurs_caracteristique.append(threshold_author)
    precision_caracteristique.append(accuracy)
    print(f"threshold_author={threshold_author} => Précision(Molière)={round(accuracy,4)}")
    
print(valeurs_caracteristique)
print(precision_caracteristique)
'''

# The code above (disabled) computes the following values:
# thresholds 0..50 and the accuracy measured for each of them
valeurs_caracteristique = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]
precision_caracteristique = [0.5, 0.5, 0.5, 0.5033557046979866, 0.5033557046979866, 0.5067114093959731, 0.5134228187919463, 0.5335570469798657, 0.5570469798657718, 0.6073825503355704, 0.6308724832214765, 0.6610738255033557, 0.7114093959731543, 0.7315436241610739, 0.7214765100671141, 0.7416107382550335, 0.761744966442953, 0.7449664429530202, 0.7114093959731543, 0.6677852348993288, 0.6442953020134228, 0.6241610738255033, 0.5771812080536913, 0.5570469798657718, 0.5469798657718121, 0.5469798657718121, 0.5234899328859061, 0.5100671140939598, 0.5067114093959731, 0.5067114093959731, 0.5033557046979866, 0.5033557046979866, 0.5033557046979866, 0.5033557046979866, 0.5033557046979866, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]

import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import make_interp_spline


# Interpolate the measured points with a spline, evaluated on a dense grid
x_dense = np.linspace(min(valeurs_caracteristique), max(valeurs_caracteristique), 500)  # 500 points for a smooth curve
spline = make_interp_spline(valeurs_caracteristique, precision_caracteristique)
y_dense = spline(x_dense)

# Blue line: interpolated curve; red dots: measured values
plt.figure(figsize=(20, 10))
plt.plot(x_dense, y_dense, label='Précision', color='b')
plt.scatter(valeurs_caracteristique, precision_caracteristique, color='r')
plt.gca().tick_params(axis='y', which='major', labelsize=20) 
# Accuracy is stored as a fraction: display it as a percentage (1 == 100%)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.xticks(fontsize=20)
plt.xlabel('Nombres minimum de mots signatures de Molière dans un texte pour le considérer comme écrit par Molière', fontsize=20)
plt.ylabel('Précision', fontsize=20)
plt.title('Evolution de la précision en fonction de la valeur de la caractéristique', fontsize=20)
plt.legend()


## On propose à l'étudiant de choisir la valeur de la caractéristique

In [None]:
# The feature to improve:
# this '10' is meant to be changed by the student
threshold_author = 10

# Accuracy of the "at least threshold_author signature words => Molière" rule on the validation set
(TP,TN,FP,FN) = train_single_author(moliere_dataset, "Molière", corneille_dataset, threshold_author)
print(f'Précision(Molière) avec threshold_author={threshold_author}: {round(calcul_accuracy(TP,TN,FP,FN),4)} ')

<hr style="height:2px; border-width:0; color:black; background-color:black">
<span style="font-size: 48px;">2ème partie</span>
<hr style="height:2px; border-width:0; color:black; background-color:black">

---
# Identification de textes de Corneille
---

## Mots les plus communs chez Corneille

### Nuage de mots-clés

In [None]:
# Word cloud of Corneille's most common words (stop words excluded)
display_wordcloud(most_common_corneille, stats_corneille, "Corneille")

### Affichage d'un graphique par nombre d'occurrences

In [None]:
# Bar plot of the 50 most common words of Corneille, as raw numbers of occurrences
# (display_in_percentage=False, to match the heading and the equivalent Molière cell)
display_plot(most_common_corneille, stats_corneille, "Corneille", 50, False)

### Affichage d'un graphique par fréquence d'occurrence

In [None]:
# Frequency plot announced by this section's heading (same as the equivalent Molière cell)
display_plot(most_common_corneille, stats_corneille, "Corneille", 50, True)
# Raw dump: one "(normalized_word, frequency)" line per most common word of Corneille
print("Mots les plus fréquents chez Corneille\n", "\n".join([f'({normalized_word}, {frequency})'for normalized_word,frequency in most_common_corneille.items() if normalized_word not in stop_words ]))

## Exemple d'analyse pour un texte de Corneille

In [None]:
# Text to analyse: second paragraph of 'le_cid'
texte_a_analyser = corneille_dataset['le_cid'][1]
print('Texte à analyser')
print('')
print('-'*50)
print(texte_a_analyser)
print('-'*50)


# Distinct (non stop-word) normalized words of the text, each mapped to one original spelling
normalized_word_to_original_word = compute_normalized_word_to_original_word(texte_a_analyser)
# Original spelling -> frequency in Corneille, for the words that are among
# Corneille's most common words (the lookup must use Corneille's table, as the title states)
most_common_words_found_in_text = dict()
for normalized_word,original_word in normalized_word_to_original_word.items():
    if normalized_word in most_common_corneille:
        most_common_words_found_in_text[original_word] = most_common_corneille[normalized_word]

title = f"{len(most_common_words_found_in_text)} mots de ce texte (contenant {len(normalized_word_to_original_word)} mots différents)\nfont partie des {most_common_normalized_words_count} mots les plus courants chez Corneille"
display_dict_plot(most_common_words_found_in_text, title, True, True)

## Affichage d'une courbe permettant de choisir la valeur optimale

In [None]:
# Disabled because it takes a long time to run (kept as a string literal so that it is not executed)
'''
valeurs_caracteristique = []
precision_caracteristique = []
for threshold_author in range(0,50+1,1):
    (TP,TN,FP,FN) = train_single_author(corneille_dataset, "Corneille", moliere_dataset, threshold_author)
    accuracy = calcul_accuracy(TP,TN,FP,FN)
    valeurs_caracteristique.append(threshold_author)
    precision_caracteristique.append(accuracy)
    print(f"threshold_author={threshold_author} => Précision(Corneille)={round(accuracy,4)}")
print(valeurs_caracteristique)
print(precision_caracteristique)
'''
# The code above (disabled) computes the following values:
# thresholds 0..50 and the accuracy measured for each of them
valeurs_caracteristique = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]
precision_caracteristique = [0.5, 0.5, 0.5, 0.5033557046979866, 0.5100671140939598, 0.5234899328859061, 0.5503355704697986, 0.6040268456375839, 0.6375838926174496, 0.6644295302013423, 0.7214765100671141, 0.7483221476510067, 0.7651006711409396, 0.7885906040268457, 0.7885906040268457, 0.7718120805369127, 0.7483221476510067, 0.7214765100671141, 0.6812080536912751, 0.6375838926174496, 0.5973154362416108, 0.5805369127516778, 0.5436241610738255, 0.5234899328859061, 0.5167785234899329, 0.5134228187919463, 0.5067114093959731, 0.5067114093959731, 0.5067114093959731, 0.5033557046979866, 0.5033557046979866, 0.5033557046979866, 0.5033557046979866, 0.5033557046979866, 0.5033557046979866, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]


# Interpolate the measured points with a spline, evaluated on a dense grid
x_dense = np.linspace(min(valeurs_caracteristique), max(valeurs_caracteristique), 500)  # 500 points for a smooth curve
spline = make_interp_spline(valeurs_caracteristique, precision_caracteristique)
y_dense = spline(x_dense)

# Blue line: interpolated curve; red dots: measured values
plt.figure(figsize=(20, 10))
plt.plot(x_dense, y_dense, label='Précision', color='b')
plt.scatter(valeurs_caracteristique, precision_caracteristique, color='r')
plt.gca().tick_params(axis='y', which='major', labelsize=20)
# Accuracy is stored as a fraction: display it as a percentage (1 == 100%)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.xticks(fontsize=20)
# This plot is about Corneille's signature words (the label previously said Molière)
plt.xlabel('Nombres minimum de mots signatures de Corneille dans un texte pour le considérer comme écrit par Corneille', fontsize=20)
plt.ylabel('Précision', fontsize=20)
plt.title('Evolution de la précision en fonction de la valeur de la caractéristique', fontsize=20)
plt.legend()
        

## On propose à l'étudiant de choisir la valeur de la caractéristique

In [None]:
# The feature to improve.
# This '10' is meant to be changed by the student.
threshold_author = 10

# train_single_author (defined elsewhere in the notebook) returns the four
# confusion-matrix counts; calcul_accuracy turns them into the printed score.
(TP,TN,FP,FN) = train_single_author(corneille_dataset, "Corneille", moliere_dataset, threshold_author)
print(f'Précision(Corneille) avec threshold_author={threshold_author}: {round(calcul_accuracy(TP,TN,FP,FN),4)} ')

<hr style="height:2px; border-width:0; color:black; background-color:black">
<span style="font-size: 48px;">3ème partie</span>
<hr style="height:2px; border-width:0; color:black; background-color:black">

---
# Différencier des textes de Molière et de Corneille
---

# PRIVE: outil de recherche de textes très spécifiques à Molière ou Corneille

In [None]:
def compute_word_score_moliere_vs_corneille(count_moliere:int, total_words_moliere:int, count_corneille:int, total_words_corneille:int, *, min_total_count: int = 30, dominance_ratio: float = 2) -> int:
    """Tell which author, if any, a word is strongly specific to.

    Args:
        count_moliere: number of occurrences of the word in Molière's corpus.
        total_words_moliere: total number of words in Molière's corpus.
        count_corneille: number of occurrences of the word in Corneille's corpus.
        total_words_corneille: total number of words in Corneille's corpus.
        min_total_count: minimum combined number of occurrences (both authors)
            for the word to be scored at all; rarer words are neutral.
        dominance_ratio: one author's relative frequency must be strictly
            greater than this many times the other's for the word to count
            in that author's favour.

    Returns:
        1 if the word favours Molière, -1 if it favours Corneille,
        0 if it is too rare or not distinctive enough.
    """
    total_count = count_moliere + count_corneille
    # Too rare overall to be meaningful. The explicit zero test only matters
    # when min_total_count <= 0 (with the default it is already covered).
    if total_count < min_total_count or total_count == 0:
        return 0
    # A word used by a single author is distinctive whatever the corpus sizes;
    # returning here also avoids dividing by an empty corpus size below.
    if count_moliere == 0:
        return -1
    if count_corneille == 0:
        return 1
    # Compare relative frequencies so that corpus size does not bias the score.
    frequency_moliere = count_moliere / total_words_moliere
    frequency_corneille = count_corneille / total_words_corneille
    if frequency_moliere > dominance_ratio * frequency_corneille:
        return 1
    if frequency_corneille > dominance_ratio * frequency_moliere:
        return -1
    return 0
    


def find_most_distinctive_lines(is_moliere: bool) -> None:
    """Print the lines of one author whose marked vocabulary points only to that author.

    Walks every line of every paragraph of the selected author's works (the
    module-level ``moliere_dataset`` or ``corneille_dataset``). Each word of a
    line is scored with ``compute_word_score_moliere_vs_corneille`` (positive
    for Molière, negative for Corneille, 0 for neutral) from its occurrence
    counts in the module-level ``stats_moliere`` / ``stats_corneille`` and the
    corpus sizes ``moliere_total_word_count`` / ``corneille_total_word_count``.

    A line is printed when all of the following hold:
      * ``split_text`` yields at least 5 words for it;
      * none of its words is scored in favour of the other author;
      * the absolute value of the summed word scores is at least 5.

    Args:
        is_moliere: True to scan Molière's works, False to scan Corneille's.

    Returns:
        None. The report is written to standard output.
    """
    if is_moliere:
        author_dataset, author_name = moliere_dataset, 'Molière'
    else:
        author_dataset, author_name = corneille_dataset, 'Corneille'
    separator = '-'*50

    for book_name, paragraphs in author_dataset.items():
        for paragraph in paragraphs:
            for line in paragraph.splitlines():
                words = split_text(line)
                if len(words) < 5:
                    continue
                line_score_moliere_vs_corneille = 0
                comments_moliere = []
                comments_corneille = []
                for original_word in words:
                    normalized_word = compute_normalized_word(original_word)
                    count_moliere = stats_moliere[normalized_word][0] if normalized_word in stats_moliere else 0
                    count_corneille = stats_corneille[normalized_word][0] if normalized_word in stats_corneille else 0
                    word_score_moliere_vs_corneille = compute_word_score_moliere_vs_corneille(count_moliere, moliere_total_word_count, count_corneille, corneille_total_word_count)
                    if word_score_moliere_vs_corneille == 0:
                        continue
                    # The counts are displayed with the scanned author first.
                    if is_moliere:
                        comment = f"{original_word} ({count_moliere} vs {count_corneille}) "
                    else:
                        comment = f"{original_word} ({count_corneille} vs {count_moliere}) "
                    if word_score_moliere_vs_corneille > 0:
                        comments_moliere.append(comment)
                    else:
                        comments_corneille.append(comment)
                    line_score_moliere_vs_corneille += word_score_moliere_vs_corneille

                # Discard the line as soon as one word favours the other author.
                if is_moliere and comments_corneille:
                    continue
                if not is_moliere and comments_moliere:
                    continue
                # Keep only lines carrying enough distinctive words.
                if abs(line_score_moliere_vs_corneille) < 5:
                    continue

                comment_moliere = "".join(comments_moliere)
                comment_corneille = "".join(comments_corneille)
                print(separator)
                print(f"Oeuvre de {author_name}: {book_name}")
                print(line)
                print(f'Score: {line_score_moliere_vs_corneille}')
                if comment_moliere:
                    print(f'avantage Molière: {comment_moliere}')
                if comment_corneille:
                    print(f'avantage Corneille: {comment_corneille}')
                print(separator)
                print()

'''
Exemples de textes trouvés par cet outil:

--------------------------------------------------
Oeuvre de Molière: le_malade_imaginaire
Qu'il se fasse médecin, je consens au mariage. Oui, faites-vous médecin, je vous donne ma fille.
Score: 5
avantage Molière: médecin (208 vs 1) mariage (181 vs 24) Oui (849 vs 165) médecin (208 vs 1) fille (415 vs 128) 

--------------------------------------------------
Oeuvre de Corneille: polyeucte
Ton courage était bon, ton devoir l'a trahi.
Score: -7
avantage Corneille: Ton (759 vs 224) courage (206 vs 44) était (86 vs 1) ton (759 vs 224) devoir (182 vs 69) l'a (121 vs 72) trahi (22 vs 8) 
'''                    
                    
                
# Run the search once per author, each run preceded by a separator and a title.
for search_title, search_is_moliere in (
    ('Recherche de lignes spécifiques à Molière', True),
    ('Recherche de lignes spécifiques à Corneille', False),
):
    print('-'*50)
    print(search_title)
    find_most_distinctive_lines(search_is_moliere)
    print()
    


## On montre des exemples où les deux auteurs font une utilisation très différente de certains mots

## Nombre d'occurrences de certains mots dans l'oeuvre de Molière et de Corneille

In [None]:
# Sample words for which the occurrence counts of both authors are displayed.
mots = ['courage', 'devoir', 'fille', 'madame', 'monsieur', 'médecin', 'oui', 'reine', 'trahi', 'trone']
for word in mots:
    # The stats mappings are indexed by the normalized form of the word.
    normalized_word = compute_normalized_word(word)
    print(f"Le mot '{word}':")
    # Entry [0] is reported as the number of occurrences; entry [1] is printed as-is after it.
    # NOTE(review): direct indexing presumably raises KeyError when a word is
    # missing from one author's stats - confirm every word in `mots` is present in both.
    print(f'\test présent {stats_moliere[normalized_word][0]} fois chez Molière:  ', stats_moliere[normalized_word][1])
    print(f'\test présent {stats_corneille[normalized_word][0]} fois chez Corneille:', stats_corneille[normalized_word][1])


# Helper defined elsewhere in the notebook; it receives the same word list.
create_table_with_occurences_corneille_moliere(mots)

## En se basant sur le tableau d'occurrences ci-dessus, qui de Molière ou de Corneille a probablement écrit cette ligne :
### "Oui, faites-vous médecin, je vous donne ma fille."

In [None]:
# Replace the "XXX" below with "Molière" or "Corneille".
auteur = "XXX"

## En se basant sur le tableau d'occurrences ci-dessus, qui de Molière ou de Corneille a probablement écrit cette ligne :
### "Ton courage était bon, ton devoir l'a trahi."

In [None]:
# Replace the "XXX" below with "Molière" or "Corneille".
auteur = "XXX"

## PRIVE: Entraînement et calcul des métriques avec deux auteurs

In [None]:

def compute_confusion_matrix_all_authors(text_moliere: List[str], text_corneille: List[str], most_common_words_moliere: dict , most_common_words_corneille: dict, verbose:bool = False) ->Tuple[int,int,int,int]:
    """Count right and wrong attributions for texts of known authorship.

    Every text gets one score per author from ``compute_author_score``; it is
    attributed to Molière only when Molière's score is strictly greater, so
    equal scores go to Corneille. Molière is the "positive" class.

    Args:
        text_moliere: texts actually written by Molière.
        text_corneille: texts actually written by Corneille.
        most_common_words_moliere: Molière's signature words, passed to ``compute_author_score``.
        most_common_words_corneille: Corneille's signature words, passed to ``compute_author_score``.
        verbose: when truthy, print the first example met in each of the four categories.

    Returns:
        The tuple ``(TP, TN, FP, FN)``.
    """
    TP = TN = FP = FN = 0

    # Texts by Molière: attributed to Molière -> TP, otherwise -> FN.
    for t in text_moliere:
        score_moliere = compute_author_score(t, most_common_words_moliere)
        score_corneille = compute_author_score(t, most_common_words_corneille)
        if score_moliere>score_corneille:
            if verbose and TP == 0:
                print(f'\nExemple de TP (Texte de Molière, bien identifié, score Molière: {round(score_moliere,4)}, score Corneille: {round(score_corneille,4)}):\n{t}\n')
            TP += 1
            continue
        if verbose and FN == 0:
            print(f'\nExemple de FN (Texte de Molière, mal identifié, score Molière: {round(score_moliere,4)}, score Corneille: {round(score_corneille,4)}):\n{t}\n')
        FN += 1

    # Texts by Corneille: attributed to Molière -> FP, otherwise -> TN.
    for t in text_corneille:
        score_moliere = compute_author_score(t, most_common_words_moliere)
        score_corneille = compute_author_score(t, most_common_words_corneille)
        if score_moliere>score_corneille:
            if verbose and FP == 0:
                print(f'\nExemple de FP (Texte de Corneille, mal identifié, score Molière: {round(score_moliere,4)}, score Corneille: {round(score_corneille,4)}):\n{t}\n')
            FP += 1
            continue
        if verbose and TN == 0:
            print(f'\nExemple de TN (Texte de Corneille, bien identifié, score Molière: {round(score_moliere,4)}, score Corneille: {round(score_corneille,4)}):\n{t}\n')
        TN += 1

    return (TP,TN,FP,FN)
        
    
def train_all_authors(most_common_count, verbose: bool = False):
    """Evaluate the two-author classifier for a given number of signature words.

    The random generator is seeded with 42, the module-level datasets are
    split into train and validation parts, the ``most_common_count`` most
    common normalized words of each author are extracted from the train part,
    and the validation paragraphs are then classified with them.

    Args:
        most_common_count: number of most common normalized words kept per author.
        verbose: when truthy, print a summary of every dataset; also forwarded
            to ``compute_confusion_matrix_all_authors``.

    Returns:
        Whatever ``compute_confusion_matrix_all_authors`` returns for the validation paragraphs.
    """
    # Fixed seed before the train/validation split.
    random.seed(42)
    if verbose:
        print(f'\nMoliere Dataset: {paragraph_count(moliere_dataset)} paragraphes ({word_count(moliere_dataset)} mots) venant de {len(moliere_dataset)} oeuvres:\n{list(moliere_dataset.keys())}')
        print(f'\nCorneille Dataset: {paragraph_count(corneille_dataset)} paragraphes ({word_count(corneille_dataset)} mots) venant de {len(corneille_dataset)} oeuvres:\n{list(corneille_dataset.keys())}')

    (train_moliere,
     validation_moliere,
     train_corneille,
     validation_corneille) = split_train_validation_all_authors(moliere_dataset, corneille_dataset, percentage_in_train)

    if verbose:
        print(f'\nMoliere Train Dataset: {paragraph_count(train_moliere)} paragraphes ({word_count(train_moliere)} mots) venant de {len(train_moliere)} oeuvres:\n{list(train_moliere.keys())}')
        print(f'\nMoliere Validation Dataset: {paragraph_count(validation_moliere)} paragraphes ({word_count(validation_moliere)} mots) venant de {len(validation_moliere)} oeuvres:\n{list(validation_moliere.keys())}')
        print(f'\nCorneille Train Dataset: {paragraph_count(train_corneille)} paragraphes ({word_count(train_corneille)} mots) venant de {len(train_corneille)} oeuvres:\n{list(train_corneille.keys())}')
        print(f'\nCorneille Validation Dataset: {paragraph_count(validation_corneille)} paragraphes ({word_count(validation_corneille)} mots) venant de {len(validation_corneille)} oeuvres:\n{list(validation_corneille.keys())}')

    # Word statistics come from the train part only (Molière first, then Corneille).
    word_stats_moliere, word_stats_corneille = [
        compute_normalized_words_to_stats(all_paragraphs(train_books))
        for train_books in (train_moliere, train_corneille)
    ]

    # Only the most common words of each author are kept as signature words.
    signature_words_moliere, signature_words_corneille = [
        compute_most_common_normalized_words(word_stats, most_common_count)
        for word_stats in (word_stats_moliere, word_stats_corneille)
    ]

    # The evaluation uses the validation paragraphs only.
    return compute_confusion_matrix_all_authors(
        all_paragraphs(validation_moliere),
        all_paragraphs(validation_corneille),
        signature_words_moliere,
        signature_words_corneille,
        verbose,
    )



# Description d'une 1ère méthode pour différencier des textes écrits par Molière et Corneille:
## Pour chaque texte à identifier:
### - On compte le nombre de mots 'signatures' de chaque auteur présents dans le texte.
    (Un mot signature est l'un des 50 mots les plus courants de cet auteur.)
### - On attribue le texte à l'auteur ayant le plus grand nombre de mots signatures dans le texte.

## Résultats de cette 1ère méthode

In [None]:
# Run the two-author method with the notebook-level number of signature words
# and print the resulting score as a percentage rounded to one decimal.
(TP,TN,FP,FN) = train_all_authors(most_common_normalized_words_count)
print(f'Précision(Molière ou Corneille?)= {round(100*calcul_accuracy(TP,TN,FP,FN),1)}%')


## Pour améliorer ces résultats, on peut proposer à l'étudiant de modifier le nombre de mots signatures associés à chaque auteur

### On affiche à l'étudiant la valeur de la précision pour différentes valeurs du nombre de mots signatures

In [None]:
# commenté car long à tourner
'''
most_common_counts = [50,100,200]+list(range(500,20000,500))
accuracy_for_most_common_counts = []
for most_common_count in most_common_counts:
    (TP,TN,FP,FN) = train_all_authors(most_common_count)
    accuracy = calcul_accuracy(TP,TN,FP,FN)
    print(f'Précision(Molière ou Corneille?) if most_common_count={most_common_count} = {round(accuracy,4)}')
    accuracy_for_most_common_counts.append(accuracy)
print(most_common_counts)
print(accuracy_for_most_common_counts)
'''                                         
# The (commented-out) code above computes the following values.
# most_common_counts holds the x coordinates (numbers of signature words tried)
# and accuracy_for_most_common_counts the matching y coordinates (fractions of 1).
most_common_counts = [50, 100, 200, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 10500, 11000, 11500, 12000, 12500, 13000, 13500, 14000, 14500, 15000, 15500, 16000, 16500, 17000, 17500, 18000, 18500, 19000, 19500]
accuracy_for_most_common_counts = [0.9261744966442953, 0.9328859060402684, 0.9362416107382551, 0.9664429530201343, 0.9697986577181208, 0.9865771812080537, 0.9865771812080537, 0.9899328859060402, 0.9966442953020134, 0.9966442953020134, 0.9899328859060402, 0.9798657718120806, 0.9765100671140939, 0.9798657718120806, 0.9731543624161074, 0.9765100671140939, 0.9765100671140939, 0.9731543624161074, 0.9765100671140939, 0.9798657718120806, 0.9765100671140939, 0.9697986577181208, 0.9563758389261745, 0.9429530201342282, 0.9563758389261745, 0.9496644295302014, 0.9496644295302014, 0.9463087248322147, 0.9362416107382551, 0.9295302013422819, 0.9362416107382551, 0.9362416107382551, 0.9362416107382551, 0.9362416107382551, 0.9362416107382551, 0.9362416107382551, 0.9362416107382551, 0.9362416107382551, 0.9362416107382551, 0.9362416107382551, 0.9362416107382551, 0.9362416107382551]

# Interpolate the measured points with a spline so the plotted line is smooth.
x_dense = np.linspace(min(most_common_counts), max(most_common_counts), 500)  # 500 points for a smooth curve
spline = make_interp_spline(most_common_counts, accuracy_for_most_common_counts)
y_dense = spline(x_dense)

# Blue line: interpolated curve. Red dots: the precomputed values themselves.
plt.figure(figsize=(20, 10))
plt.plot(x_dense, y_dense, label='Précision', color='b')
plt.scatter(most_common_counts, accuracy_for_most_common_counts, color='r')
plt.gca().tick_params(axis='y', which='major', labelsize=20) 
# The y values are fractions of 1, so PercentFormatter(1) renders 1.0 as 100%.
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.xticks(fontsize=20)
plt.xlabel('Nombres de mots signatures chez chaque auteur', fontsize=20)
plt.ylabel('Précision pour distinguer des oeuvres de Molière et Corneille', fontsize=20)
plt.title('Evolution de la précision en fonction du nombre de mots signatures chez chaque auteur', fontsize=20)
plt.legend()
            
    

## On propose à l'étudiant de choisir la valeur de la caractéristique

In [None]:
# The feature to improve.
# This '50' is meant to be changed by the student.
nombre_de_mots_signatures_chez_chaque_auteur = 50

# Re-run the two-author method with the chosen number of signature words and
# print the resulting score, as a percentage, between two separator lines.
(TP,TN,FP,FN) = train_all_authors(nombre_de_mots_signatures_chez_chaque_auteur)
print()
print('-'*80)
print(f'Précision(Molière ou Corneille?)={round(100*calcul_accuracy(TP,TN,FP,FN),1)}%')
print('-'*80)

## PRIVE: Précision pour chaque oeuvre utilisée

In [None]:
# Score of the classifier on each individual book, one author at a time.
# most_common_moliere / most_common_corneille and `directory` are defined elsewhere in the notebook.
print('-'*80+'\nPrécision pour chaque oeuvre de Moliere\n'+'-'*80)
# Maps a book name (file name without extension) to its score.
accuracy_moliere = dict()
for book_path in all_txt_files_in_directory(os.path.join(directory, 'moliere')):
    # Only Molière paragraphs are supplied (the Corneille list is empty), so TN and FP stay at 0.
    # The trailing 0 is the `verbose` argument (falsy: no example is printed).
    (TP,TN,FP,FN) = compute_confusion_matrix_all_authors(split_book_into_paragraphs(book_path), [], most_common_moliere , most_common_corneille, 0)
    #print(f"Accuracy '{pathlib.Path(book_path).stem}': {round(calcul_accuracy(TP,TN,FP,FN),4)}")
    accuracy_moliere[pathlib.Path(book_path).stem] = calcul_accuracy(TP,TN,FP,FN)
# Print the (book, score) pairs from the lowest score to the highest.
for e in sorted(accuracy_moliere.items(), key=lambda x: x[1]):
    print(e)

print()
print('-'*80+'\nPrécision pour chaque oeuvre de Corneille\n'+'-'*80)
# Same report for Corneille: only Corneille paragraphs are supplied, so TP and FN stay at 0.
accuracy_corneille = dict()
for book_path in all_txt_files_in_directory(os.path.join(directory, 'corneille')):
    (TP,TN,FP,FN) = compute_confusion_matrix_all_authors([], split_book_into_paragraphs(book_path), most_common_moliere , most_common_corneille, 0)
    accuracy_corneille[pathlib.Path(book_path).stem] = calcul_accuracy(TP,TN,FP,FN)
# Print the (book, score) pairs from the lowest score to the highest.
for e in sorted(accuracy_corneille.items(), key=lambda x: x[1]):
    print(e)
    
    