In [65]:
import re

import pandas as pd

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('conll2000')
nltk.download('averaged_perceptron_tagger')

from transformers import BertConfig, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, \
                         AutoTokenizer, AutoModelForTokenClassification, pipeline, BertTokenizer, BertModel, \
                         LukeTokenizer, LukeForEntitySpanClassification

from tqdm import tqdm, trange

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/eloisedoyard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/eloisedoyard/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eloisedoyard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/eloisedoyard/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/eloisedoyard/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def get_book_text(gutenberg_id):
    '''Given a book ID, returns the book's text, excluding Project Gutenberg's header and outro.
    
     Parameters
    ----------
    gutenberg_id : int
        The book's Project Gutenberg ID

    Returns
    -------
    book_text : str
        The book's text, excluding Project Gutenberg's header and outro
    '''
    
    context = ''
    with open(f'../data/book/PG-{gutenberg_id}.txt', mode='r', encoding='utf-8') as f:
        context = f.read()
    return ' '.join([l for l in (context.split('End of the Project Gutenberg EBook of ')[0]
                                        .split('*** END OF THE PROJECT GUTENBERG EBOOK')[0]
                                        .split('\n')) if l][16:])

In [66]:
from transformers import CamembertModel, CamembertTokenizer
from transformers import FlaubertTokenizer, FlaubertModel, FlaubertForTokenClassification

import torch

In [27]:
french_stopwords = pd.read_csv('../data/stopwords-fr.txt', header = None)[0].values.tolist()

In [57]:
def get_line_entities(l, ner_entities_tokens, ner_entities_words, sentence_index, tokenizer, nlp,
                     grouped_entities):
    '''Given a line, lists for tokens and words, and word index at the end of the sentence, as well as
    the tokenizer and nlp model instances (from huggingface's transformers), updates the tokens and
    words lists and the word index to include the given line.

    Parameters
    ----------
    l : str
        The line to analyze
    ner_entities_tokens : list
        A list containing all the Person tokens found so far, across all the previous lines
    ner_entities_words : list
        A list containing dictionary entries of all the Person entities found so far (the full 
        word corresponding to them (i.e. not separated tokens), their index in the sentence and 
        in the book overall, and their PER-entity classification score, a number between 0.0 and 
        1.0), across all the previous lines
    sentence_index : int
        The overall (book-wise) index of the first word of the sentence
    tokenizer : AutoTokenizer
        huggingface's tokenizer being used in the NER pipeline
    nlp : pipeline
        huggingface's NER pipeline object
    grouped_entities : bool
        Flag indicating whether the NER pipeline is configured to output grouped_entities or not

    Returns
    -------
    ner_entities_tokens : list
        A list containing all the Person tokens found so far, across this and all the previous lines
    ner_entities_words : list
        A list containing dictionary entries of all the Person entities found so far (the full 
        word corresponding to them (i.e. not separated tokens), their index in the sentence and 
        in the book overall, and their PER-entity classification score, a number between 0.0 and 
        1.0), across this and all the previous lines
    sentence_index : int
        The overall (book-wise) index of the first word of the next sentence
    '''
    new_entity_tokens = []
    if grouped_entities:
        new_entity_tokens = [e for e in nlp(l) if 'PER' in e['entity_group']]
    else:
        new_entity_tokens = [e for e in nlp(l) if 'PER' in e['entity']]
    ner_entities_tokens += new_entity_tokens
    print('new entity tokens')
    print(new_entity_tokens)
    tokenized_line = tokenizer(l)
    line_words = [w if w != 'word_tokenize_splits_cannot_into_2_words' else 'cannot'
                    for w in word_tokenize(
                                           re.sub(r'[^a-zA-Z0-9À-ÿ]', ' \g<0> ', 
                                                  l).replace('cannot', 
                                                            'word_tokenize_splits_cannot_into_2_words'))]
    print('line words')
    print(line_words)
    # go from token to word with
    for et in new_entity_tokens:
        if grouped_entities:
            # find index of grouped entity
            reconstructed_line = ' '.join([lw.lower() for lw in line_words])
            first_word = word_tokenize(re.sub(r'[^a-zA-Z0-9_À-ÿ]', ' \g<0> ', et['word']))[0]
            if et['word'][0] == '#':
                first_word = word_tokenize(re.sub(r'[^a-zA-Z0-9_À-ÿ]', ' \g<0> ', et['word'][2:]))[0]
            
            word_index = len(reconstructed_line[:reconstructed_line.index(first_word)].split())

            if et['word'] not in french_stopwords and et['word'].isalpha():
                # record grouped entity
                ner_entities_words += [{'full_word': et['word'], 
                                        'sentence_word_index': word_index, 
                                        'total_word_index': sentence_index+word_index,
                                        'score': et['score']}]
        else:
            # record non-grouped entity
            word_index = tokenized_line.word_ids()[et['index']]
            if line_words[word_index] not in french_stopwords and line_words[word_index].isalpha():
                ner_entities_words += [{'full_word': line_words[word_index], 
                                        'sentence_word_index': word_index, 
                                        'total_word_index': sentence_index+word_index,
                                        'score': et['score']}]
    sentence_index += len(line_words)
    return ner_entities_tokens, ner_entities_words, sentence_index

In [70]:
from transformers import CamembertTokenizer, CamembertForTokenClassification

book_text =  get_book_text('798-8')
ner_model = 'Jean-Baptiste/camembert-ner'
tokenizer = CamembertTokenizer.from_pretrained(ner_model)
model = CamembertForTokenClassification.from_pretrained(ner_model)
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
sentence_level_book = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', book_text)

l=sentence_level_book[65]
a,b,c = get_line_entities(l, [], [], 0, tokenizer, nlp,
                     False)

new entity tokens
[{'word': '▁M', 'score': 0.9855217337608337, 'entity': 'I-PER', 'index': 31, 'start': None, 'end': None}, {'word': '.', 'score': 0.9845024347305298, 'entity': 'I-PER', 'index': 32, 'start': None, 'end': None}, {'word': '▁le', 'score': 0.8523029685020447, 'entity': 'I-PER', 'index': 33, 'start': None, 'end': None}, {'word': '▁maire', 'score': 0.8729842901229858, 'entity': 'I-PER', 'index': 34, 'start': None, 'end': None}, {'word': '▁M', 'score': 0.9234569668769836, 'entity': 'I-PER', 'index': 74, 'start': None, 'end': None}, {'word': '.', 'score': 0.9141836166381836, 'entity': 'I-PER', 'index': 75, 'start': None, 'end': None}, {'word': '▁de', 'score': 0.9488356113433838, 'entity': 'I-PER', 'index': 76, 'start': None, 'end': None}, {'word': '▁R', 'score': 0.992038905620575, 'entity': 'I-PER', 'index': 77, 'start': None, 'end': None}, {'word': 'ê', 'score': 0.9914994239807129, 'entity': 'I-PER', 'index': 78, 'start': None, 'end': None}, {'word': 'nal', 'score': 0.9911445

ValueError: word_ids() is not available when using Python-based tokenizers

In [60]:
nlp(l)

[{'word': '▁d',
  'score': 0.5566989183425903,
  'entity': 'I-LOC',
  'index': 10,
  'start': 36,
  'end': 38},
 {'word': 'Italie',
  'score': 0.9557802677154541,
  'entity': 'I-LOC',
  'index': 12,
  'start': 39,
  'end': 45},
 {'word': '▁V',
  'score': 0.998179018497467,
  'entity': 'I-LOC',
  'index': 16,
  'start': 55,
  'end': 57},
 {'word': 'errière',
  'score': 0.9979278445243835,
  'entity': 'I-LOC',
  'index': 17,
  'start': 57,
  'end': 64},
 {'word': 's',
  'score': 0.9982990026473999,
  'entity': 'I-LOC',
  'index': 18,
  'start': 64,
  'end': 65},
 {'word': '▁M',
  'score': 0.9855217337608337,
  'entity': 'I-PER',
  'index': 31,
  'start': 112,
  'end': 114},
 {'word': '.',
  'score': 0.9845024347305298,
  'entity': 'I-PER',
  'index': 32,
  'start': 114,
  'end': 115},
 {'word': '▁le',
  'score': 0.8523029685020447,
  'entity': 'I-PER',
  'index': 33,
  'start': 115,
  'end': 118},
 {'word': '▁maire',
  'score': 0.8729842901229858,
  'entity': 'I-PER',
  'index': 34,
  's

In [46]:
def get_person_entities(gutenberg_id, grouped_entities=False, max_chunk_len=512, split_chunk_len=256):
    '''Given a book ID, returns its text (excluding Project Gutenberg's intro and outro), all its 
    tokens classified as PER (Person) entities, and all the words corresponding to those tokens, as 
    well as their index in the sentence and in the book, and their classification score as a PER entity.
    
    Parameters
    ----------
    gutenberg_id : int
        The book's Project Gutenberg ID
    grouped_entities : bool, optional
        Flag indicating whether the NER pipeline is configured to outout grouped_entities or not 
        (default is False)
    max_chunk_len : int, optional
        Maximum character-level length of each sentence passed to the model (default is 512)
    split_chunk_len : int, optional
        Maximum character-level length of each sub-sentence passed to the model, when splitting an
        overly big sentence into smaller sub-sentences (default is 256)

    Returns
    -------
    book_text : str
        The book's text, excluding Project Gutenberg's header and outro
    ner_entities_tokens : list
        A list containing all the Person tokens found across the whole book
    ner_entities_words : list
        A list containing dictionary entries of all the Person entities found across the whole book 
        (the full word corresponding to them (i.e. not separated tokens), their index in the sentence 
        and in the book overall, and their PER-entity classification score, a number between 0.0 and 
        1.0)
    '''
    # code is correct, but gives a warning about the model not having a predefined maximum length,
    # suppressing those warnings to not interfere with tdqm progress bar
    import warnings
    from transformers import logging
    warnings.filterwarnings('ignore')
    warnings.simplefilter('ignore')
    logging.set_verbosity_error()
    
    # read in gutenberg book
    book_text =  get_book_text(gutenberg_id)

    # load NER model and tokenizer
    ner_model = 'Jean-Baptiste/camembert-ner'
    tokenizer = AutoTokenizer.from_pretrained(ner_model)
    model = AutoModelForTokenClassification.from_pretrained(ner_model)
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    
    # prepare for iteration over the book
    sentence_level_book = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', book_text)
    ner_entities_tokens = []
    ner_entities_words = []
    sentence_index = 0
    
    # iterate over sentence-level chunks
    for l in tqdm(sentence_level_book):
        l = l.lower()
        if len(l) > max_chunk_len:
            for m in range(len(l) // split_chunk_len + 1):
                new_l = ' '.join(l.split(' ')[m*split_chunk_len:][:(m+1)*split_chunk_len])
                ner_entities_tokens, ner_entities_words, sentence_index = get_line_entities(new_l, 
                                                                                            ner_entities_tokens, 
                                                                                            ner_entities_words,
                                                                                            sentence_index, 
                                                                                            tokenizer,
                                                                                            nlp,
                                                                                            grouped_entities)
        else:
            ner_entities_tokens, ner_entities_words, sentence_index = get_line_entities(l, 
                                                                                        ner_entities_tokens, 
                                                                                        ner_entities_words,
                                                                                        sentence_index, 
                                                                                        tokenizer, 
                                                                                        nlp,
                                                                                        grouped_entities)

    return book_text, ner_entities_tokens, ner_entities_words

In [47]:
rouge_noir_id = '798-8'
(rouge_noir_text, 
 rouge_noir_ent_tokens, 
 rouge_noir_ent_words) = get_person_entities(rouge_noir_id, grouped_entities=False)

100%|██████████| 7929/7929 [32:26<00:00,  4.07it/s]     


In [42]:
sentence_level_book[65]

"Un vieux chirurgien-major de l'armée d'Italie, retiré à Verrières, et qui de son vivant était à la fois, suivant M. le maire, jacobin et bonapartiste, osa bien un jour se plaindre à lui de la mutilation périodique de ces beaux arbres. --J'aime l'ombre, répondit M. de Rênal avec la nuance de hauteur convenable quand on parle à un chirurgien, membre de la Légion d'honneur, j'aime l'ombre, je fais tailler mes arbres pour donner de l'ombre, et je ne conçois pas qu'un arbre soit fait pour autre chose, quand toutefois, comme l'utile noyer, il _ne rapporte pas de revenu_."

In [50]:
rouge_noir_ent_tokens

[{'word': 'bb',
  'score': 0.6076744198799133,
  'entity': 'I-PER',
  'index': 2,
  'start': 2,
  'end': 4},
 {'word': 'es',
  'score': 0.5385639071464539,
  'entity': 'I-PER',
  'index': 3,
  'start': 4,
  'end': 6},
 {'word': '▁n',
  'score': 0.9663979411125183,
  'entity': 'I-PER',
  'index': 34,
  'start': 115,
  'end': 117},
 {'word': 'ap',
  'score': 0.9647749066352844,
  'entity': 'I-PER',
  'index': 35,
  'start': 117,
  'end': 119},
 {'word': 'olé',
  'score': 0.9413847327232361,
  'entity': 'I-PER',
  'index': 36,
  'start': 119,
  'end': 122},
 {'word': 'on',
  'score': 0.9587520956993103,
  'entity': 'I-PER',
  'index': 37,
  'start': 122,
  'end': 124},
 {'word': '▁de',
  'score': 0.62737637758255,
  'entity': 'I-PER',
  'index': 11,
  'start': 33,
  'end': 36},
 {'word': '▁r',
  'score': 0.9827035665512085,
  'entity': 'I-PER',
  'index': 12,
  'start': 36,
  'end': 38},
 {'word': 'ê',
  'score': 0.9820846319198608,
  'entity': 'I-PER',
  'index': 13,
  'start': 38,
  'en

In [10]:
rouge_noir_ent_words

[{'full_word': 'é',
  'sentence_word_index': 6,
  'total_word_index': 6,
  'score': 0.8588476777076721},
 {'full_word': 'é',
  'sentence_word_index': 6,
  'total_word_index': 6,
  'score': 0.9381330013275146},
 {'full_word': 'accent',
  'sentence_word_index': 37,
  'total_word_index': 506,
  'score': 0.6058674454689026},
 {'full_word': 'è',
  'sentence_word_index': 6,
  'total_word_index': 779,
  'score': 0.8595004677772522},
 {'full_word': 'è',
  'sentence_word_index': 6,
  'total_word_index': 779,
  'score': 0.8854386210441589},
 {'full_word': 'res',
  'sentence_word_index': 7,
  'total_word_index': 780,
  'score': 0.9570423364639282},
 {'full_word': 'M',
  'sentence_word_index': 8,
  'total_word_index': 936,
  'score': 0.81282639503479},
 {'full_word': 'M',
  'sentence_word_index': 8,
  'total_word_index': 936,
  'score': 0.8347126245498657},
 {'full_word': 'é',
  'sentence_word_index': 21,
  'total_word_index': 1009,
  'score': 0.997407078742981},
 {'full_word': 'aussi',
  'sentenc

In [None]:
rougenoir_df = pd.DataFrame(rouge_noir_ent_tokens)
rouge_noir_df['word']

In [25]:
rouge_noir_df.groupby('full_word').count()

Unnamed: 0_level_0,sentence_word_index,total_word_index,score
full_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,21,21,21
abb,19,19,19
abord,5,5,5
absent,1,1,1
absurde,4,4,4
...,...,...,...
ë,4,4,4
î,10,10,10
ô,20,20,20
ù,10,10,10


In [21]:
# save entities into a dataframe, and to the disk
rouge_noir_df = pd.DataFrame(rouge_noir_ent_words)
# rouge_noir_df['full_word'] =  rouge_noir_df['full_word'].apply(lambda s: s.lower())
rouge_noir_df.to_csv('../data/book_dfs/rouge_noir_df.csv', index=False) 

In [51]:
# view top 25 entities
(rouge_noir_df
 .drop_duplicates('total_word_index')
 .groupby('full_word')
 .count()
 .sort_values(by='score', ascending=False)
)['score'][:50]

full_word
julien        450
é             369
mme           208
mathilde       98
è              75
valenod        70
mlle           63
mole           54
ê              48
marquis        43
ch             32
tait           31
sorel          29
re             24
fouqu          24
disait         21
tre            19
comte          19
pr             18
ç              17
grand          16
norbert        16
derville       15
û              14
croisenois     14
pensa          14
pirard         14
res            13
chevalier      11
nal            11
homme          11
monsieur       11
jour           10
mar            10
chambre        10
napol           9
saint           9
abb             9
tanbeau         8
dieu            8
esprit          8
appert          8
maire           7
ami             7
marquise        7
duc             7
amour           7
caract          7
altamira        7
voir            7
Name: score, dtype: int64

In [34]:
# save entities into a dataframe, and to the disk
rouge_noir_df = pd.DataFrame(rouge_noir_ent_words)
# rouge_noir_df['full_word'] =  rouge_noir_df['full_word'].apply(lambda s: s.lower())
rouge_noir_df.to_csv('../data/book_dfs/rouge_noir_df.csv', index=False) 

# view top 25 entities
(rouge_noir_df
 .drop_duplicates('total_word_index')
 .groupby('full_word')
 .count()
 .sort_values(by='score', ascending=False)
)['score'][:50]

full_word
julien        450
é             369
mme           208
mathilde       98
è              75
valenod        70
mlle           63
mole           54
ê              48
marquis        43
ch             32
tait           31
sorel          29
re             24
fouqu          24
disait         21
tre            19
comte          19
pr             18
ç              17
grand          16
norbert        16
derville       15
û              14
croisenois     14
pensa          14
pirard         14
res            13
chevalier      11
nal            11
homme          11
monsieur       11
jour           10
mar            10
chambre        10
napol           9
saint           9
abb             9
tanbeau         8
dieu            8
esprit          8
appert          8
maire           7
ami             7
marquise        7
duc             7
amour           7
caract          7
altamira        7
voir            7
Name: score, dtype: int64

In [20]:
len(stopwords.words('french'))

157