In [1]:
import re

import pandas as pd

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('conll2000')
nltk.download('averaged_perceptron_tagger')

from transformers import BertConfig, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, \
                         AutoTokenizer, AutoModelForTokenClassification, pipeline, BertTokenizer, BertModel, \
                         LukeTokenizer, LukeForEntitySpanClassification

from tqdm import tqdm, trange

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/eloisedoyard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/eloisedoyard/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eloisedoyard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/eloisedoyard/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/eloisedoyard/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def get_book_text(gutenberg_id):
    '''Given a book ID, returns the book's text, excluding Project Gutenberg's header and outro.
    
     Parameters
    ----------
    gutenberg_id : int
        The book's Project Gutenberg ID

    Returns
    -------
    book_text : str
        The book's text, excluding Project Gutenberg's header and outro
    '''
    
    context = ''
    with open(f'../data/book/PG-{gutenberg_id}.txt', mode='r', encoding='utf-8') as f:
        context = f.read()
    return ' '.join([l for l in (context.split('End of the Project Gutenberg EBook of ')[0]
                                        .split('*** END OF THE PROJECT GUTENBERG EBOOK')[0]
                                        .split('\n')) if l][16:])

In [3]:
from transformers import CamembertModel, CamembertTokenizer
from transformers import FlaubertTokenizer, FlaubertModel, FlaubertForTokenClassification

import torch

In [4]:
def get_line_entities(l, ner_entities_tokens, ner_entities_words, sentence_index, tokenizer, nlp,
                     grouped_entities):
    '''Given a line, lists for tokens and words, and word index at the end of the sentence, as well as
    the tokenizer and nlp model instances (from huggingface's transformers), updates the tokens and
    words lists and the word index to include the given line.

    Parameters
    ----------
    l : str
        The line to analyze
    ner_entities_tokens : list
        A list containing all the Person tokens found so far, across all the previous lines
    ner_entities_words : list
        A list containing dictionary entries of all the Person entities found so far (the full 
        word corresponding to them (i.e. not separated tokens), their index in the sentence and 
        in the book overall, and their PER-entity classification score, a number between 0.0 and 
        1.0), across all the previous lines
    sentence_index : int
        The overall (book-wise) index of the first word of the sentence
    tokenizer : AutoTokenizer
        huggingface's tokenizer being used in the NER pipeline
    nlp : pipeline
        huggingface's NER pipeline object
    grouped_entities : bool
        Flag indicating whether the NER pipeline is configured to output grouped_entities or not

    Returns
    -------
    ner_entities_tokens : list
        A list containing all the Person tokens found so far, across this and all the previous lines
    ner_entities_words : list
        A list containing dictionary entries of all the Person entities found so far (the full 
        word corresponding to them (i.e. not separated tokens), their index in the sentence and 
        in the book overall, and their PER-entity classification score, a number between 0.0 and 
        1.0), across this and all the previous lines
    sentence_index : int
        The overall (book-wise) index of the first word of the next sentence
    '''
    new_entity_tokens = []
    if grouped_entities:
        new_entity_tokens = [e for e in nlp(l) if 'PER' in e['entity_group']]
    else:
        new_entity_tokens = [e for e in nlp(l) if 'PER' in e['entity']]
    ner_entities_tokens += new_entity_tokens
    
    tokenized_line = tokenizer(l)
    line_words = [w if w != 'word_tokenize_splits_cannot_into_2_words' else 'cannot'
                    for w in word_tokenize(
                                           re.sub(r'[^a-zA-Z0-9]', ' \g<0> ', 
                                                  l).replace('cannot', 
                                                            'word_tokenize_splits_cannot_into_2_words'))]
    
    # go from token to word with
    for et in new_entity_tokens:
        if grouped_entities:
            # find index of grouped entity
            reconstructed_line = ' '.join([lw.lower() for lw in line_words])
            first_word = word_tokenize(re.sub(r'[^a-zA-Z0-9]', ' \g<0> ', et['word']))[0]
            if et['word'][0] == '#':
                first_word = word_tokenize(re.sub(r'[^a-zA-Z0-9]', ' \g<0> ', et['word'][2:]))[0]
            
            word_index = len(reconstructed_line[:reconstructed_line.index(first_word)].split())

            if et['word'] not in stopwords.words('french') and et['word'].isalpha():
                # record grouped entity
                ner_entities_words += [{'full_word': et['word'], 
                                        'sentence_word_index': word_index, 
                                        'total_word_index': sentence_index+word_index,
                                        'score': et['score']}]
        else:
            # record non-grouped entity
            word_index = tokenized_line.word_ids()[et['index']]
            if line_words[word_index] not in stopwords.words('french') and line_words[word_index].isalpha():
                ner_entities_words += [{'full_word': line_words[word_index], 
                                        'sentence_word_index': word_index, 
                                        'total_word_index': sentence_index+word_index,
                                        'score': et['score']}]
    sentence_index += len(line_words)
    return ner_entities_tokens, ner_entities_words, sentence_index

In [5]:
def get_person_entities(gutenberg_id, grouped_entities=False, max_chunk_len=512, split_chunk_len=256):
    '''Given a book ID, returns its text (excluding Project Gutenberg's intro and outro), all its 
    tokens classified as PER (Person) entities, and all the words corresponding to those tokens, as 
    well as their index in the sentence and in the book, and their classification score as a PER entity.
    
    Parameters
    ----------
    gutenberg_id : int
        The book's Project Gutenberg ID
    grouped_entities : bool, optional
        Flag indicating whether the NER pipeline is configured to outout grouped_entities or not 
        (default is False)
    max_chunk_len : int, optional
        Maximum character-level length of each sentence passed to the model (default is 512)
    split_chunk_len : int, optional
        Maximum character-level length of each sub-sentence passed to the model, when splitting an
        overly big sentence into smaller sub-sentences (default is 256)

    Returns
    -------
    book_text : str
        The book's text, excluding Project Gutenberg's header and outro
    ner_entities_tokens : list
        A list containing all the Person tokens found across the whole book
    ner_entities_words : list
        A list containing dictionary entries of all the Person entities found across the whole book 
        (the full word corresponding to them (i.e. not separated tokens), their index in the sentence 
        and in the book overall, and their PER-entity classification score, a number between 0.0 and 
        1.0)
    '''
    # code is correct, but gives a warning about the model not having a predefined maximum length,
    # suppressing those warnings to not interfere with tdqm progress bar
    import warnings
    from transformers import logging
    warnings.filterwarnings('ignore')
    warnings.simplefilter('ignore')
    logging.set_verbosity_error()
    
    # read in gutenberg book
    book_text =  get_book_text(gutenberg_id)

    # load NER model and tokenizer
    ner_model = 'Jean-Baptiste/camembert-ner'
    tokenizer = AutoTokenizer.from_pretrained(ner_model)
    model = AutoModelForTokenClassification.from_pretrained(ner_model)
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    
    # prepare for iteration over the book
    sentence_level_book = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', book_text)
    ner_entities_tokens = []
    ner_entities_words = []
    sentence_index = 0
    
    # iterate over sentence-level chunks
    for l in tqdm(sentence_level_book):
        if len(l) > max_chunk_len:
            for m in range(len(l) // split_chunk_len + 1):
                new_l = ' '.join(l.split(' ')[m*split_chunk_len:][:(m+1)*split_chunk_len])
                ner_entities_tokens, ner_entities_words, sentence_index = get_line_entities(new_l, 
                                                                                            ner_entities_tokens, 
                                                                                            ner_entities_words,
                                                                                            sentence_index, 
                                                                                            tokenizer,
                                                                                            nlp,
                                                                                            grouped_entities)
        else:
            ner_entities_tokens, ner_entities_words, sentence_index = get_line_entities(l, 
                                                                                        ner_entities_tokens, 
                                                                                        ner_entities_words,
                                                                                        sentence_index, 
                                                                                        tokenizer, 
                                                                                        nlp,
                                                                                        grouped_entities)

    return book_text, ner_entities_tokens, ner_entities_words

In [6]:
rouge_noir_id = '798-8'
(rouge_noir_text, 
 rouge_noir_ent_tokens, 
 rouge_noir_ent_words) = get_person_entities(rouge_noir_id)

100%|██████████| 7929/7929 [25:43<00:00,  5.14it/s]  


In [8]:
# save entities into a dataframe, and to the disk
rouge_noir_df = pd.DataFrame(rouge_noir_ent_words)
rouge_noir_df['full_word'] =  rouge_noir_df['full_word'].apply(lambda s: s.lower())
rouge_noir_df.to_csv('../data/book_dfs/rouge_noir_df.csv', index=False) 

# view top 25 entities
(rouge_noir_df
 .drop_duplicates('total_word_index')
 .groupby('full_word')
 .count()
 .sort_values(by='score', ascending=False)
)['score'][:25]

full_word
julien      577
é           377
mme         233
m           178
r           160
la          105
mathilde     97
è            80
valenod      73
mlle         66
mole         55
ê            50
le           38
marquis      38
ch           37
l            36
dit          33
tait         32
e            30
sorel        26
fouqu        25
re           24
comme        21
disait       21
si           21
Name: score, dtype: int64