In [38]:
# Standard-library imports
import time
import os
import re
import json
import collections

# NLTK stop-word corpus. download() fetches the package if it is missing and
# only logs a message when it is already up to date.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load the spaCy English pipeline. `nlp` is the module-level parser that the
# Paragraph class below calls to split sentences and tag tokens.
# NOTE(review): the 'en' shortcut name presumably requires a spaCy 2.x install
# with the shortcut link set up -- confirm the environment.
import spacy
nlp = spacy.load('en')

# Add the neuralcoref component to the pipeline so that parsed docs expose
# `doc._.has_coref` and `doc._.coref_clusters` (both read by Paragraph).
# NOTE(review): blacklist=True is a neuralcoref pronoun-handling switch; check
# the neuralcoref documentation for its exact effect before changing it.
import neuralcoref
neuralcoref.add_to_pipe(nlp, blacklist=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nuwandavek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<spacy.lang.en.English at 0x7f4e22e6efd0>

In [39]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn.functional as nnf
import numpy as np

# Hugging Face checkpoint used for emotion classification. The tokenizer and
# the classifier must come from the same checkpoint, so the name is kept in
# one place.
GOEMOTIONS_CHECKPOINT = "monologg/bert-base-cased-goemotions-original"

tokenizer = BertTokenizer.from_pretrained(GOEMOTIONS_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(GOEMOTIONS_CHECKPOINT,
                                                      return_dict=True)

# Emotion label names. get_emotion() indexes this list with the argmax of the
# classifier output, so position i has to be the label of output i.
# NOTE(review): the order is assumed to match the checkpoint's label order --
# verify against model.config.id2label.
emotions = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]


In [40]:
class Mention:
    '''
    A class representing a mention of some entity.
    It consists of a text string, starting and ending indices
    of the text span and a paragraph and sentence id.

    Attributes:
        text:    surface text of the mention.
        start:   index of the first token of the span.
        end:     index one past the last token of the span (Character.get_POS
                 iterates range(start, end)).
        par_id:  index of the paragraph the mention occurs in.
        sent_id: index of the sentence within that paragraph.
        POS:     list of POS-tag dicts for the span; empty until filled in by
                 Character.get_POS.
        deps:    set of dependency labels of the span's tokens; empty until
                 filled in by Character.get_POS.
    '''
    def __init__(self, text, start, end, par_id, sent_id):
        self.text = text
        self.start = start
        self.end = end
        self.par_id = par_id
        self.sent_id = sent_id
        self.POS = []
        self.deps = set()

    def __repr__(self):
        rep = "Mention in par " + str(self.par_id) + " Sentence " \
              + str(self.sent_id) + " text:\n" + self.text
        return rep

class Character:
    '''
    A class representing a character.
    It consists of a list of mentions, a set of aliases,
    a dict of verbs that the character is the actor of (agent),
    a list of verbs that the character is the receiver of (patient),
    a list of adjectives that the character is described with (description).

    `patient` and `description` are initialised here but are not filled in by
    any method of this class.
    '''
    def __init__(self, book, mainName):
        # Parent Book; used to look up paragraphs, sentences and their tags
        self.book = book
        self.mainName = mainName
        self.mentions = []
        # (par_id, sent_id) -> sentence text, see get_unique_sent_mentions
        self.unique_sents = {}
        self.aliases = set()
        # (par_id, sent_id, token index) -> verb text, see get_agent_verbs
        self.agent = {}
        self.patient = []
        self.description = []

    def __repr__(self):
        rep = "Character: " + self.mainName + "\n"
        return rep

    def update_mention(self, mention):
        '''
        Add a mention of the character to the list of mentions and record
        its lower-cased text as an alias.
        '''
        self.mentions.append(mention)
        self.aliases.add(mention.text.lower())

    def get_POS(self):
        '''
        Match POS tags with character mentions.

        For every mention, copy the paragraph-level tag dicts of the tokens
        in range(mention.start, mention.end) into mention.POS and collect
        their dependency labels in mention.deps. Both are rebuilt from
        scratch, so calling this method again does not duplicate tags.
        '''
        for mention in self.mentions:
            par_tags = self.book.pars[mention.par_id].POS_tags
            # Explicit indexing (not slicing) so that a span that runs past
            # the paragraph's tags fails loudly instead of being truncated.
            mention.POS = [par_tags[loc]
                           for loc in range(mention.start, mention.end)]
            mention.deps = {tag['dep'] for tag in mention.POS}

    def get_unique_sent_mentions(self):
        '''
        Record the text of every sentence that contains a mention, keyed by
        (par_id, sent_id) so that each sentence appears only once.
        '''
        for mention in self.mentions:
            key = (mention.par_id, mention.sent_id)
            self.unique_sents[key] = \
                self.book.pars[mention.par_id].sents[mention.sent_id].text

    def get_agent_verbs(self):
        '''
        Function to find the verbs in sentences in which the character is
        mentioned as the nsubj.

        For each such mention the sentence's ROOT token is stored in
        self.agent under the key (paragraph id, sentence id, token index).
        '''
        for mention in self.mentions:
            if 'nsubj' not in mention.deps:
                continue
            sentence = self.book.pars[mention.par_id].sents[mention.sent_id]
            # NOTE(review): this takes the ROOT of the whole sentence; whether
            # the mention is actually the subject of that ROOT is not checked.
            root = None
            for tag in sentence.POS_tags:
                if tag['dep'] == 'ROOT':
                    root = tag
            if root is not None and root['text']:
                # Location tuple (paragraph, sentence, local idx)
                global_loc = (mention.par_id, mention.sent_id, root['loc'])
                self.agent[global_loc] = root['text']
                    
        

class Sentence:
    '''
    A class for a single sentence of a paragraph.
    It keeps references to the parent book and paragraph, the sentence text,
    its index within the paragraph, its start/end boundary and a list of
    POS tags (empty on construction).
    '''
    def __init__(self, book, par, text, sent_id, bound):
        # Identity and content of the sentence
        self.sent_id = sent_id
        self.text = text
        # Parent objects
        self.book = book
        self.par = par
        # Boundary of the sentence relative to the paragraph
        self.start = bound[0]
        self.end = bound[1]
        # Boundary shifted by the paragraph's own start.
        # NOTE(review): in this file `bound` holds spaCy token indices while
        # the paragraph start comes from character offsets, so these two
        # values mix units -- confirm before relying on them.
        offset = par.start
        self.globalStart = offset + self.start
        self.globalEnd = offset + self.end
        # Filled in later with one tag dict per token
        self.POS_tags = []

    def __repr__(self):
        pieces = (
            self.book.fileName,
            "\nParagraph ", str(self.par.par_id),
            "\nSentence ", str(self.sent_id),
            " start ", str(self.start),
            " end ", str(self.end),
            "\ntext:\n", self.text,
        )
        return "".join(pieces)
        
class Paragraph:
    '''
    A class for a paragraph.
    It holds the paragraph text and boundary, the sentences it is split
    into, the coreference clusters found in it and one POS-tag dict per
    token.
    '''
    def __init__(self, book, text, par_id, bound):
        # Starting and ending indices for the paragraph
        self.start = bound[0]
        self.end = bound[1]
        # Refers to book parent object
        self.book = book
        # paragraph text
        self.text = text
        # paragraph index
        self.par_id = par_id
        # bool indicating whether coref worked
        self.has_coref = False
        # coref cluster list
        self.coref_clusts = []
        # list of sentences
        self.sents = []
        # initialised but not written by any method of this class
        self.sent_bounds = []
        self.POS_tags = []
        # Parsed doc handed from parse_into_sentences to run_coref_POS so the
        # pipeline does not have to run twice on the same text
        self._doc = None

    def __repr__(self):
        rep = self.book.fileName + "\nParagraph " + str(self.par_id) + "\ntext:\n" + self.text
        return rep

    def _build_sentences(self, doc):
        '''Create one Sentence per sentence span of an already parsed doc.'''
        self.sents = [Sentence(self.book, self, sent.text,
                               sent_id, (sent.start, sent.end))
                      for sent_id, sent in enumerate(doc.sents)]

    def parse_into_sentences(self):
        '''
        Split text into sentences.

        The parsed doc is kept until the next run_coref_POS call, which
        reuses it instead of parsing the same text again.
        '''
        doc = nlp(self.text)
        self._build_sentences(doc)
        self._doc = doc

    def run_coref_POS(self):
        '''
        Collect coreference clusters and POS tags for the paragraph.

        Sets has_coref / coref_clusts and, when clusters were found, passes
        the paragraph to book.parse_coref_clusts. Then rebuilds
        self.POS_tags (one dict per token) and the POS_tags list of every
        sentence. Each dict has the keys loc, text, lemma, pos, tag, dep.
        '''
        # Reuse the doc parsed by parse_into_sentences when available, and
        # drop the reference afterwards so it is not kept alive needlessly.
        doc = self._doc
        self._doc = None
        if doc is None:
            doc = nlp(self.text)
        # The steps below need the sentence list; build it if the caller has
        # not called parse_into_sentences first.
        if not self.sents:
            self._build_sentences(doc)

        # Get coreference tags:
        # Update whether the coref parser returned results
        self.has_coref = doc._.has_coref
        if self.has_coref:
            self.coref_clusts = doc._.coref_clusters
            self.book.parse_coref_clusts(self)

        # Get POS tags:
        # For now we use spaCy POS tagging because it gives more
        # fine-grained labels. Start from empty lists so a repeated call
        # does not duplicate tags.
        self.POS_tags = []
        for sent in self.sents:
            sent.POS_tags = []

        # Walk tokens and sentences together. Sentences are in document
        # order and sent.end is treated as exclusive, so the token at
        # index sent.end belongs to the following sentence.
        sent_idx = 0
        last_idx = len(self.sents) - 1
        for token_id, token in enumerate(doc):
            while sent_idx < last_idx and token_id >= self.sents[sent_idx].end:
                sent_idx += 1
            tag = {
                'loc': token_id,
                'text': token.text,
                'lemma': token.lemma_,
                'pos': token.pos_,
                'tag': token.tag_,
                'dep': token.dep_}
            self.sents[sent_idx].POS_tags.append(tag)
            # Separate copy so the paragraph-level and sentence-level lists
            # do not share dict objects
            self.POS_tags.append(dict(tag))

class Book:
    '''
    A class representing a book.
    It reads the text from a file on construction and, via parse_text,
    splits it into paragraphs and collects the characters mentioned in it.
    '''
    def __init__(self, dataPath, fileName):
        self.dataPath = dataPath
        self.fileName = fileName
        self.text = ''
        # main name -> Character
        self.characters = {}
        # main name -> number of mentions
        self.char_mention_counts = collections.Counter()
        # main name -> Character, for the most mentioned characters only
        self.top_characters = {}
        self.pars = []
        self.read_file()

    def read_file(self):
        '''
        Read the text of the book from a txt file.
        '''
        # Explicit encoding so the result does not depend on the platform's
        # default locale.
        # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
        path = os.path.join(self.dataPath, self.fileName)
        with open(path, "r", encoding="utf-8") as txtFile:
            self.text = txtFile.read()

    @staticmethod
    def _split_paragraphs(text):
        '''
        Split text on a newline followed by a whitespace character.

        Returns a list of (chunk, (start, end)) for every non-empty chunk,
        where text[start:end] == chunk.
        '''
        chunks = []
        cursor = 0
        for sep in re.finditer(r'\n\s', text):
            if sep.start() > cursor:
                chunks.append((text[cursor:sep.start()], (cursor, sep.start())))
            cursor = sep.end()
        if len(text) > cursor:
            chunks.append((text[cursor:], (cursor, len(text))))
        return chunks

    def parse_into_pars(self):
        '''
        Break the text into paragraphs.

        Empty chunks are dropped and remaining newlines are replaced by
        spaces. Each Paragraph receives the (start, end) character offsets
        of its own chunk in self.text; the bounds are computed together with
        the chunks, so they stay aligned after empty chunks are removed.
        '''
        self.pars = [Paragraph(self, raw.replace("\n", " "), par_id, bound)
                     for par_id, (raw, bound)
                     in enumerate(self._split_paragraphs(self.text))]

    def parse_coref_clusts(self, par):
        '''
        Parse the coreference clusters returned from the parsing of a paragraph.

        Every cluster is attached to the Character keyed by the lower-cased
        text of the cluster's main mention (created on first sight), and each
        mention in the cluster is added to that character.
        '''
        for cluster in par.coref_clusts:
            # get the main cluster identity
            mainSpan = cluster.main.text.lower()
            # If a character object does not yet exist, create one
            character = self.characters.get(mainSpan)
            if character is None:
                character = Character(self, mainSpan)
                self.characters[mainSpan] = character
            for span in cluster.mentions:
                # Figure out which sentence the mention belongs to; falls
                # back to sentence 0 when no sentence contains the span.
                sent_id = 0
                for sent in par.sents:
                    if sent.start <= span.start and span.end <= sent.end:
                        sent_id = sent.sent_id
                        break
                # create a mention object and add it to the character object
                character.update_mention(
                    Mention(span.text, span.start, span.end, par.par_id, sent_id))
                self.char_mention_counts[character.mainName] += 1

    def get_top_characters(self, n=5):
        '''
        Store the n most mentioned characters in self.top_characters
        (main name -> Character).
        '''
        most_mentioned = self.char_mention_counts.most_common()[:n]
        self.top_characters = {name: self.characters[name]
                               for name, _count in most_mentioned}

    def parse_text(self):
        '''
        Run the whole pipeline: split into paragraphs, parse every
        paragraph, then derive per-character data and the top characters.
        '''
        # Start from a clean slate so that running this twice on the same
        # book does not double the mentions and counts.
        self.characters = {}
        self.char_mention_counts = collections.Counter()
        self.parse_into_pars()
        for par in self.pars:
            par.parse_into_sentences()
            par.run_coref_POS()
        for character in self.characters.values():
            character.get_POS()
            character.get_agent_verbs()
            character.get_unique_sent_mentions()
        self.get_top_characters()

In [34]:
# Define paths and file names.
# Directory with the book text files; fileName is joined onto it by Book.
dataPath = "../datasets/Gutenberg/txt/"
fileName = "Herman Melville___Bartleby, The Scrivener.txt"
stopwordsFileName = "StopWords/jockers.stopwords"
# Derive the output name from the input name so the two cannot drift apart
# when a different book is selected.
outFileName = os.path.splitext(fileName)[0] + "_sentences.json"

In [35]:
# Read the book from disk (Book.__init__ calls read_file) and run the full
# pipeline: paragraph and sentence splitting, coreference + POS tagging and
# per-character extraction. Every paragraph is run through the spaCy
# pipeline, so this cell can be slow on a whole book.
b = Book(dataPath, fileName)
b.parse_text()

In [36]:
def get_emotion(text, max_length=512):
    '''
    Classify the emotion expressed by a piece of text.

    text:       a single string (e.g. one sentence).
    max_length: maximum number of tokens passed to the model; longer inputs
                are truncated instead of crashing the model.
                NOTE(review): 512 is assumed to be the checkpoint's position
                limit -- confirm against model.config.max_position_embeddings.

    Returns a tuple (emotion, probs): the entry of `emotions` with the
    highest probability, and a numpy array with the softmax probability of
    every label.
    '''
    inputs = tokenizer(text, return_tensors="pt",
                       truncation=True, max_length=max_length)
    # Inference only: no autograd graph is needed
    with torch.no_grad():
        outputs = model(**inputs)
    probs = nnf.softmax(outputs.logits, dim=1).detach().numpy().squeeze()
    emotion = emotions[int(np.argmax(probs))]
    return emotion, probs

In [37]:
# Classify every sentence that mentions one of the top characters and print
# one line per (character, sentence location).
for character, top_char in b.top_characters.items():
    mentioned_sents = top_char.unique_sents
    for loc, sent in mentioned_sents.items():
        emotion, probs = get_emotion(sent)
        print(character, loc, emotion)

oliver (59, 0) neutral
oliver (65, 0) neutral
oliver (107, 1) neutral
oliver (107, 2) neutral
oliver (107, 3) neutral
oliver (107, 4) neutral
oliver (108, 0) neutral
oliver (122, 0) neutral
oliver (122, 1) neutral
oliver (125, 0) neutral
oliver (125, 1) neutral
oliver (125, 5) neutral
oliver (156, 3) neutral
oliver (156, 4) neutral
oliver (156, 5) neutral
oliver (157, 2) neutral
oliver (161, 0) neutral
oliver (161, 1) neutral
oliver (164, 3) approval
oliver (164, 4) neutral
oliver (164, 5) neutral
oliver (164, 6) sadness
oliver (164, 7) sadness
oliver (166, 0) neutral
oliver (167, 0) confusion
oliver (167, 1) neutral
oliver (168, 2) neutral
oliver (170, 0) fear
oliver (170, 1) neutral
oliver (170, 2) neutral
oliver (178, 3) neutral
oliver (178, 4) neutral
oliver (181, 0) neutral
oliver (187, 2) neutral
oliver (204, 0) neutral
oliver (204, 1) neutral
oliver (204, 3) realization
oliver (204, 4) sadness
oliver (238, 0) neutral
oliver (238, 1) disgust
oliver (240, 1) neutral
oliver (240, 3

oliver (1469, 1) neutral
oliver (1477, 1) neutral
oliver (1477, 2) neutral
oliver (1483, 0) surprise
oliver (1483, 1) neutral
oliver (1485, 4) caring
oliver (1486, 0) amusement
oliver (1488, 0) neutral
oliver (1492, 0) confusion
oliver (1492, 1) neutral
oliver (1496, 0) neutral
oliver (1496, 1) neutral
oliver (1498, 0) neutral
oliver (1498, 1) neutral
oliver (1505, 0) neutral
oliver (1507, 0) neutral
oliver (1507, 1) neutral
oliver (1509, 0) curiosity
oliver (1509, 1) curiosity
oliver (1509, 2) optimism
oliver (1509, 3) approval
oliver (1521, 0) neutral
oliver (1521, 1) confusion
oliver (1521, 2) neutral
oliver (1528, 0) caring
oliver (1528, 2) neutral
oliver (1528, 3) neutral
oliver (1529, 0) neutral
oliver (1529, 1) neutral
oliver (1530, 0) sadness
oliver (1530, 1) neutral
oliver (1530, 2) neutral
oliver (1540, 1) joy
oliver (1545, 0) neutral
oliver (1548, 1) neutral
oliver (1551, 2) neutral
oliver (1554, 0) neutral
oliver (1554, 1) neutral
oliver (1554, 2) neutral
oliver (1555, 0) n

i (1179, 3) annoyance
i (1179, 4) disgust
i (1222, 2) neutral
i (1222, 3) neutral
i (1223, 0) neutral
i (1234, 2) neutral
i (1234, 3) gratitude
i (1253, 1) neutral
i (1255, 1) sadness
i (1255, 2) joy
i (1255, 3) joy
i (1259, 1) sadness
i (1273, 1) neutral
i (1273, 3) approval
i (1273, 4) approval
i (1273, 6) neutral
i (1294, 1) fear
i (1294, 3) joy
i (1302, 2) neutral
i (1302, 3) curiosity
i (1325, 1) neutral
i (1325, 3) curiosity
i (1326, 1) neutral
i (1326, 3) neutral
i (1338, 1) disapproval
i (1338, 2) desire
i (1338, 3) neutral
i (1352, 0) disappointment
i (1352, 1) neutral
i (1363, 1) neutral
i (1363, 2) neutral
i (1406, 1) neutral
i (1406, 4) neutral
i (1431, 2) desire
i (1431, 3) neutral
i (1436, 1) neutral
i (1437, 2) neutral
i (1443, 1) neutral
i (1455, 1) neutral
i (1455, 3) neutral
i (1457, 1) neutral
i (1462, 1) neutral
i (1487, 1) neutral
i (1514, 1) neutral
i (1516, 2) neutral
i (1516, 4) neutral
i (1525, 1) neutral
i (1525, 2) neutral
i (1525, 4) approval
i (1540, 1) joy

i (3403, 1) neutral
i (3403, 2) neutral
i (3404, 2) neutral
i (3405, 3) desire
i (3405, 5) neutral
i (3410, 1) admiration
i (3410, 2) neutral
i (3410, 3) neutral
i (3412, 1) caring
i (3413, 1) anger
i (3413, 2) disapproval
i (3413, 4) neutral
i (3417, 1) admiration
i (3417, 2) neutral
i (3417, 3) confusion
i (3425, 1) neutral
i (3425, 6) neutral
i (3427, 3) confusion
i (3427, 4) confusion
i (3430, 1) neutral
i (3430, 2) neutral
i (3463, 1) fear
i (3463, 2) fear
i (3465, 1) neutral
i (3465, 3) desire
i (3465, 4) fear
i (3465, 5) neutral
i (3467, 2) neutral
i (3476, 1) neutral
i (3476, 2) neutral
i (3487, 1) approval
i (3487, 2) neutral
i (3490, 0) disapproval
i (3490, 1) neutral
i (3490, 2) annoyance
i (3499, 2) neutral
i (3504, 1) anger
i (3512, 1) neutral
i (3512, 3) neutral
i (3518, 4) caring
i (3518, 7) annoyance
i (3520, 1) neutral
i (3523, 3) neutral
i (3523, 5) neutral
i (3523, 8) annoyance
i (3527, 4) neutral
i (3527, 5) neutral
i (3536, 2) neutral
i (3557, 1) annoyance
i (3566,

he (2685, 0) neutral
he (2685, 1) neutral
he (2696, 1) desire
he (2709, 0) neutral
he (2709, 1) neutral
he (2721, 1) neutral
he (2729, 1) neutral
he (2729, 2) neutral
he (2744, 1) sadness
he (2744, 4) anger
he (2745, 0) neutral
he (2773, 1) neutral
he (2773, 3) curiosity
he (2776, 0) neutral
he (2777, 1) neutral
he (2866, 1) neutral
he (2866, 2) neutral
he (2892, 1) neutral
he (2892, 2) neutral
he (2892, 3) neutral
he (2896, 0) neutral
he (2983, 1) neutral
he (2985, 1) neutral
he (2985, 2) neutral
he (2985, 3) neutral
he (2989, 1) neutral
he (2989, 2) neutral
he (2991, 2) neutral
he (2991, 8) neutral
he (2997, 1) neutral
he (2997, 2) neutral
he (3052, 1) neutral
he (3056, 1) anger
he (3057, 1) neutral
he (3058, 1) neutral
he (3067, 1) neutral
he (3067, 3) neutral
he (3067, 5) love
he (3068, 0) neutral
he (3077, 1) neutral
he (3077, 3) admiration
he (3077, 7) neutral
he (3093, 3) neutral
he (3115, 0) neutral
he (3123, 1) neutral
he (3149, 1) admiration
he (3149, 2) neutral
he (3149, 3) 

you (2331, 3) neutral
you (2353, 1) curiosity
you (2353, 2) curiosity
you (2355, 0) neutral
you (2355, 1) neutral
you (2355, 2) fear
you (2355, 3) neutral
you (2448, 1) neutral
you (2465, 1) curiosity
you (2465, 2) curiosity
you (2471, 1) joy
you (2471, 2) neutral
you (2485, 6) annoyance
you (2487, 0) neutral
you (2499, 1) neutral
you (2510, 3) neutral
you (2552, 1) neutral
you (2552, 2) desire
you (2555, 4) love
you (2555, 5) love
you (2555, 6) neutral
you (2555, 8) annoyance
you (2555, 9) sadness
you (2560, 1) neutral
you (2560, 2) pride
you (2560, 3) approval
you (2576, 3) neutral
you (2582, 2) neutral
you (2587, 1) curiosity
you (2587, 2) neutral
you (2589, 2) neutral
you (2589, 3) neutral
you (2589, 4) neutral
you (2603, 1) nervousness
you (2603, 2) neutral
you (2603, 3) neutral
you (2621, 0) neutral
you (2656, 1) neutral
you (2656, 2) disapproval
you (2656, 3) admiration
you (2666, 0) curiosity
you (2668, 1) neutral
you (2668, 2) neutral
you (2668, 3) neutral
you (2676, 1) neutra

his (3669, 4) neutral
his (3669, 5) neutral
his (3669, 6) neutral
his (3675, 0) neutral
his (3675, 1) neutral
his (3699, 0) nervousness
his (3706, 1) neutral
his (3706, 2) curiosity
his (3714, 1) neutral
his (3724, 0) neutral
his (3740, 5) neutral
his (3740, 6) neutral
his (3740, 7) neutral
his (3751, 0) neutral
his (3754, 3) neutral
his (3754, 5) neutral
his (3771, 2) excitement
his (3854, 0) neutral
his (3862, 0) fear
his (3889, 3) neutral
his (3889, 4) neutral
his (3889, 5) neutral
his (3930, 1) annoyance
his (3930, 2) desire
his (3964, 4) realization
his (3986, 0) neutral
his (3986, 1) surprise
his (3986, 2) neutral
his (3996, 0) neutral
his (4000, 0) sadness
his (4000, 1) neutral
his (4000, 2) neutral
his (4000, 3) fear
his (4000, 4) fear
his (4017, 1) neutral
his (4044, 0) neutral
his (4044, 1) neutral
his (4045, 0) neutral
his (4045, 1) neutral
his (4048, 0) neutral
his (4048, 1) neutral
his (4048, 2) admiration
his (4048, 3) neutral
his (4049, 1) disappointment
his (4049, 3) ne

#### The cells below contain first-pass prototype code, kept commented out for reference.

In [None]:
# '''
# Split text into sentences
# '''
# def parse_into_sentences(fileName):
#     with open(os.path.join(dataPath, fileName), "r") as txtFile:
#         doc = txtFile.read()
#     sentences = re.split('\.|!|\?', doc)   
#     sentences = [sentence.replace("\n", " ") for sentence in sentences]
#     return sentences
# '''
# Split text into paragraphs
# '''
# def parse_into_pars(fileName):
#     with open(os.path.join(dataPath, fileName), "r") as txtFile:
#         doc = txtFile.read()
#     pars = re.split('\n\s', doc)   
#     pars = [par.replace("\n", " ") for par in pars]
#     return pars
# '''
# Create list of stop words (currently not used)
# '''
# def read_stopwords(filename):
#     stopwords={}
#     with open(filename) as file:
#         for line in file:
#             stopwords[line.rstrip()]=1
#     return stopwords

# '''
# Clean sentences by removing trailing punctuation on words, 
# and converting to lowercase
# '''
# def clean_sentences(sentences, charsTOStrip = '\"\', '):
#     texts = [
#         [word.strip(charsTOStrip) for word in sentence.lower().split()]
#         for sentence in sentences]
#     return texts

In [None]:
# '''
# Convert list of neuralcoref.neuralcoref.Cluster objects 
# to list of dicts so that we can serialize it. 
# '''
# def convert_clust_to_list_dict(clustList):
#     dictList = []
#     characters = {character : {
#     "aliases" : set(),
#     "agent" : [],
#     "patient" : [],
#     "description" : [],
#     } for character in top_characters}
    
#     for idx, cluster in enumerate(clustList):
#         mainSpan = cluster.main
#         dictList.append({mainSpan.text.lower() : [{'start' : mention.start, 'end': mention.end, 'text' : mention.text}
#                                            for mention in cluster.mentions]})
#         characters[mainSpan.text.lower()] += 1
#     return dictList, characters

In [None]:
# # Creating book data structure that we will use.
# # For now it is just a dict, but maybe we can make it a class
# def create_book_struct(fileName):
    
#     characters = collections.Counter()
    
#     time_start = time.time()

#     # break file into paragraphs
#     pars = parse_into_pars(fileName)
#     #parse_into_sentences(fileName)

#     # initialize book obj
#     book = {}
#     # Iterate over each sentence and 
#     for idx, sentence in enumerate(sentences):

#         # Get sentence id and text
#         book[idx] = {}
#         book[idx]['text'] = sentence

#         # Get coreference tags:
#         doc = nlp(sentence)
#         book[idx]['has_coref'] = doc._.has_coref
#         if doc._.has_coref:
#             coref_clusts, chars = convert_clust_to_list_dict(doc._.coref_clusters)
#             book[idx]['coref_clusts'] = coref_clusts
            
#             characters += chars
#         else:
#             book[idx]['coref_clusts'] = []

#         # Get POS tags:

#         # this uses the pretrained BERT model:
#         # book[idx]["POS_tags"] = nlp_token_class(sentence)

#         # For now we will use Spacy pos tagging because it gives us more fine-grained labels:
#         book[idx]["POS_tags"] = []
#         for token in doc:
#             book[idx]["POS_tags"].append({
#                 'text' : token.text,
#                 'lemma' : token.lemma_,
#                 'pos' : token.pos_, 
#                 'tag' : token.tag_, 
#                 'dep' : token.dep_})
#     time_end = time.time()
#     print("Parsing book took ", round(time_end - time_start, 2), " secs")
#     return book, characters

In [None]:
# book, characters = create_book_struct(fileName)

In [None]:
# def write_book_to_disk(book, outFileName):
#     with open (os.path.join(dataPath, outFileName), 'w') as outFile:
#         json.dump(book, outFile, separators=(',', ':'), indent=4)

In [None]:
# write_book_to_disk(book, outFileName)

In [None]:
# # Only get top n most mentioned characters. 
# # Probably we should replace this with some kind of 
# # filter based on the min number of mentions
# most_common_n = 20
# top_characters = characters.most_common()[:most_common_n]
# top_characters = [top_character[0] for top_character in top_characters]

In [None]:
# doc = nlp('''
# I am a rather elderly man.  The nature of my avocations for the last
# thirty years has brought me into more than ordinary contact with what
# would seem an interesting and somewhat singular set of men, of whom as
# yet nothing that I know of has ever been written:--I mean the
# law-copyists or scriveners.  I have known very many of them,
# professionally and privately, and if I pleased, could relate divers
# histories, at which good-natured gentlemen might smile, and sentimental
# souls might weep.  But I waive the biographies of all other scriveners
# for a few passages in the life of Bartleby, who was a scrivener of the
# strangest I ever saw or heard of.  While of other law-copyists I might
# write the complete life, of Bartleby nothing of that sort can be done.
# I believe that no materials exist for a full and satisfactory biography
# of this man.  It is an irreparable loss to literature.  Bartleby was one
# of those beings of whom nothing is ascertainable, except from the
# original sources, and in his case those are very small.  What my own
# astonished eyes saw of Bartleby, _that_ is all I know of him, except,
# indeed, one vague report which will appear in the sequel.
# ''')

# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#             token.shape_, token.is_alpha)

In [19]:
from transformers import AutoTokenizer, AutoModelWithLMHead

# Checkpoint shared by the tokenizer and the model loaded in this cell.
T5_EMOTION_CHECKPOINT = "mrm8488/t5-base-finetuned-emotion"

# NOTE(review): this rebinds the module-level names `tokenizer` and `model`
# that get_emotion() reads, so get_emotion() will use these objects if it is
# called after this cell has run -- confirm that is intended.
tokenizer = AutoTokenizer.from_pretrained(T5_EMOTION_CHECKPOINT)
model = AutoModelWithLMHead.from_pretrained(T5_EMOTION_CHECKPOINT)


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1208.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=791656.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1786.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=25.0), HTML(value='')))






HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=891692894.0), HTML(value='')))






'sadness'

tensor([[   0, 3922]])

In [97]:
# Inspect the agent verbs recorded for the character keyed 'i': a dict that
# maps (paragraph id, sentence id, token index) to the text of the ROOT token
# of a sentence in which the character is mentioned as nsubj.
b.characters['i'].agent

{(2, 0, 1): 'am',
 (2, 1, 52): 'mean',
 (2, 2, 63): 'known',
 (2, 3, 101): 'waive',
 (2, 4, 141): 'write',
 (2, 5, 157): 'believe',
 (2, 8, 215): 'saw',
 (7, 0, 11): 'had',
 (7, 6, 195): 'seemed',
 (7, 7, 222): 'are',
 (12, 13, 358): 'was',
 (12, 14, 402): 'have',
 (12, 18, 538): 'had',
 (12, 23, 642): 'reasoned',
 (12, 24, 654): 'was',
 (12, 26, 708): 'presented',
 (12, 27, 746): 'thought',
 (12, 29, 769): 'believe',
 (13, 0, 10): 'had',
 (13, 1, 20): 'was',
 (13, 4, 108): 'spread',
 (17, 0, 9): 'engaged',
 (17, 1, 25): 'sedate',
 (18, 0, 3): 'stated',
 (18, 1, 38): 'threw',
 (18, 2, 49): 'resolved',
 (18, 4, 91): 'placed',
 (18, 6, 193): 'procured',
 (20, 3, 74): 'imagine',
 (20, 4, 113): 'say',
 (22, 0, 6): 'sit',
 (22, 1, 60): 'replied',
 (23, 0, 1): 'sat',
 (23, 2, 43): 'assume',
 (23, 3, 61): 'prefer',
 (24, 0, 6): 'echoed',
 (24, 1, 27): 'mean',
 (24, 2, 30): 'Are',
 (24, 3, 48): 'take',
 (25, 0, 8): 'said',
 (26, 0, 1): 'looked',
 (26, 4, 51): 'been',
 (26, 5, 78): 'have',
 (26

4