In [1]:
# Imports 
import time
import os
import re
import json
import collections

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load the spaCy English pipeline. `nlp` is the module-level pipeline object
# that the Paragraph class further down calls to parse paragraph text.
import spacy
nlp = spacy.load('en')

# Add the neuralcoref component to the spaCy pipeline so that parsed docs
# expose `doc._.has_coref` and `doc._.coref_clusters` (both are read in
# Paragraph.run_coref_POS).
# NOTE(review): blacklist=False presumably lets neuralcoref resolve
# first/second-person pronouns ("I", "me", "you", ...) -- confirm against the
# neuralcoref documentation.
import neuralcoref
neuralcoref.add_to_pipe(nlp, blacklist=False)

[nltk_data] Downloading package stopwords to /home/dmac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<spacy.lang.en.English at 0x7f5948b66910>

In [None]:
# import torch
# from transformers import AutoTokenizer, AutoModelForTokenClassification
# from transformers import pipeline 

# tokenizer = AutoTokenizer.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")

# model = AutoModelForTokenClassification.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")

# nlp_token_class = pipeline('ner', model=model, tokenizer=tokenizer, grouped_entities=True)

In [56]:
'''
A class representing a mention of some entity.
It consists of a text string, starting and ending indices
of the text span and a paragraph and sentence id. 
'''
class Mention:
    """One textual reference to an entity.

    Holds the mention text, the start and end indices of its span exactly as
    supplied by the caller, and the ids of the paragraph and sentence it was
    found in. ``POS`` (a list) and ``deps`` (a set) start out empty and are
    meant to be filled in by other code after construction.
    """

    def __init__(self, text, start, end, par_id, sent_id):
        # Surface text and span indices of the mention.
        self.text = text
        self.start, self.end = start, end
        # Where the mention lives: paragraph id, then sentence id.
        self.par_id, self.sent_id = par_id, sent_id
        # Tag records for the span and the dependency labels seen in it.
        self.POS = list()
        self.deps = set()

    def __repr__(self):
        header = "Mention in par {} Sentence {} text:\n".format(
            self.par_id, self.sent_id)
        return header + self.text

'''
A class representing a character.
It consists of a list of mentions, a set of aliases, 
a list of verbs that the character is the actor of (agent),
a list of verbs that the character is the receiver of (patient),
a list of adjectives that the character is described with (description).  
'''
class Character:
    """A character of a book, assembled from coreference mentions.

    Attributes:
        book: parent object; ``book.pars[par_id]`` must expose ``POS_tags``
            (per-token tag dicts) and ``sents`` (objects with ``POS_tags``).
        mainName: name the character is stored under.
        mentions: list of mention objects added via ``update_mention``.
        aliases: set of lower-cased mention texts.
        agent: dict mapping ``(par_id, sent_id, token loc)`` to the text of
            the ROOT token of a sentence in which a mention carries an
            ``nsubj`` dependency.
        patient: list, not populated by this class.
        description: list, not populated by this class.
    """

    def __init__(self, book, mainName):
        self.book = book
        self.mainName = mainName
        self.mentions = []
        self.aliases = set()
        self.agent = {}
        self.patient = []
        self.description = []

    def __repr__(self):
        return "Character: " + self.mainName + "\n"

    def update_mention(self, mention):
        """Record a mention of the character and its lower-cased text as an alias."""
        self.mentions.append(mention)
        self.aliases.add(mention.text.lower())

    def get_POS(self):
        """Attach the paragraph's tag records to every mention.

        For each mention, the tag dicts at positions ``start`` .. ``end - 1``
        of its paragraph's ``POS_tags`` are stored in ``mention.POS`` and
        their ``'dep'`` labels in ``mention.deps``.

        Both attributes are rebuilt from scratch on every call, so calling
        this method more than once does not duplicate records.
        """
        for mention in self.mentions:
            par_tags = self.book.pars[mention.par_id].POS_tags
            mention.POS = [par_tags[loc]
                           for loc in range(mention.start, mention.end)]
            mention.deps = {tag['dep'] for tag in mention.POS}

    def get_agent_verbs(self):
        """Record the ROOT token of sentences where the character is an nsubj.

        For every mention whose ``deps`` contain ``'nsubj'``, the mention's
        sentence is scanned for tokens tagged ``'ROOT'``; the text of the last
        such token is stored in ``self.agent`` under the key
        ``(par_id, sent_id, token loc)``. Requires ``get_POS`` to have run.
        """
        for mention in self.mentions:
            if 'nsubj' not in mention.deps:
                continue
            sentence = self.book.pars[mention.par_id].sents[mention.sent_id]
            root = None
            for tag in sentence.POS_tags:
                if tag['dep'] == 'ROOT':
                    # Keep scanning: the last ROOT in the sentence wins.
                    root = tag
            if root is not None and root['text']:
                # Location tuple: (paragraph, sentence, local token idx)
                global_loc = (mention.par_id, mention.sent_id, root['loc'])
                self.agent[global_loc] = root['text']
                    
        

class Sentence:
    """A single sentence belonging to a paragraph of a book.

    ``bound`` supplies the sentence's start and end relative to its
    paragraph; ``globalStart``/``globalEnd`` are those values shifted by the
    paragraph's own ``start``. ``POS_tags`` starts empty and is filled in by
    other code.

    NOTE(review): the caller visible in this file passes spaCy
    ``sent.start``/``sent.end`` as ``bound`` while the paragraph's ``start``
    appears to come from character offsets, so the global indices may mix
    units -- confirm before relying on them.
    """

    def __init__(self, book, par, text, sent_id, bound):
        # Parent objects
        self.book, self.par = book, par
        # Boundaries of the sentence relative to the paragraph
        self.start, self.end = bound[0], bound[1]
        # Boundaries shifted by the paragraph's starting index
        offset = self.par.start
        self.globalStart = offset + self.start
        self.globalEnd = offset + self.end
        self.text = text
        self.sent_id = sent_id
        self.POS_tags = list()

    def __repr__(self):
        pieces = [
            self.book.fileName,
            "\nParagraph ", str(self.par.par_id),
            "\nSentence ", str(self.sent_id),
            " start ", str(self.start),
            " end ", str(self.end),
            "\ntext:\n", self.text,
        ]
        return "".join(pieces)
        
'''
A class for a paragraph.
'''
class Paragraph:
    """A paragraph of a book with its sentences, coref clusters and token tags.

    Relies on the module-level spaCy pipeline ``nlp`` (with neuralcoref added)
    and on the ``Sentence`` class defined in this file.

    Attributes:
        start, end: the two entries of ``bound`` as supplied by the caller.
        book: parent book; ``fileName`` is read by ``__repr__`` and
            ``parse_coref_clusts`` is called from ``run_coref_POS``.
        text: paragraph text.
        par_id: paragraph index.
        has_coref: whether the coreference component returned clusters.
        coref_clusts: coreference clusters of the parsed paragraph.
        sents: list of ``Sentence`` objects.
        sent_bounds: list, not populated by this class.
        POS_tags: one tag dict per token of the paragraph.
    """

    def __init__(self, book, text, par_id, bound):
        # Starting and ending indices for the paragraph
        self.start = bound[0]
        self.end = bound[1]
        # Refers to book parent object
        self.book = book
        # paragraph text
        self.text = text
        # paragraph index
        self.par_id = par_id
        # bool indicating whether coref worked
        self.has_coref = False
        # coref cluster list
        self.coref_clusts = []
        # list of sentences
        self.sents = []
        self.sent_bounds = []
        self.POS_tags = []
        # (text, doc) handed from parse_into_sentences to run_coref_POS so the
        # pipeline does not have to parse the same paragraph twice.
        self._parsed = None

    def __repr__(self):
        rep = self.book.fileName + "\nParagraph " + str(self.par_id) + "\ntext:\n" + self.text
        return rep

    def parse_into_sentences(self):
        """Split the paragraph text into ``Sentence`` objects.

        Each sentence is given the token-index bounds ``(sent.start, sent.end)``
        of the corresponding spaCy sentence span. The parsed doc is kept for
        the next ``run_coref_POS`` call.
        """
        doc = nlp(self.text)
        self._parsed = (self.text, doc)
        self.sents = [Sentence(self.book, self, sent.text,
                               sent_id, (sent.start, sent.end))
                      for sent_id, sent in enumerate(doc.sents)]

    def _take_doc(self):
        """Return the doc left by parse_into_sentences, or parse the text anew."""
        cached, self._parsed = getattr(self, '_parsed', None), None
        if cached is not None and cached[0] == self.text:
            return cached[1]
        return nlp(self.text)

    def _sentence_index(self, token_id):
        """Return the id of the sentence whose [start, end) holds the token (0 if none)."""
        for sent in self.sents:
            # The sentence end is exclusive: the token at ``sent.end`` already
            # belongs to the following sentence.
            if sent.start <= token_id < sent.end:
                return sent.sent_id
        return 0

    def run_coref_POS(self):
        """Collect coreference clusters and per-token tags for the paragraph.

        Updates ``has_coref``/``coref_clusts`` (forwarding clusters to
        ``book.parse_coref_clusts``), then rebuilds ``POS_tags`` for the
        paragraph and for each of its sentences. Tag lists are reset first,
        so calling this method again does not duplicate entries.
        """
        # Sentences are needed to file the tags; parse them if that has not
        # happened yet instead of failing on an empty list.
        if not self.sents:
            self.parse_into_sentences()
        doc = self._take_doc()

        # Update whether the coref parser returned results
        self.has_coref = doc._.has_coref
        if self.has_coref:
            self.coref_clusts = doc._.coref_clusters
            self.book.parse_coref_clusts(self)

        # Get POS tags:
        # this uses the pretrained BERT model:
        # book[idx]["POS_tags"] = nlp_token_class(sentence)
        # For now we will use Spacy pos tagging because it gives us more fine-grained labels:
        self.POS_tags = []
        for sent in self.sents:
            sent.POS_tags = []
        # Add POS tags to list for paragraph and for each individual sentence
        for token_id, token in enumerate(doc):
            record = {
                'loc': token_id,
                'text': token.text,
                'lemma': token.lemma_,
                'pos': token.pos_,
                'tag': token.tag_,
                'dep': token.dep_}
            # The sentence receives its own copy so the two lists never share dicts.
            self.sents[self._sentence_index(token_id)].POS_tags.append(dict(record))
            self.POS_tags.append(record)

'''
A class representing a book.
'''        
class Book:
    """A book read from a text file and parsed into paragraphs and characters.

    Relies on the ``Paragraph``, ``Character`` and ``Mention`` classes defined
    in this file.

    Attributes:
        dataPath: directory containing the text file.
        fileName: name of the text file inside ``dataPath``.
        text: full text of the book (read on construction).
        characters: dict mapping a lower-cased main name to its ``Character``.
        pars: list of ``Paragraph`` objects, filled by ``parse_into_pars``.
    """

    def __init__(self, dataPath, fileName):
        self.dataPath = dataPath
        self.fileName = fileName
        self.text = ''
        self.characters = {}
        self.pars = []
        self.read_file()

    def read_file(self):
        """Read the text of the book from a txt file into ``self.text``."""
        # NOTE(review): no explicit encoding is given, so the platform default
        # is used -- confirm this matches the data files.
        with open(os.path.join(self.dataPath, self.fileName), "r") as txtFile:
            self.text = txtFile.read()

    def parse_into_pars(self):
        """Break the text into paragraphs.

        Paragraphs are separated by a newline followed by a whitespace
        character. Empty pieces are dropped, and newlines remaining inside a
        paragraph are replaced by spaces. Each paragraph is created with
        bounds ``(start, end)`` such that
        ``self.text[start:end].replace("\\n", " ")`` equals its text; bounds
        are computed together with the pieces so they stay aligned with the
        paragraph ids after empty pieces are removed.
        """
        separator = re.compile(r'\n\s')
        # Character spans of the text between consecutive separators.
        spans = []
        cursor = 0
        for match in separator.finditer(self.text):
            spans.append((cursor, match.start()))
            cursor = match.end()
        spans.append((cursor, len(self.text)))

        paragraphs = []
        for start, end in spans:
            # Remove empty pars
            if start == end:
                continue
            # Replace newline chars (length-preserving, so bounds stay valid)
            cleaned = self.text[start:end].replace("\n", " ")
            paragraphs.append(
                Paragraph(self, cleaned, len(paragraphs), (start, end)))
        self.pars = paragraphs

    def parse_coref_clusts(self, par):
        """Turn the coreference clusters of a paragraph into character mentions.

        For every cluster in ``par.coref_clusts`` the character named by the
        lower-cased text of the cluster's main span is looked up (or created),
        and each mention of the cluster is added to it as a ``Mention``.
        """
        for cluster in par.coref_clusts:
            # get the main cluster identity
            mainSpan = cluster.main.text.lower()
            # Find the character referred to here, creating it if needed
            character = self.characters.get(mainSpan)
            if character is None:
                character = Character(self, mainSpan)
                self.characters[mainSpan] = character
            for span in cluster.mentions:
                # Figure out which sentence the mention belongs to; mentions
                # not contained in any sentence default to sentence 0.
                sent_id = 0
                for sent in par.sents:
                    if sent.start <= span.start and span.end <= sent.end:
                        sent_id = sent.sent_id
                # create a mention object and add it to the character object
                character.update_mention(
                    Mention(span.text, span.start, span.end, par.par_id, sent_id))

    def parse_text(self):
        """Parse the whole book: paragraphs, sentences, coreference, tags, verbs.

        Characters are rebuilt from scratch on every call, so parsing the
        same book again does not accumulate duplicate mentions.
        """
        self.characters = {}
        self.parse_into_pars()
        for par in self.pars:
            par.parse_into_sentences()
            par.run_coref_POS()
        for character in self.characters.values():
            character.get_POS()
            character.get_agent_verbs()

In [57]:
# Define Paths and fileNames:
# Directory holding the input data (relative to the notebook's working directory).
dataPath = "../../Data/"
# Text file of the book to parse; joined with dataPath when the Book is read.
fileName = "Herman Melville___Bartleby, The Scrivener.txt"
# Stop-word list path; not referenced by any active cell visible in this notebook.
stopwordsFileName = "StopWords/jockers.stopwords"
# Output JSON name; only referenced by the commented-out write_book_to_disk call.
outFileName = 'Herman Melville___Bartleby, The Scrivener_sentences.json'

In [58]:
# Read the book text from disk (done in the Book constructor) ...
b = Book(dataPath, fileName)
# ... then split it into paragraphs and sentences, run coreference and
# tagging on every paragraph, and collect tags and verbs per character.
b.parse_text()

In [59]:
b.pars[27]

Herman Melville___Bartleby, The Scrivener.txt
Paragraph 27
text:
A few days after this, Bartleby concluded four lengthy documents, being quadruplicates of a week's testimony taken before me in my High Court of Chancery.  It became necessary to examine them.  It was an important suit, and great accuracy was imperative.  Having all things arranged I called Turkey, Nippers and Ginger Nut from the next room, meaning to place the four copies in the hands of my four clerks, while I should read from the original.  Accordingly Turkey, Nippers and Ginger Nut had taken their seats in a row, each with his document in hand, when I called to Bartleby to join this interesting group.

In [60]:
# All mentions recorded for the character stored under the key 'bartleby'.
Bart_mentions = b.characters['bartleby'].mentions
# For each mention, the text of the sentence it was assigned to
# (one entry per mention, so a sentence can appear more than once).
bart_sent_mentions = [b.pars[mention.par_id].sents[mention.sent_id].text for mention in Bart_mentions]

In [64]:
bart_sent_mentions

['While of other law-copyists I might write the complete life, of Bartleby nothing of that sort can be done.',
 'Bartleby was one of those beings of whom nothing is ascertainable, except from the original sources, and in his case those are very small.  ',
 'Bartleby was one of those beings of whom nothing is ascertainable, except from the original sources, and in his case those are very small.  ',
 'What my own astonished eyes saw of Bartleby, _that_ is all I know of him, except, indeed, one vague report which will appear in the sequel.',
 'I resolved to assign Bartleby a corner by the folding-doors, but on my side of them, so as to have this quiet man within easy call,',
 'Still further to a satisfactory arrangement, I procured a high green folding screen, which might entirely isolate Bartleby from my sight, though not remove him from my voice.  ',
 'Still further to a satisfactory arrangement, I procured a high green folding screen, which might entirely isolate Bartleby from my sight

In [63]:
Bart_mentions[

148

In [None]:
bart_sent_mentions

#### Below is some first pass testing that is commented out.

In [None]:
# '''
# Split text into sentences
# '''
# def parse_into_sentences(fileName):
#     with open(os.path.join(dataPath, fileName), "r") as txtFile:
#         doc = txtFile.read()
#     sentences = re.split('\.|!|\?', doc)   
#     sentences = [sentence.replace("\n", " ") for sentence in sentences]
#     return sentences
# '''
# Split text into paragraphs
# '''
# def parse_into_pars(fileName):
#     with open(os.path.join(dataPath, fileName), "r") as txtFile:
#         doc = txtFile.read()
#     pars = re.split('\n\s', doc)   
#     pars = [par.replace("\n", " ") for par in pars]
#     return pars
# '''
# Create list of stop words (currently not used)
# '''
# def read_stopwords(filename):
#     stopwords={}
#     with open(filename) as file:
#         for line in file:
#             stopwords[line.rstrip()]=1
#     return stopwords

# '''
# Clean sentences by removing trailing punctuation on words, 
# and converting to lowercase
# '''
# def clean_sentences(sentences, charsTOStrip = '\"\', '):
#     texts = [
#         [word.strip(charsTOStrip) for word in sentence.lower().split()]
#         for sentence in sentences]
#     return texts

In [None]:
# '''
# Convert list of neuralcoref.neuralcoref.Cluster objects 
# to list of dicts so that we can serialize it. 
# '''
# def convert_clust_to_list_dict(clustList):
#     dictList = []
#     characters = {character : {
#     "aliases" : set(),
#     "agent" : [],
#     "patient" : [],
#     "description" : [],
#     } for character in top_characters}
    
#     for idx, cluster in enumerate(clustList):
#         mainSpan = cluster.main
#         dictList.append({mainSpan.text.lower() : [{'start' : mention.start, 'end': mention.end, 'text' : mention.text}
#                                            for mention in cluster.mentions]})
#         characters[mainSpan.text.lower()] += 1
#     return dictList, characters

In [None]:
# # Creating book data structure that we will use.
# # For now it is just a dict, but maybe we can make it a class
# def create_book_struct(fileName):
    
#     characters = collections.Counter()
    
#     time_start = time.time()

#     # break file into paragraphs
#     pars = parse_into_pars(fileName)
#     #parse_into_sentences(fileName)

#     # initialize book obj
#     book = {}
#     # Iterate over each sentence and 
#     for idx, sentence in enumerate(sentences):

#         # Get sentence id and text
#         book[idx] = {}
#         book[idx]['text'] = sentence

#         # Get coreference tags:
#         doc = nlp(sentence)
#         book[idx]['has_coref'] = doc._.has_coref
#         if doc._.has_coref:
#             coref_clusts, chars = convert_clust_to_list_dict(doc._.coref_clusters)
#             book[idx]['coref_clusts'] = coref_clusts
            
#             characters += chars
#         else:
#             book[idx]['coref_clusts'] = []

#         # Get POS tags:

#         # this uses the pretrained BERT model:
#         # book[idx]["POS_tags"] = nlp_token_class(sentence)

#         # For now we will use Spacy pos tagging because it gives us more fine-grained labels:
#         book[idx]["POS_tags"] = []
#         for token in doc:
#             book[idx]["POS_tags"].append({
#                 'text' : token.text,
#                 'lemma' : token.lemma_,
#                 'pos' : token.pos_, 
#                 'tag' : token.tag_, 
#                 'dep' : token.dep_})
#     time_end = time.time()
#     print("Parsing book took ", round(time_end - time_start, 2), " secs")
#     return book, characters

In [None]:
# book, characters = create_book_struct(fileName)

In [None]:
# def write_book_to_disk(book, outFileName):
#     with open (os.path.join(dataPath, outFileName), 'w') as outFile:
#         json.dump(book, outFile, separators=(',', ':'), indent=4)

In [None]:
# write_book_to_disk(book, outFileName)

In [None]:
# # Only get top n most mentioned characters. 
# # Probably we should replace this with some kind of 
# # filter based on the min number of mentions
# most_common_n = 20
# top_characters = characters.most_common()[:most_common_n]
# top_characters = [top_character[0] for top_character in top_characters]

In [None]:
# doc = nlp('''
# I am a rather elderly man.  The nature of my avocations for the last
# thirty years has brought me into more than ordinary contact with what
# would seem an interesting and somewhat singular set of men, of whom as
# yet nothing that I know of has ever been written:--I mean the
# law-copyists or scriveners.  I have known very many of them,
# professionally and privately, and if I pleased, could relate divers
# histories, at which good-natured gentlemen might smile, and sentimental
# souls might weep.  But I waive the biographies of all other scriveners
# for a few passages in the life of Bartleby, who was a scrivener of the
# strangest I ever saw or heard of.  While of other law-copyists I might
# write the complete life, of Bartleby nothing of that sort can be done.
# I believe that no materials exist for a full and satisfactory biography
# of this man.  It is an irreparable loss to literature.  Bartleby was one
# of those beings of whom nothing is ascertainable, except from the
# original sources, and in his case those are very small.  What my own
# astonished eyes saw of Bartleby, _that_ is all I know of him, except,
# indeed, one vague report which will appear in the sequel.
# ''')

# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#             token.shape_, token.is_alpha)

In [19]:
# Load the tokenizer and model used by get_emotion below; both are
# module-level globals that the function reads.
# NOTE(review): AutoModelWithLMHead is reportedly deprecated in newer
# transformers releases in favour of task-specific classes such as
# AutoModelForSeq2SeqLM -- confirm against the installed version.
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion")

model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-emotion")


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1208.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=791656.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1786.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=25.0), HTML(value='')))






HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=891692894.0), HTML(value='')))






'sadness'

In [36]:
def get_emotion(text):
    """Return the emotion label the module-level model generates for ``text``.

    The text, with ``'</s>'`` appended, is encoded by the module-level
    ``tokenizer``; ``model.generate`` is called with ``max_length=2``; every
    generated sequence is decoded and the first decoded string is returned.

    NOTE(review): decoding does not skip special tokens, so the label may
    carry tokens such as a padding marker on some library versions --
    confirm against the installed transformers release.
    """
    encoded = tokenizer.encode(text + '</s>', return_tensors='pt')
    generated = model.generate(input_ids=encoded, max_length=2)
    decoded = list(map(tokenizer.decode, generated))
    return decoded[0]

get_emotion("i feel as if i havent blogged in ages are at least truly blogged i am doing an update cute") # Output: 'joy'

#get_emotion("i have a feeling i kinda lost my best friend") # Output: 'sadness'

'joy'