In [1]:
import operator
from collections import Counter
import math
from os import path
import json
import gensim
from gensim import corpora

In [2]:
def read_book_nlp_metadata(filename):
    """Read a tab-separated metadata file into a dict keyed by book id.

    Every non-blank line is expected to hold at least five tab-separated
    columns, in this order: id, date, author, title, author gender.
    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of raising ``IndexError``.

    Args:
        filename: path of the metadata file to read.

    Returns:
        dict mapping each id (str) to a tuple
        ``(date, author, title, author_gender)`` where ``date`` is an int and
        the other fields are strings.

    Raises:
        IndexError: if a non-blank line has fewer than five columns.
        ValueError: if the date column cannot be parsed as an integer.
    """
    metadata={}
    with open(filename) as file:
        for line in file:
            stripped=line.rstrip()
            # A blank line carries no record; without this guard it would
            # split into [""] and fail on cols[1].
            if not stripped:
                continue
            cols=stripped.split("\t")
            idd=cols[0]
            date=int(cols[1])
            author=cols[2]
            title=cols[3]
            author_gender=cols[4]
            metadata[idd]=(date, author, title, author_gender)
    return metadata

def read_book_nlp_data(data_folder, ids=None):
    """Load the per-book JSON files found in ``data_folder``.

    For each id, the file ``<data_folder>/<id>.book`` is parsed as JSON if it
    exists; ids without a matching file are skipped silently. The number of
    books loaded is printed.

    Args:
        data_folder: directory that contains the ``*.book`` JSON files.
        ids: iterable of book ids to look for. When omitted, the ids are taken
            from the notebook-level ``metadata`` variable, which must already
            be defined (this is the original behavior).

    Returns:
        dict mapping each id that had a file to its parsed JSON content.
    """
    if ids is None:
        # Backward-compatible default: fall back to the global `metadata`
        # mapping, exactly as the function did before `ids` was added.
        ids = metadata

    data={}

    for idd in ids:
        filename=path.join(data_folder, "%s.book" % idd)
        if path.exists(filename):
            with open(filename) as file:
                data[idd]=json.load(file)

    print(f"Read {len(data)} books")
    return data

In [3]:
def get_character_actions(alldata):
    """Collect the agent verbs of every named character in every book.

    Args:
        alldata: dict mapping a book id to that book's parsed data. Each book
            must have a "characters" list whose entries carry a "names" list
            (items with an "n" key) and an "agent" list (items with a "w" key).

    Returns:
        A pair ``(counts_by_name, verb_lists)``:

        * ``counts_by_name`` maps a character's first listed name to a
          ``Counter`` of that character's agent verbs. A later character with
          the same name replaces the earlier entry.
        * ``verb_lists`` holds one list per named character, in the order the
          characters were visited; each verb is repeated as many times as it
          was counted, grouped by verb in first-occurrence order.

        Characters with an empty "names" list are ignored in both results.
    """
    charactersAgentsDict = {}
    charactersAgentsList = []

    for book in alldata.values():
        for character in book["characters"]:
            names = character["names"]
            if len(names) == 0:
                # Unnamed characters contribute nothing.
                continue

            char_name = names[0]['n']
            # Counter keeps verbs in first-seen order, which fixes the order
            # of the expanded list below.
            verb_counts = Counter(term["w"] for term in character["agent"])

            charactersAgentsDict[char_name] = verb_counts
            charactersAgentsList.append(list(verb_counts.elements()))

    return charactersAgentsDict, charactersAgentsList

In [9]:
# Load the metadata table first: read_book_nlp_data reads the notebook-level
# `metadata` variable to decide which book files to open.
metadata = read_book_nlp_metadata(
    filename="../../../Code/comphumF20/data/pulitzer_metadata.txt"
)
data = read_book_nlp_data(
    data_folder="../../../Code/comphumF20/data/pulitzer_booknlp"
)

Read 94 books


In [10]:
# charAgents: character name -> Counter of agent verbs;
# charTexts: one list of verbs per named character (the LDA "documents").
charAgents, charTexts = get_character_actions(alldata=data)

In [11]:
# Inspect the per-character verb counts (character name -> Counter of agent verbs).
charAgents

{'Dr. Whittaker': Counter({'wore': 1,
          'had': 1,
          'carried': 1,
          'seemed': 1,
          'said': 3,
          'realized': 1,
          'gave': 1}),
 'Mrs. Gunn': Counter({'were': 1}),
 'Sadie': Counter({'see': 1,
          'plowing': 1,
          'meant': 1,
          'took': 4,
          'going': 1,
          'was': 14,
          'got': 3,
          'answer': 2,
          'breathing': 1,
          'managed': 1,
          'had': 10,
          'draw': 1,
          'said': 72,
          'snapped': 2,
          'felt': 1,
          'cut': 2,
          'have': 4,
          'say': 1,
          'slapped': 1,
          'landed': 1,
          'saying': 2,
          'stopped': 3,
          'elaborated': 1,
          'popping': 1,
          'spoke': 1,
          'cleared': 1,
          'wore': 1,
          'made': 4,
          'taken': 1,
          'put': 2,
          'been': 3,
          'learned': 2,
          'came': 5,
          'sat': 3,
          'inspected': 1,
 

In [12]:
# Inspect the per-character verb lists (one list of repeated verbs per named character).
charTexts

[['wore',
  'had',
  'carried',
  'seemed',
  'said',
  'said',
  'said',
  'realized',
  'gave'],
 ['were'],
 ['got',
  'recognize',
  'died',
  'knew',
  'laid',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'put',
  'had',
  'looked',
  'looked',
  'looked',
  'looked',
  'began',
  'thought',
  'nodded',
  'say',
  'spoke',
  'turned',
  'knows',
  'knows',
  'advising',
  'talk',
  'loses',
  'does',
  'crying'],
 ['left',
  'forgotten',
  'asked',
  'asked',
  'strode',
  'held',
  'changed',
  'looked',
  'looked',
  'looked',
  'frowning',
  'smiled',
  'smiled',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'broke',
  'leaned',
  'speaking',
  'addressed',
  'praying',
  'spoke',
  'spoke',
  'closed',
  'took',
  'helped',
  'think',
  'wished',
  'wished',
  'hated',
  'hates',
  'hates',
  'hates',
  'hates',
  'hates',
  'hates',
  'hates',
  

In [13]:
# Build the vocabulary and the bag-of-words corpus: each character's verb list
# is treated as one document.
dictionary = corpora.Dictionary(charTexts)
corpus = [dictionary.doc2bow(text) for text in charTexts]
num_topics = 10
# LDA training is randomized; a fixed seed makes the topics reproducible when
# the notebook is re-run from a fresh kernel.
lda_random_state = 42
lda_model = gensim.models.LdaModel(corpus, id2word=dictionary,
                                   num_topics=num_topics, passes=3, alpha='auto',
                                   random_state=lda_random_state)

In [14]:
# Print the ten highest-ranked terms of every topic, one topic per line.
for i in range(num_topics):
    top_terms = [term for term, _weight in lda_model.show_topic(i, topn=10)]
    print("topic %s:\t%s" % (i, ' '.join(top_terms)))

topic 0:	said thought had felt saw knew heard looked asked was
topic 1:	was had got went made came took come looked knew
topic 2:	put know left please shouted swore cries provided shouts included
topic 3:	said came smiled laughed seemed sitting standing paused raised looked
topic 4:	told had wanted said have was say knew did be
topic 5:	said asked looked told had was turned took came put
topic 6:	say says has is knows thinks look 's wants looks
topic 7:	going said wrote nudged smiles take talking has call are
topic 8:	died were pleaded taught grab lost retrieved build married reconciled
topic 9:	had was said came gave made began wore spoke found


In [None]:
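# Example of a single entry in a book's "characters" list (the structure read by get_character_actions):
# {"agent":[{"w":"wore","i":104276},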
#           {"w":"had","i":104283},
#           {"w":"carried","i":104297},
#           {"w":"seemed","i":104306},
#           {"w":"said","i":104318},
#           {"w":"said","i":104358},
#           {"w":"said","i":107106},
#           {"w":"realized","i":107110},
#           {"w":"gave","i":107125}],
#  "NNPcount":3,
#  "names":[{"c":3,"n":"Dr. Whittaker"}],
#  "mod":[],
#  "speaking":[{"w":"`` Oh , good morning , '' ","i":104321},
#              {"w":"`` Of course , '' ","i":104353}],
#  "patient":[],
#  "g":2,
#  "id":0,
#  "poss":[{"w":"words","i":107127}]}