In [1]:
import operator
from collections import Counter
import math
import os
import json
import gensim
from gensim import corpora

In [2]:
'''
Read in parsed books 
'''
def read_parsed_book_data(dataPath):
    data = {}
    for fileName in os.listdir(dataPath):
        with open(os.path.join(dataPath, fileName), 'r') as file:
            data[fileName.split("_parsed_")[0]]=json.load(file)
    print("Read " + str(len(data)) + " books")
    return data

In [None]:
'''
For parsing BookNLP formatted outputs.
'''

# def read_book_nlp_metadata(filename):
#     metadata={}
#     with open(filename) as file:
#         for line in file:
#             cols=line.rstrip().split("\t")
#             idd=cols[0]
#             date=int(cols[1])
#             author=cols[2]
#             title=cols[3]
#             author_gender=cols[4]
#             metadata[idd]=(date, author, title, author_gender)
#     return metadata

# def read_book_nlp_data(data_folder):
#     data={}

#     for idd in metadata:
#         filename=os.path.join(data_folder, "%s.book" % idd)
#         if os.path.exists(filename):
#             with open(filename) as file:
#                 data[idd]=json.load(file)

#     print(f"Read {len(data)} books")
#     return data

In [3]:
def get_corpus_frequencies(alldata, bookNLP=False):
    
    agentFreqs = Counter()
    patientFreqs =  Counter()
    modFreqs =  Counter()
    
    for idd in alldata:
        data=alldata[idd]
        for character in data["characters"]:
            if bookNLP:
                for term in character["agent"]:
                    agentFreqs[term['w']] += 1
                for term in character["patient"]:
                    patientFreqs[term['w']] += 1
                for term in character["mod"]:
                    modFreqs[term['w']] += 1 
            else:
                for action in character["agent"]:
                    agentFreqs[list(action.values())[0]] += 1
                for action in character["patient"]:
                    patientFreqs[list(action.values())[0]] += 1
                    
    return agentFreqs, patientFreqs, modFreqs 

In [10]:
def get_character_actions(alldata, agentFilter={}, patientFilter={}, modFilter={}, bookNLP=False):
    
    charactersAgentsDict = {}
    charactersAgentsList = []
    charNameList = []
    
    for idd in alldata:
        data=alldata[idd]
        if bookNLP:
            for character in data["characters"]:
                if len(character["names"]) > 0:
                    char_name = character["names"][0]['n']
                    charNameList.append(char_name)
                    charactersAgentsDict[char_name] = Counter()
                    charactersAgentsList.append([])
                    agent_terms=[term["w"] + "_agent" for term in character["agent"] \
                                 if term["w"] not in agentFilter]
                    agent_terms += [term["w"] + "_patient" for term in character["patient"]\
                                 if term["w"] not in patientFilter]
                    agent_terms += [term["w"] + "_mod" for term in character["mod"]\
                                 if term["w"] not in modFilter]

                    for verb in agent_terms:
                        charactersAgentsDict[char_name][verb]+=1

                    for verb, count in charactersAgentsDict[char_name].items():
                        charactersAgentsList[-1]+=[verb]*count
                        
        else:
            for character in data["characters"]:
                char_name = character["mainName"] + "\t(" + idd + ")" +\
                '\n aka ' + ", ".join(character['aliases'][:9])
                charNameList.append(char_name)
                charactersAgentsDict[char_name] = Counter()
                charactersAgentsList.append([])
                agent_terms = [list(action.values())[0] + "_agent" for action in character["agent"] \
                             if list(action.values())[0] not in agentFilter]
                agent_terms += [list(action.values())[0] + "_patient" for action in character["patient"] \
                             if list(action.values())[0] not in agentFilter]
                
                for verb in agent_terms:
                    charactersAgentsDict[char_name][verb]+=1

                for verb, count in charactersAgentsDict[char_name].items():
                    charactersAgentsList[-1]+=[verb]*count
                    
                    
    return charactersAgentsDict, charactersAgentsList, charNameList 

In [None]:
# metadata=read_book_nlp_metadata("../../../../Code/comphumF20/data/pulitzer_metadata.txt")
# data=read_book_nlp_data("../../../../Code/comphumF20/data/pulitzer_booknlp")

In [5]:
newData = read_parsed_book_data('../results/book_parses_top_only')

Read 239 books


In [6]:
agentFreqs, patientFreqs, modFreqs = get_corpus_frequencies(newData)

In [7]:
agentFilter = set([freqPair[0] for freqPair in agentFreqs.most_common(50)])
patientFilter = set([freqPair[0] for freqPair in patientFreqs.most_common(50)])
modFilter = set([freqPair[0] for freqPair in modFreqs.most_common(50)])

In [11]:
charAgents, charTexts, charNames = get_character_actions(newData, 
                                                         agentFilter=agentFilter, 
                                                         patientFilter=patientFilter, 
                                                         modFilter=modFilter)

In [12]:
for charName in charNames:
    print(charName)

he	(Joseph Conrad___Amy Foster)
 aka he, his
his	(Joseph Conrad___Amy Foster)
 aka his
she	(Joseph Conrad___Amy Foster)
 aka she, her
smith	(Joseph Conrad___Amy Foster)
 aka her, smith, she, he, the girl, his, mrs. smith, him
him	(Joseph Conrad___Amy Foster)
 aka him, his
carthoris	(Edgar Rice Burroughs___Thuvia, Maid of Mars)
 aka her, she, his, carthoris, he
he	(Edgar Rice Burroughs___Thuvia, Maid of Mars)
 aka he, his
thuvia	(Edgar Rice Burroughs___Thuvia, Maid of Mars)
 aka thuvia
jav	(Edgar Rice Burroughs___Thuvia, Maid of Mars)
 aka jav, he, his
his	(Edgar Rice Burroughs___Thuvia, Maid of Mars)
 aka he, his
otto	(Robert Louis Stevenson___Prince Otto)
 aka he, him, otto, his
she	(Robert Louis Stevenson___Prince Otto)
 aka she, her
the prince	(Robert Louis Stevenson___Prince Otto)
 aka me, the prince, myself
seraphina	(Robert Louis Stevenson___Prince Otto)
 aka he, seraphina, his
he	(Robert Louis Stevenson___Prince Otto)
 aka himself, he, him, his
she	(Henry James___The Lesson of t

In [13]:
charAgents

{'he\t(Joseph Conrad___Amy Foster)\n aka he, his': Counter(),
 'his\t(Joseph Conrad___Amy Foster)\n aka his': Counter(),
 'she\t(Joseph Conrad___Amy Foster)\n aka she, her': Counter({'call_agent': 1}),
 'smith\t(Joseph Conrad___Amy Foster)\n aka her, smith, she, he, the girl, his, mrs. smith, him': Counter({'promise_agent': 1,
          'twist_agent': 1,
          'call_agent': 1,
          'moan_agent': 1,
          'wonder_agent': 1,
          'promise_patient': 1,
          'call_patient': 1,
          'moan_patient': 1}),
 'him\t(Joseph Conrad___Amy Foster)\n aka him, his': Counter(),
 'carthoris\t(Edgar Rice Burroughs___Thuvia, Maid of Mars)\n aka her, she, his, carthoris, he': Counter({'graze_agent': 1,
          'distance_agent': 1,
          'hasten_agent': 1,
          'reach_agent': 1}),
 'he\t(Edgar Rice Burroughs___Thuvia, Maid of Mars)\n aka he, his': Counter(),
 'thuvia\t(Edgar Rice Burroughs___Thuvia, Maid of Mars)\n aka thuvia': Counter(),
 'jav\t(Edgar Rice Burroughs__

In [14]:
charTexts

[[],
 [],
 ['call_agent'],
 ['promise_agent',
  'twist_agent',
  'call_agent',
  'moan_agent',
  'wonder_agent',
  'promise_patient',
  'call_patient',
  'moan_patient'],
 [],
 ['graze_agent', 'distance_agent', 'hasten_agent', 'reach_agent'],
 [],
 [],
 ['choose_agent'],
 ['furnish_agent'],
 ['watch_agent', 'resume_agent'],
 ['dream_agent'],
 [],
 ['hurry_agent'],
 ['lighten_agent', 'lighten_patient'],
 ['will_agent'],
 ['quit_agent',
  'recover_agent',
  'linger_agent',
  'publish_agent',
  'recover_patient',
  'linger_patient'],
 ['wonder_agent', 'suggest_agent'],
 [],
 [],
 [],
 [],
 ['display_agent', 'wish_patient'],
 ['bend_agent', 'wish_agent', 'reflect_agent'],
 [],
 [],
 ['cheer_agent'],
 [],
 [],
 ['bow_agent'],
 ['draw_agent', 'assure_patient'],
 ['happen_agent', 'admonish_agent', 'venture_agent'],
 [],
 ['step_agent', 'call_patient'],
 [],
 ['realize_agent'],
 [],
 ['mount_agent'],
 [],
 [],
 ['haunt_agent', 'appear_patient'],
 ['travel_agent'],
 [],
 ['avail_agent', 'avail_

In [15]:
dictionary = corpora.Dictionary(charTexts)
corpus = [dictionary.doc2bow(text) for text in charTexts]
num_topics = 5
lda_model = gensim.models.LdaModel(corpus, id2word=dictionary, 
                                   num_topics=num_topics, passes=3, alpha='auto')

In [16]:
for i in range(num_topics):
    print("topic %s:\t%s" % (i, ' '.join([term for term, freq in lda_model.show_topic(i, topn=10)])))

topic 0:	pause_agent speak_agent smile_agent drop_agent sob_agent shut_agent marry_agent hurry_agent assure_patient keep_agent
topic 1:	remain_agent bring_patient enter_agent occur_agent confess_agent die_agent happen_agent fling_agent forget_agent move_patient
topic 2:	raise_agent keep_agent interrupt_agent bring_agent catch_agent offer_agent reach_agent hold_agent carry_agent mean_agent
topic 3:	write_agent whisper_agent pass_agent bear_agent hold_patient lay_agent want_agent marry_patient use_agent send_agent
topic 4:	show_agent call_agent call_patient cross_agent catch_agent write_agent catch_patient meet_agent stop_agent suppose_agent


In [17]:
topic_model=lda_model 

topic_docs=[]
for i in range(num_topics):
    topic_docs.append({})
for doc_id in range(len(corpus)):
    doc_topics=topic_model.get_document_topics(corpus[doc_id])
    for topic_num, topic_prob in doc_topics:
        topic_docs[topic_num][doc_id]=topic_prob

for i in range(num_topics):
    print("%s\n" % ' '.join([term for term, freq in topic_model.show_topic(i, topn=10)]))
    sorted_x = sorted(topic_docs[i].items(), key=operator.itemgetter(1), reverse=True)
    for k, v in sorted_x[:5]:
        print("%s\t%.3f\t%s" % (i,v, charNames[k]))
    print()
    
    

pause_agent speak_agent smile_agent drop_agent sob_agent shut_agent marry_agent hurry_agent assure_patient keep_agent

0	0.918	she	(Joseph Conrad___To-morrow)
 aka her, she, herself
0	0.908	cloete	(Joseph Conrad___Within the Tides)
 aka cloete, his, himself, he, him
0	0.895	he	(Jack London___Smoke Bellew)
 aka he, his
0	0.895	her	(Henry James___In the Cage)
 aka she, herself, her
0	0.895	the doctor	(Wilkie Collins___A Rogue's Life)
 aka the doctor, his, himself, he, him

remain_agent bring_patient enter_agent occur_agent confess_agent die_agent happen_agent fling_agent forget_agent move_patient

1	0.975	bartleby	(Herman Melville___Bartleby, The Scrivener)
 aka bartleby here, me, myself, bartleby, it, his, himself, he, him
1	0.965	him	(Herman Melville___Bartleby, The Scrivener)
 aka the silent man, me, myself, it, his fatal act--an act which certainly no man could possibly deplore more than the actor himself, his, himself, he, him
1	0.927	spencer brydon	(Henry James___The Jolly Corner)


In [None]:
# {"agent":[{"w":"wore","i":104276},
#           {"w":"had","i":104283},
#           {"w":"carried","i":104297},
#           {"w":"seemed","i":104306},
#           {"w":"said","i":104318},
#           {"w":"said","i":104358},
#           {"w":"said","i":107106},
#           {"w":"realized","i":107110},
#           {"w":"gave","i":107125}],
#  "NNPcount":3,
#  "names":[{"c":3,"n":"Dr. Whittaker"}],
#  "mod":[],
#  "speaking":[{"w":"`` Oh , good morning , '' ","i":104321},
#              {"w":"`` Of course , '' ","i":104353}],
#  "patient":[],
#  "g":2,
#  "id":0,
#  "poss":[{"w":"words","i":107127}]}