In [25]:
import operator
from collections import Counter
import math
import os
import json
import gensim
from gensim import corpora

In [45]:
'''
Read in parsed books 
'''
def read_parsed_book_data(dataPath):
    data = {}
    for fileName in os.listdir(dataPath):
        with open(os.path.join(dataPath, fileName), 'r') as file:
            data[fileName.split("_parsed_")[0]]=json.load(file)
    return data

In [2]:
# def read_book_nlp_metadata(filename):
#     metadata={}
#     with open(filename) as file:
#         for line in file:
#             cols=line.rstrip().split("\t")
#             idd=cols[0]
#             date=int(cols[1])
#             author=cols[2]
#             title=cols[3]
#             author_gender=cols[4]
#             metadata[idd]=(date, author, title, author_gender)
#     return metadata

# def read_book_nlp_data(data_folder):
#     data={}

#     for idd in metadata:
#         filename=os.path.join(data_folder, "%s.book" % idd)
#         if os.path.exists(filename):
#             with open(filename) as file:
#                 data[idd]=json.load(file)

#     print(f"Read {len(data)} books")
#     return data

In [60]:
def get_corpus_frequencies(alldata, bookNLP=False):
    
    agentFreqs = Counter()
    patientFreqs =  Counter()
    modFreqs =  Counter()
    
    for idd in alldata:
        data=alldata[idd]
        for character in data["characters"]:
            if bookNLP:
                for term in character["agent"]:
                    agentFreqs[term['w']] += 1
                for term in character["patient"]:
                    patientFreqs[term['w']] += 1
                for term in character["mod"]:
                    modFreqs[term['w']] += 1 
            else:
                for action in character["agent"]:
                    agentFreqs[list(action.values())[0]] += 1
                for action in character["patient"]:
                    patientFreqs[list(action.values())[0]] += 1
                    
    return agentFreqs, patientFreqs, modFreqs 

In [130]:
def get_character_actions(alldata, agentFilter={}, patientFilter={}, modFilter={}, bookNLP=False):
    
    charactersAgentsDict = {}
    charactersAgentsList = []
    charNameList = []
    
    for idd in alldata:
        data=alldata[idd]
        if bookNLP:
            for character in data["characters"]:
                if len(character["names"]) > 0:
                    char_name = character["names"][0]['n']
                    charNameList.append(char_name)
                    charactersAgentsDict[char_name] = Counter()
                    charactersAgentsList.append([])
                    agent_terms=[term["w"] + "_agent" for term in character["agent"] \
                                 if term["w"] not in agentFilter]
                    agent_terms += [term["w"] + "_patient" for term in character["patient"]\
                                 if term["w"] not in patientFilter]
                    agent_terms += [term["w"] + "_mod" for term in character["mod"]\
                                 if term["w"] not in modFilter]

                    for verb in agent_terms:
                        charactersAgentsDict[char_name][verb]+=1

                    for verb, count in charactersAgentsDict[char_name].items():
                        charactersAgentsList[-1]+=[verb]*count
                        
        else:
            for character in data["characters"]:
                char_name = character["mainName"] + "\t(" + idd + ")"
                #+ '\taka ' + \
                #", ".join(character['aliases'][:2]) + "\t(" + idd + ")"
                charNameList.append(char_name)
                charactersAgentsDict[char_name] = Counter()
                charactersAgentsList.append([])
                agent_terms = [list(action.values())[0] + "_agent" for action in character["agent"] \
                             if list(action.values())[0] not in agentFilter]
                agent_terms += [list(action.values())[0] + "_patient" for action in character["patient"] \
                             if list(action.values())[0] not in agentFilter]
                
                for verb in agent_terms:
                    charactersAgentsDict[char_name][verb]+=1

                for verb, count in charactersAgentsDict[char_name].items():
                    charactersAgentsList[-1]+=[verb]*count
                    
                    
    return charactersAgentsDict, charactersAgentsList, charNameList 

In [7]:
metadata=read_book_nlp_metadata("../../../../Code/comphumF20/data/pulitzer_metadata.txt")
data=read_book_nlp_data("../../../../Code/comphumF20/data/pulitzer_booknlp")

Read 94 books


In [131]:
newData = read_parsed_book_data('../results/book_parses_top_only')

In [132]:
agentFreqs, patientFreqs, modFreqs = get_corpus_frequencies(newData)

In [133]:
agentFilter = set([freqPair[0] for freqPair in agentFreqs.most_common(50)])
patientFilter = set([freqPair[0] for freqPair in patientFreqs.most_common(50)])
modFilter = set([freqPair[0] for freqPair in modFreqs.most_common(50)])

In [134]:
charAgents, charTexts, charNames = get_character_actions(newData, 
                                                         agentFilter=agentFilter, 
                                                         patientFilter=patientFilter, 
                                                         modFilter=modFilter)

In [135]:
for charName in charNames:
    print(charName)

carthoris	(Edgar Rice Burroughs___Thuvia, Maid of Mars)
the girl	(Edgar Rice Burroughs___Thuvia, Maid of Mars)
the man	(Edgar Rice Burroughs___Thuvia, Maid of Mars)
astok of dusar	(Edgar Rice Burroughs___Thuvia, Maid of Mars)
ptarth	(Edgar Rice Burroughs___Thuvia, Maid of Mars)
his	(Herman Melville___Moby Dick)
it	(Herman Melville___Moby Dick)
ahab	(Herman Melville___Moby Dick)
the mouth	(Herman Melville___Moby Dick)
me	(Herman Melville___Moby Dick)
they	(Washington Irving___The Legend of Sleepy Hollow)
ichabod crane	(Washington Irving___The Legend of Sleepy Hollow)
this sequestered glen	(Washington Irving___The Legend of Sleepy Hollow)
her	(Washington Irving___The Legend of Sleepy Hollow)
old baltus van tassel	(Washington Irving___The Legend of Sleepy Hollow)
tarzan	(Edgar Rice Burroughs___The Beasts of Tarzan)
the ape-man	(Edgar Rice Burroughs___The Beasts of Tarzan)
his	(Edgar Rice Burroughs___The Beasts of Tarzan)
it	(Edgar Rice Burroughs___The Beasts of Tarzan)
jane clayton's face

In [117]:
charAgents

{'carthoris\taka a real man, a real warrior--carthoris of helium!", themselves\t(Edgar Rice Burroughs___Thuvia, Maid of Mars)': Counter({'hesitate_agent': 2,
          'breathe_agent': 1,
          'raise_agent': 2,
          'back_agent': 1,
          'explain_agent': 2,
          'smile_agent': 7,
          'urge_agent': 2,
          'detect_agent': 1,
          'observe_agent': 1,
          'drop_agent': 1,
          'spring_agent': 3,
          'examine_agent': 1,
          'believe_agent': 4,
          'unlock_agent': 1,
          'hazard_agent': 1,
          'succeed_agent': 1,
          'wait_agent': 2,
          'send_agent': 1,
          'hurry_agent': 1,
          'touch_agent': 3,
          'damage_agent': 1,
          'float_agent': 1,
          'hope_agent': 2,
          'watch_agent': 2,
          'leap_agent': 2,
          'trail_agent': 1,
          'follow_agent': 5,
          'creep_agent': 1,
          'time_agent': 2,
          'surround_agent': 1,
          'guess_

In [118]:
charTexts

[['hesitate_agent',
  'hesitate_agent',
  'breathe_agent',
  'raise_agent',
  'raise_agent',
  'back_agent',
  'explain_agent',
  'explain_agent',
  'smile_agent',
  'smile_agent',
  'smile_agent',
  'smile_agent',
  'smile_agent',
  'smile_agent',
  'smile_agent',
  'urge_agent',
  'urge_agent',
  'detect_agent',
  'observe_agent',
  'drop_agent',
  'spring_agent',
  'spring_agent',
  'spring_agent',
  'examine_agent',
  'believe_agent',
  'believe_agent',
  'believe_agent',
  'believe_agent',
  'unlock_agent',
  'hazard_agent',
  'succeed_agent',
  'wait_agent',
  'wait_agent',
  'send_agent',
  'hurry_agent',
  'touch_agent',
  'touch_agent',
  'touch_agent',
  'damage_agent',
  'float_agent',
  'hope_agent',
  'hope_agent',
  'watch_agent',
  'watch_agent',
  'leap_agent',
  'leap_agent',
  'trail_agent',
  'follow_agent',
  'follow_agent',
  'follow_agent',
  'follow_agent',
  'follow_agent',
  'creep_agent',
  'time_agent',
  'time_agent',
  'surround_agent',
  'guess_agent',
  '

In [119]:
dictionary = corpora.Dictionary(charTexts)
corpus = [dictionary.doc2bow(text) for text in charTexts]
num_topics = 5
lda_model = gensim.models.LdaModel(corpus, id2word=dictionary, 
                                   num_topics=num_topics, passes=3, alpha='auto')

In [120]:
for i in range(num_topics):
    print("topic %s:\t%s" % (i, ' '.join([term for term, freq in lda_model.show_topic(i, topn=10)])))

topic 0:	enter_agent smile_agent raise_agent spring_agent drop_agent walk_agent realize_agent continue_agent laugh_agent whisper_agent
topic 1:	lay_agent follow_agent work_agent drop_agent reckon_agent talk_agent hold_agent add_agent lead_agent show_agent
topic 2:	add_agent like_agent wait_agent walk_agent watch_agent open_agent repeat_agent live_agent occur_agent lay_agent
topic 3:	bring_agent show_agent drop_agent learn_agent watch_agent carry_agent strike_agent continue_agent lead_agent raise_agent
topic 4:	reckon_agent exclaim_agent show_agent murmur_agent use_agent continue_agent like_agent lay_agent shake_agent lay_patient


In [121]:
topic_model=lda_model 

topic_docs=[]
for i in range(num_topics):
    topic_docs.append({})
for doc_id in range(len(corpus)):
    doc_topics=topic_model.get_document_topics(corpus[doc_id])
    for topic_num, topic_prob in doc_topics:
        topic_docs[topic_num][doc_id]=topic_prob

for i in range(num_topics):
    print("%s\n" % ' '.join([term for term, freq in topic_model.show_topic(i, topn=10)]))
    sorted_x = sorted(topic_docs[i].items(), key=operator.itemgetter(1), reverse=True)
    for k, v in sorted_x[:5]:
        print("%s\t%.3f\t%s" % (i,v, charNames[k]))
    print()
    
    

enter_agent smile_agent raise_agent spring_agent drop_agent walk_agent realize_agent continue_agent laugh_agent whisper_agent

0	0.999	he	aka her face, the inner guards	(Edgar Rice Burroughs___The Gods of Mars)
0	0.999	his	aka his ship, his courageousness	(Herman Melville___Moby Dick)
0	0.999	korak	aka "korak!  , chapter 27  korak	(Edgar Rice Burroughs___The Son of Tarzan)
0	0.998	meriem	aka all meriem,  meriem	(Edgar Rice Burroughs___The Son of Tarzan)
0	0.998	carthoris	aka a real man, a real warrior--carthoris of helium!", themselves	(Edgar Rice Burroughs___Thuvia, Maid of Mars)

lay_agent follow_agent work_agent drop_agent reckon_agent talk_agent hold_agent add_agent lead_agent show_agent

1	0.998	 the long boat of the marjorie w.	aka the long rope, swede	(Edgar Rice Burroughs___The Son of Tarzan)
1	0.998	it	aka these traditions, itself	(Mark Twain___A Connecticut Yankee in King Arthur's Court, Complete)
1	0.998	we	aka himself, every man that went	(Mark Twain___Adventures of Huckleb

In [None]:
# {"agent":[{"w":"wore","i":104276},
#           {"w":"had","i":104283},
#           {"w":"carried","i":104297},
#           {"w":"seemed","i":104306},
#           {"w":"said","i":104318},
#           {"w":"said","i":104358},
#           {"w":"said","i":107106},
#           {"w":"realized","i":107110},
#           {"w":"gave","i":107125}],
#  "NNPcount":3,
#  "names":[{"c":3,"n":"Dr. Whittaker"}],
#  "mod":[],
#  "speaking":[{"w":"`` Oh , good morning , '' ","i":104321},
#              {"w":"`` Of course , '' ","i":104353}],
#  "patient":[],
#  "g":2,
#  "id":0,
#  "poss":[{"w":"words","i":107127}]}