In [1]:
import operator
from collections import Counter
import math
from os import path
import json
import gensim
from gensim import corpora

In [2]:
def read_book_nlp_metadata(filename):
    """Read tab-separated book metadata into a dict keyed by book id.

    Every non-blank line is expected to hold at least five tab-separated
    columns, in this order: id, date, author, title, author gender.

    Args:
        filename: path of the metadata file.

    Returns:
        dict mapping id -> (date, author, title, author_gender), where
        ``date`` has been converted with ``int``.

    Raises:
        IndexError: a non-blank line has fewer than five columns.
        ValueError: the date column cannot be parsed as an integer.
    """
    metadata = {}
    with open(filename) as file:
        for line in file:
            # Blank (or whitespace-only) lines, e.g. a trailing newline at the
            # end of the file, carry no record and are skipped.
            if not line.strip():
                continue
            # Remove only the line terminator. A bare rstrip() would also eat
            # trailing tabs and thereby drop an empty last column.
            cols = line.rstrip("\r\n").split("\t")
            idd = cols[0]
            date = int(cols[1])
            author = cols[2]
            title = cols[3]
            # Trailing whitespace on the gender column is still trimmed, as before.
            author_gender = cols[4].rstrip()
            metadata[idd] = (date, author, title, author_gender)
    return metadata

def read_book_nlp_data(data_folder, ids=None):
    """Load the ``<id>.book`` JSON file of every requested book.

    Args:
        data_folder: directory containing one ``<id>.book`` JSON file per book.
        ids: iterable of book ids to load. When omitted, the ids are taken
            from the notebook-level ``metadata`` mapping (the original
            behaviour), which therefore has to exist before the call.

    Returns:
        dict mapping id -> parsed JSON content. Ids whose file does not
        exist in ``data_folder`` are silently left out.

    Raises:
        NameError: ``ids`` is omitted and no global ``metadata`` is defined.
    """
    if ids is None:
        # Backward-compatible fallback on the notebook global.
        ids = metadata

    data = {}

    for idd in ids:
        filename = path.join(data_folder, "%s.book" % idd)
        if path.exists(filename):
            # JSON interchange text is UTF-8; do not rely on the platform default.
            with open(filename, encoding="utf-8") as file:
                data[idd] = json.load(file)

    print(f"Read {len(data)} books")
    return data

In [34]:
def get_corpus_frequencies(alldata):
    """Count how often each word occurs per role across all characters.

    Args:
        alldata: mapping of book id -> book record. Each record has a
            "characters" list whose entries carry "agent", "patient" and
            "mod" lists of terms; the word of a term is read from its 'w' key.

    Returns:
        Tuple ``(agentFreqs, patientFreqs, modFreqs)`` of Counters keyed by word.
    """
    # One counter per role, visited in the order agent -> patient -> mod.
    role_counts = {"agent": Counter(), "patient": Counter(), "mod": Counter()}

    for book in alldata.values():
        for character in book["characters"]:
            for role, counts in role_counts.items():
                counts.update(term['w'] for term in character[role])

    return role_counts["agent"], role_counts["patient"], role_counts["mod"]

In [35]:
# Corpus-wide word counts per role (agent / patient / mod).
# NOTE(review): in the notebook as shown, `data` is assigned by the
# read_book_nlp_data cell that sits further down, so a fresh top-to-bottom
# run would hit a NameError here unless that cell is executed first.
agentFreqs, patientFreqs, modFreqs = get_corpus_frequencies(data)

In [51]:
# The 100 most frequent words of each role; these sets are later handed to
# get_character_actions, which leaves out any term whose word is in them.
agentFilter = {word for word, _ in agentFreqs.most_common(100)}
patientFilter = {word for word, _ in patientFreqs.most_common(100)}
modFilter = {word for word, _ in modFreqs.most_common(100)}

In [54]:
def get_character_actions(alldata, agentFilter=None, patientFilter=None, modFilter=None):
    """Collect the role-tagged terms attached to every named character.

    For each character that has at least one entry in "names", the words of
    its "agent", "patient" and "mod" terms are suffixed with "_agent",
    "_patient" and "_mod" respectively, skipping words found in the
    matching filter.

    Args:
        alldata: mapping of book id -> book record with a "characters" list.
        agentFilter: container of agent words to leave out (None = keep all).
        patientFilter: container of patient words to leave out (None = keep all).
        modFilter: container of mod words to leave out (None = keep all).

    Returns:
        Tuple ``(charactersAgentsDict, charactersAgentsList, charNameList)``:
        * dict mapping character name -> Counter of suffixed terms;
        * list with one token list per character, equal tokens grouped
          together in order of first appearance;
        * list of character names, index-aligned with the token lists.

    Note:
        The name used is the first entry of "names". Names are not made
        unique: when two characters share a name, the later one replaces the
        earlier one in the dict, while both lists keep one entry for each.
    """
    # None instead of a mutable `{}` default; an empty tuple filters nothing.
    role_filters = (
        ("agent", agentFilter if agentFilter is not None else ()),
        ("patient", patientFilter if patientFilter is not None else ()),
        ("mod", modFilter if modFilter is not None else ()),
    )

    charactersAgentsDict = {}
    charactersAgentsList = []
    charNameList = []

    for book in alldata.values():
        for character in book["characters"]:
            if not character["names"]:
                # Characters without any name are skipped entirely.
                continue
            char_name = character["names"][0]['n']

            term_counts = Counter(
                term["w"] + "_" + role
                for role, excluded in role_filters
                for term in character[role]
                if term["w"] not in excluded
            )

            charNameList.append(char_name)
            charactersAgentsDict[char_name] = term_counts
            # elements() repeats each term `count` times, in first-seen order.
            charactersAgentsList.append(list(term_counts.elements()))

    return charactersAgentsDict, charactersAgentsList, charNameList

In [28]:
# Load the book metadata, then the per-book BookNLP JSON files.
# Order matters: read_book_nlp_data iterates the global `metadata` by default.
# The paths are relative, so they resolve against the notebook's working directory.
# NOTE(review): cells placed above this one already use `data`; this cell has
# to be executed before them on a fresh kernel.
metadata=read_book_nlp_metadata("../../../../Code/comphumF20/data/pulitzer_metadata.txt")
data=read_book_nlp_data("../../../../Code/comphumF20/data/pulitzer_booknlp")

Read 94 books


In [55]:
# Per-character term counters, token lists and names, with the most frequent
# words of each role filtered out.
charAgents, charTexts, charNames = get_character_actions(
    data,
    agentFilter=agentFilter,
    patientFilter=patientFilter,
    modFilter=modFilter,
)

In [56]:
# Display the mapping character name -> Counter of role-suffixed terms.
# This prints every character in the corpus, so the output is very long.
charAgents

{'Dr. Whittaker': Counter({'carried_agent': 1}),
 'Mrs. Gunn': Counter(),
 'Sadie': Counter({'plowing_agent': 1,
          'meant_agent': 1,
          'answer_agent': 2,
          'breathing_agent': 1,
          'managed_agent': 1,
          'draw_agent': 1,
          'snapped_agent': 2,
          'cut_agent': 2,
          'slapped_agent': 1,
          'landed_agent': 1,
          'saying_agent': 2,
          'elaborated_agent': 1,
          'popping_agent': 1,
          'cleared_agent': 1,
          'learned_agent': 2,
          'inspected_agent': 1,
          'demanded_agent': 6,
          'admitted_agent': 1,
          'spewed_agent': 1,
          'spotted_agent': 1,
          'knocked_agent': 1,
          'led_agent': 2,
          'flashed_agent': 1,
          'sent_agent': 1,
          'fudge_agent': 1,
          'cook_agent': 1,
          'blazed_agent': 1,
          'paused_agent': 1,
          'waited_agent': 2,
          'twisted_agent': 1,
          'exclaimed_agent': 1,
    

In [8]:
# Display the per-character token lists.
# NOTE(review): the saved output below shows bare words ('wore', 'said', ...)
# without the "_agent"/"_patient"/"_mod" suffixes that get_character_actions
# appends, so it appears to predate the current definition; re-run to refresh.
charTexts

[['wore',
  'had',
  'carried',
  'seemed',
  'said',
  'said',
  'said',
  'realized',
  'gave'],
 ['were'],
 ['got',
  'recognize',
  'died',
  'knew',
  'laid',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'put',
  'had',
  'looked',
  'looked',
  'looked',
  'looked',
  'began',
  'thought',
  'nodded',
  'say',
  'spoke',
  'turned',
  'knows',
  'knows',
  'advising',
  'talk',
  'loses',
  'does',
  'crying'],
 ['left',
  'forgotten',
  'asked',
  'asked',
  'strode',
  'held',
  'changed',
  'looked',
  'looked',
  'looked',
  'frowning',
  'smiled',
  'smiled',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'said',
  'broke',
  'leaned',
  'speaking',
  'addressed',
  'praying',
  'spoke',
  'spoke',
  'closed',
  'took',
  'helped',
  'think',
  'wished',
  'wished',
  'hated',
  'hates',
  'hates',
  'hates',
  'hates',
  'hates',
  'hates',
  'hates',
  

In [17]:
# Display the character names; the list is index-aligned with charTexts
# (get_character_actions appends to both for each named character).
# NOTE(review): this cell's execution count is lower than that of the cell
# which currently assigns charNames, so the saved output may be stale.
charNames

['Dr. Whittaker',
 'Mrs. Gunn',
 'Sadie',
 'Jackson',
 'Lynch',
 'Mother',
 'Jessie',
 'Ettie Lou',
 'Mrs. Dekalb',
 'Mama',
 'Rufus',
 'Joel',
 'Amelia',
 'Victoria',
 'Abraham Lincoln',
 'George Bailey',
 'Jay',
 'Thomas',
 'Google',
 'Bell',
 'Miss Amy Field',
 'Ted',
 'Andrew',
 'Hannah',
 'Gordon Dekalb',
 'Roberts',
 'Uncle Ralph',
 'Grampa',
 'LaFollette',
 'Jesus',
 'Powell',
 'Miss Nettie Field',
 'Granmaw',
 'Dr. Dekalb',
 'Arthur',
 'Rhoda',
 'Walter',
 'John Henry',
 'Mr. Starr',
 'Carrie',
 'Mr. Ralph',
 'Jim-Wilson',
 'Tom',
 'Catherine',
 'Granma',
 'Jackie',
 'Morris',
 'Ann',
 'Brannick',
 'Celia Gunn',
 'Mary',
 'Cross',
 'Sally',
 'Ralph',
 'Tin Lizzie',
 'Daddy',
 'William S. Hart',
 'L&N',
 'Mrs. Follet',
 'Kate',
 'Oliver',
 'Charlie',
 'Sugar Babe',
 'Colm Larkin',
 'William Murray',
 'Dr. Kopeckny',
 'Michaels',
 'Helen',
 'Virgil',
 'Liwie',
 'Leo',
 'Nora',
 'Dr. Chouteau',
 'Lawrence',
 'Jane',
 'Lauren',
 'Dr. Fuller',
 'Copernicus',
 'Theo',
 'Agnes',
 'Ban

In [57]:
# Build the vocabulary and bag-of-words corpus: one document per character.
dictionary = corpora.Dictionary(charTexts)
corpus = [dictionary.doc2bow(text) for text in charTexts]
num_topics = 10
# LDA training is stochastic; a fixed seed makes the topics reproducible
# from one run of the notebook to the next.
lda_random_state = 0
lda_model = gensim.models.LdaModel(corpus, id2word=dictionary,
                                   num_topics=num_topics, passes=3, alpha='auto',
                                   random_state=lda_random_state)

In [58]:
# Print the ten highest-weighted terms of every topic on one line.
for topic_id in range(num_topics):
    top_terms = [term for term, _ in lda_model.show_topic(topic_id, topn=10)]
    print("topic %s:\t%s" % (topic_id, ' '.join(top_terms)))

topic 0:	remember_patient invited_agent reading_patient accustomed_patient playing_patient purchased_agent whined_agent aged_agent departed_agent married_agent
topic 1:	read_patient artist_mod shouts_agent join_patient jes_agent divorced_patient giving_patient haunted_patient lose_patient play_patient
topic 2:	worked_agent spent_agent managed_agent stayed_agent followed_agent replied_agent leaned_agent pointed_agent wished_agent saying_agent
topic 3:	needed_agent looking_agent stepped_agent trying_agent coming_agent find_agent waited_agent died_agent fell_agent raised_agent
topic 4:	ast_agent say_patient sing_agent crouched_agent grab_agent pull_patient helped_agent counted_agent marry_agent Get_patient
topic 5:	met_agent arrived_agent imagined_agent remember_agent learned_agent noticed_agent glanced_agent understood_agent thinking_agent understand_agent
topic 6:	thinks_agent looks_agent goes_agent feels_agent tells_agent takes_agent comes_agent sees_agent does_agent asks_agent
topic 7

In [59]:
topic_model = lda_model

# One {document index: topic probability} mapping per topic.
topic_docs = [{} for _ in range(num_topics)]
for doc_id, bow in enumerate(corpus):
    for topic_num, topic_prob in topic_model.get_document_topics(bow):
        topic_docs[topic_num][doc_id] = topic_prob

# For each topic: its ten top terms, then the five documents (characters)
# with the highest probability for that topic.
for topic_id, doc_probs in enumerate(topic_docs):
    top_terms = [term for term, _ in topic_model.show_topic(topic_id, topn=10)]
    print("%s\n" % ' '.join(top_terms))
    ranked = sorted(doc_probs.items(), key=operator.itemgetter(1), reverse=True)
    for doc_index, prob in ranked[:5]:
        print("%s\t%.3f\t%s" % (topic_id, prob, charNames[doc_index]))
    print()
    
    

remember_patient invited_agent reading_patient accustomed_patient playing_patient purchased_agent whined_agent aged_agent departed_agent married_agent

0	0.812	Mr. Treete
0	0.781	Jesus
0	0.777	Red Cross
0	0.741	Scotty
0	0.732	Billie Martin

read_patient artist_mod shouts_agent join_patient jes_agent divorced_patient giving_patient haunted_patient lose_patient play_patient

1	0.810	Bertie
1	0.791	Emil
1	0.783	Frances FitzGerald
1	0.781	Tilda
1	0.778	Winfiel

worked_agent spent_agent managed_agent stayed_agent followed_agent replied_agent leaned_agent pointed_agent wished_agent saying_agent

2	0.986	Ralph
2	0.980	Roy
2	0.973	Savage
2	0.970	Lois
2	0.970	Mother

needed_agent looking_agent stepped_agent trying_agent coming_agent find_agent waited_agent died_agent fell_agent raised_agent

3	0.979	Genaro
3	0.977	Neddy
3	0.976	Irene
3	0.976	Catherine
3	0.970	Clancy

ast_agent say_patient sing_agent crouched_agent grab_agent pull_patient helped_agent counted_agent marry_agent Get_patient

4	0.8

In [None]:
# Example of one entry of a book's "characters" list (here the record named "Dr. Whittaker"):
# {"agent":[{"w":"wore","i":104276},
#           {"w":"had","i":104283},
#           {"w":"carried","i":104297},
#           {"w":"seemed","i":104306},
#           {"w":"said","i":104318},
#           {"w":"said","i":104358},
#           {"w":"said","i":107106},
#           {"w":"realized","i":107110},
#           {"w":"gave","i":107125}],
#  "NNPcount":3,
#  "names":[{"c":3,"n":"Dr. Whittaker"}],
#  "mod":[],
#  "speaking":[{"w":"`` Oh , good morning , '' ","i":104321},
#              {"w":"`` Of course , '' ","i":104353}],
#  "patient":[],
#  "g":2,
#  "id":0,
#  "poss":[{"w":"words","i":107127}]}