# Set up environment

In [1]:
!pip install boto3

import io
import os
import re
import json
import time
import boto3
import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pymongo
from pymongo import MongoClient



In [2]:
cd '/content/drive/My Drive/FRUS/'

/content/drive/.shortcut-targets-by-id/128/FRUS


**Locate AWS credentials and start EC2 instance**

In [2]:
aws_id, aws_secret_key = pd.read_csv('DesiPilla_accessKeys.csv').values[0]

In [3]:
# Configure s3 resources
bucket = "frus-corenlp"
s3 = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret_key)
s3_resource = boto3.resource('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret_key)

# Configure Lambda client
lambda_client = boto3.client('lambda', 'us-east-1', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret_key)

# Configure SQS client
sqs = boto3.client('sqs', 'us-east-1', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret_key)
queue_url = "https://sqs.us-east-1.amazonaws.com/654288102859/frus-unparsed"

In [4]:
instance_id = 'i-02d5c9566a7b7c3de'
ec2 = boto3.resource('ec2', 
                     'us-east-1',
                     aws_access_key_id=aws_id, 
                     aws_secret_access_key=aws_secret_key)

instance = ec2.Instance(instance_id)
instance.state

{'Code': 16, 'Name': 'running'}

In [6]:
# Request a start, then poll the instance state every 5 seconds until it
# reports 'running'.
instance.start()
print('Initializing...')
i = 0
while True:
    if instance.state['Name'] == 'running':
        break
    print('\t{} seconds elapsed... ({})'.format(i, instance.state['Name']))
    time.sleep(5)
    i += 5
    # Build a new handle before the next check -- presumably so that the state
    # is looked up again rather than reused (TODO confirm against boto3 docs).
    instance = ec2.Instance(instance_id)
print('The EC2 instance is now running.')

Initializing...
	0 seconds elapsed... (pending)
	5 seconds elapsed... (pending)
	10 seconds elapsed... (pending)
	15 seconds elapsed... (pending)
	20 seconds elapsed... (pending)
	25 seconds elapsed... (pending)
	30 seconds elapsed... (pending)
	35 seconds elapsed... (pending)
The EC2 instance is now running.


**Connect to the MongoDB server running on the EC2 instance**

In [None]:
# Print current IP address
!curl ipecho.net/plain

104.154.205.204

In [None]:
mongo = MongoClient('mongodb://52.71.228.156:27017/')   # AWS
# mongo = MongoClient('mongodb://68.84.71.242:27017/FRUS')      # Desi's Laptop
mongo.list_database_names()

**Select or create a database named `FRUS`**

In [None]:
db = mongo.FRUS
db.list_collection_names()

['ParsedDocs', 'Taft']

In [None]:
collection = db.ParsedDocs

# Taft Collection

**Take the prepared data and convert it to a list of JSON-like records**

In [None]:
def prepare_collection(path=None, df=None):
    """Turn a table of documents into a list of records for `insert_many`.

    Parameters
    ----------
    path : str, optional
        CSV file to load. When given it takes precedence over `df`, and the
        `website` / `text` columns are renamed to `source` / `content`.
    df : pandas.DataFrame, optional
        Already-loaded table, used when `path` is not given. No columns are
        renamed. The frame is copied, so the caller's object is not modified.

    Returns
    -------
    list of dict
        One dict per row, with `stanford` set to 0 and `_id` copied from the
        `id` column.

    Raises
    ------
    ValueError
        If neither `path` nor `df` is supplied.
    """
    if path:
        df = pd.read_csv(path).rename(columns={'website':'source', 'text':'content'})
    elif df is None:
        raise ValueError("Either `path` or `df` must be provided.")
    else:
        # Work on a copy so the two columns added below do not leak into the
        # frame the caller passed in.
        df = df.copy()

    df['stanford'] = 0
    df['_id'] = df['id']
    return df.to_dict('records')

In [None]:
taft_collection = prepare_collection('Raw Data/Combined Data/taft_df.csv')
taft_collection[:2]

[{'Unnamed: 0': 106432,
  'context': 'The Ambassador in France ( Herrick ) to the Secretary of State American Embassy , Paris , July 28, 1914, 4 p.m. [ Received 7:30 p.m. ] [Telegram] To be communicated to the President: Situation in Europe is regarded here as the gravest in history. It is apprehended that civilization is threatened by demoralization which would follow a general conflagration. Demonstrations made against war here last night by laboring classes; it is said to be the first instance of its kind in France. It is felt that if Germany once mobilizes no backward step will be taken. France has strong reliance on her army but it is not giving way to undue excitement. There is faith and reliance on our high ideals and purposes, so that I believe expression from our nation would have great weight in this crisis. My opinion is encouraged at reception given utterances of British Minister for Foreign Affairs. I believe that a strong plea for delay and moderation from the President o

**Add the JSON file to the MongoDB as a collection**

In [None]:
db.Taft.insert_many(taft_collection, ordered=False)
print("Data has been exported to MongoDB server.")

Data has been exported to MongoDB server.


In [None]:
db.list_collection_names()

['Taft']

# All Presidents

**Take the prepared data and convert it to a list of JSON-like records**

In [None]:
all_presidents_collection = prepare_collection('Raw Data/Combined Data/all_presidents_df.csv')
all_presidents_collection[:2]

[{'Unnamed: 0': 0,
  'context': 'Memorandum of Conversation, by the Officer in Charge of West, Central, and East Africa Affairs ( Feld ) [ Washington ,] February 20, 1952 . Participants: Ford Foundation—Mr. Carl B. Spaeth Mr. John Howard Mr. Howard Tolley AF —Mr. Bourgerie Mr. Feld Mr. Meier DRN —Mr. Brown NEA/P —Mr. Fisk Mrs. Sloan Messrs. Spaeth , Howard and Tolley of the Ford Foundation came to the Department on Wednesday, February 20, 1952, to discuss in general terms the Foundation’s interest in extending its overseas activities to Africa. Mr. Bourgerie began the discussion by pointing out that, due to political considerations and suspicion of American motives, it appeared unlikely that much could be done in Portuguese possessions, and perhaps to a somewhat lesser extent, in Belgian and French possessions, although in each case for slightly different reasons. Broadly the Portuguese have not favored our sending American government or private experts to Angola and Mozambique for fea

**Add the JSON file to the MongoDB as a collection**

In [None]:
db.AllPresidents.insert_many(all_presidents_collection, ordered=False)
print("Data has been exported to MongoDB server.")

Data has been exported to MongoDB server.


In [None]:
db.list_collection_names()

['AllPresidents', 'Taft']

# Run Query

In [None]:
collection = db.AllPresidents
query = {'source':'Taft'}
found = collection.find(query)

for doc in found[:1]:
    pprint.pprint(doc)

print("\n\n{:,} total documents found.".format(collection.count_documents(query)))

{'Unnamed: 0': 106432,
 '_id': ObjectId('5f70ba97660ef055f82f2ad0'),
 'context': 'The Ambassador in France ( Herrick ) to the Secretary of State '
            'American Embassy , Paris , July 28, 1914, 4 p.m. [ Received 7:30 '
            'p.m. ] [Telegram] To be communicated to the President: Situation '
            'in Europe is regarded here as the gravest in history. It is '
            'apprehended that civilization is threatened by demoralization '
            'which would follow a general conflagration. Demonstrations made '
            'against war here last night by laboring classes; it is said to be '
            'the first instance of its kind in France. It is felt that if '
            'Germany once mobilizes no backward step will be taken. France has '
            'strong reliance on her army but it is not giving way to undue '
            'excitement. There is faith and reliance on our high ideals and '
            'purposes, so that I believe expression from our nation w

# Shut down EC2 instance

In [13]:
instance.state

{'Code': 16, 'Name': 'running'}

In [None]:
# Request a stop, then poll the instance state every 5 seconds until it
# reports 'stopped'.
instance.stop()
print('Stopping...')
i = 0
while True:
    if instance.state['Name'] == 'stopped':
        break
    print('\t{} seconds elapsed... ({})'.format(i, instance.state['Name']))
    time.sleep(5)
    i += 5
    # Build a new handle before the next check -- presumably so that the state
    # is looked up again rather than reused (TODO confirm against boto3 docs).
    instance = ec2.Instance(instance_id)
print('The EC2 instance has been stopped.')

Stopping...
	0 seconds elapsed... (stopping)
	5 seconds elapsed... (stopping)
	10 seconds elapsed... (stopping)


# Stanza

See https://stanfordnlp.github.io/stanza/depparse.html for how to make Stanza's dependency-parse output look like Stanford CoreNLP's.

In [None]:
# Install stanza; note that the prefix "!" is not needed if you are running in a terminal
!pip install stanza

# Import stanza
import stanza





In [None]:
#stanza.download('en', '.') # download English model

In [None]:
#stanza.install_corenlp()

In [None]:
nlp = stanza.Pipeline('en', '.', processors="tokenize, pos, lemma, ner, depparse") # initialize English neural pipeline

INFO:stanza:Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

INFO:stanza:Use device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [None]:
text = '''Javanese Grand Waffalo Shinzo Abesson has asked the colonial vizarate to look into the alleged spying activities on the
Javanese tribes and companies raised by the Wikileaks website in telephone talks with colonial vizar Joel Bowden
Wednesday, local media reported . On September 2, 2015, Lorien dopplemats confirmed the European Disunion has extended the sanctions imposed on Mordor
and Harad, citizens supporting pro-Elf separatists in Eastern Mordor, for a further six months .'''

parsed = nlp(text)
sentence = parsed.sentences[0]

In [None]:
parse_results = [sentence.to_dict() for sentence in parsed.sentences]
dep_strs = [sentence.dependencies_string() for sentence in parsed.sentences]

In [None]:
message = {
    "parse_results": json.dumps(parse_results),
    "dependency_strings": json.dumps(dep_strs),
}

In [None]:
def parse(doc):
    """Run the module-level `nlp` pipeline on one document and package the output.

    Parameters
    ----------
    doc : mapping (dict or pandas.Series)
        Must provide `text` (the document body) and `num_words`.

    Returns
    -------
    dict
        Two JSON-encoded lists with one entry per parsed sentence:
        `parse_results` (each sentence's `to_dict()`) and
        `dependency_strings` (each sentence's `dependencies_string()`).
    """
    text = doc["text"]

    # Only keep the first 3,000 words. `num_words` is read by key (not by
    # attribute) so that plain dicts work as well as pandas rows.
    if doc["num_words"] > 3000:
        text = ' '.join(text.split(' ')[:3000])

    # Parse document text
    parsed = nlp(text)

    # Isolate sentence results and dependency strings
    parse_results = [sentence.to_dict() for sentence in parsed.sentences]
    dep_strs = [sentence.dependencies_string() for sentence in parsed.sentences]

    # Construct message
    return {
        "parse_results": json.dumps(parse_results),
        "dependency_strings": json.dumps(dep_strs),
    }

In [None]:
# Load a document
df = pd.read_csv('Raw Data/Combined Data/taft_df.csv').iloc[:, 1:]
doc = df.iloc[2]

text                      The Ambassador in Austria-Hungary ( Penfield )...
title                     [35] The Ambassador in Austria-Hungary ( Penfi...
url                                                                  310043
date                                                             1914-07-28
website                                                                Taft
language                                                            English
decade                                                                 1910
num_words                                                               191
num_capitalized_words                                                    48
perc_capitalized_words                                             0.251309
Name: 2, dtype: object


In [None]:
message = parse(doc)

# Stanford CoreNLP

In [None]:
!pip install stanfordcorenlp

Collecting stanfordcorenlp
  Downloading https://files.pythonhosted.org/packages/35/cb/0a271890bbe3a77fc1aca2bc3a58b14e11799ea77cb5f7d6fb0a8b4c46fa/stanfordcorenlp-3.9.1.1-py2.py3-none-any.whl
Installing collected packages: stanfordcorenlp
Successfully installed stanfordcorenlp-3.9.1.1


In [None]:
# !wget https://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip https://nlp.stanford.edu/software/stanford-english-corenlp-2018-10-05-models.jar

In [None]:
# !unzip stanford-corenlp-full-2018-10-05.zip

In [None]:
# !mv stanford-english-corenlp-2018-10-05-models.jar stanford-corenlp-full-2018-10-05

In [None]:
import stanfordcorenlp
from stanfordcorenlp import StanfordCoreNLP

from nltk.tree import Tree

In [None]:
# Set the variable in the kernel's own environment. A `!export ...` line runs
# in a separate shell, so its effect would not be visible to later cells.
os.environ["CORENLP_HOME"] = "stanford-corenlp-full-2018-10-05"

In [None]:
!cd '/content/drive/My Drive/FRUS/'
!ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [None]:
nlp = StanfordCoreNLP("stanford-corenlp-full-2018-10-05", timeout=600, lang="en", memory="8g")
# nlp = StanfordCoreNLP("maxio-efs/stanford-corenlp-full-2018-10-05", timeout=600, lang="en", memory='8g')
# nlp = StanfordCoreNLP(".", timeout=600, lang="en", memory='8g')

In [None]:
nlp.annotate("I")

'{\n  "sentences": [\n    {\n      "index": 0,\n      "parse": "(ROOT\\n  (NP (PRP I)))",\n      "basicDependencies": [\n        {\n          "dep": "ROOT",\n          "governor": 0,\n          "governorGloss": "ROOT",\n          "dependent": 1,\n          "dependentGloss": "I"\n        }\n      ],\n      "enhancedDependencies": [\n        {\n          "dep": "ROOT",\n          "governor": 0,\n          "governorGloss": "ROOT",\n          "dependent": 1,\n          "dependentGloss": "I"\n        }\n      ],\n      "enhancedPlusPlusDependencies": [\n        {\n          "dep": "ROOT",\n          "governor": 0,\n          "governorGloss": "ROOT",\n          "dependent": 1,\n          "dependentGloss": "I"\n        }\n      ],\n      "openie": [\n      ],\n      "kbp": [\n      ],\n      "entitymentions": [\n      ],\n      "tokens": [\n        {\n          "index": 1,\n          "word": "I",\n          "originalText": "I",\n          "lemma": "I",\n          "characterOffsetBegin": 0,\n 

In [None]:
nlp.annotate("Barack Obama was the president.")

'{\n  "sentences": [\n    {\n      "index": 0,\n      "parse": "(ROOT\\n  (S\\n    (NP (NNP Barack) (NNP Obama))\\n    (VP (VBD was)\\n      (NP (DT the) (NN president)))\\n    (. .)))",\n      "basicDependencies": [\n        {\n          "dep": "ROOT",\n          "governor": 0,\n          "governorGloss": "ROOT",\n          "dependent": 5,\n          "dependentGloss": "president"\n        },\n        {\n          "dep": "compound",\n          "governor": 2,\n          "governorGloss": "Obama",\n          "dependent": 1,\n          "dependentGloss": "Barack"\n        },\n        {\n          "dep": "nsubj",\n          "governor": 5,\n          "governorGloss": "president",\n          "dependent": 2,\n          "dependentGloss": "Obama"\n        },\n        {\n          "dep": "cop",\n          "governor": 5,\n          "governorGloss": "president",\n          "dependent": 3,\n          "dependentGloss": "was"\n        },\n        {\n          "dep": "det",\n          "governor": 5,\n  

In [None]:
text = '''Javanese Grand Waffalo Shinzo Abesson has asked the colonial vizarate to look into the alleged spying activities on the
Javanese tribes and companies raised by the Wikileaks website in telephone talks with colonial vizar Joel Bowden
Wednesday, local media reported . On September 2, 2015, Lorien dopplemats confirmed the European Disunion has extended the sanctions imposed on Mordor
and Harad, citizens supporting pro-Elf separatists in Eastern Mordor, for a further six months .'''
annotated = nlp.annotate(text)
annotated = json.loads(annotated)

In [None]:
annotated

In [None]:
for sent in annotated["sentences"]:
    parse_str = sent["parse"]
    Tree.fromstring(parse_str).pretty_print()
    print('\n\n')

                                                                                                                                    ROOT                                                                                                                                                       
                                                                                                                                     |                                                                                                                                                          
                                                                                                                                     S                                                                                                                                                         
                   __________________________________________________________________________________________________________________|_

In [None]:
# Build one <Sentence> XML fragment per annotated sentence and append it to `xml`.
# NOTE(review): this cell reads `sent_counter`, `num_sent_to_parse`, `doc_id`,
# `source` and `xml` without defining them, so it only runs when those names
# already exist in the kernel namespace.
for sent in annotated["sentences"]:
    # Stop once the counter has passed `num_sent_to_parse`; otherwise advance it
    if sent_counter > num_sent_to_parse:
        break;
    else:
        sent_counter += 1

    # Create header for <Sentence> element (id is "<doc_id>_<sent_counter>")
    sent_xml = '\n\n<Sentence date = "20201019" id = "{}_{}" source = "{}" sentence = "TRUE">'.format(doc_id, sent_counter, source)

    # Add <Text> element: the sentence's original tokens joined by single spaces
    sentence_text = ' '.join([word['originalText'] for word in sent['tokens']])
    sent_xml += "\n<Text>\n" + sentence_text + "\n</Text>"

    # Add <Parse> element holding the sentence's "parse" string
    parse_str = sent["parse"]
    sent_xml += "\n<Parse>\n" + parse_str + "\n</Parse>\n\n</Sentence>"

    # Add sentence to full XML string
    xml += sent_xml

In [None]:
def create_xml(doc, max_sentences=10, date="20201019"):
    """Annotate a document with the module-level `nlp` client and render it as XML.

    Parameters
    ----------
    doc : mapping (dict or pandas.Series)
        Must provide `text`, `url` (used as the document id) and `website`
        (used as the `source` attribute). Only the first 4,000 characters of
        `text` are annotated.
    max_sentences : int, optional
        Maximum number of leading sentences to include (default 10).
    date : str, optional
        Value written to every sentence's `date` attribute.

    Returns
    -------
    str
        A `<Sentences>` element containing one `<Sentence>` per kept sentence,
        each with a `<Text>` and a `<Parse>` child.
    """
    # Imported locally: other cells assign a string to the global name `xml`.
    from xml.sax.saxutils import escape

    def _attr(value):
        # Escape a value for use inside a double-quoted XML attribute.
        return escape(str(value), {'"': '&quot;'})

    text = doc["text"][:4000]
    doc_id = doc["url"]
    source = doc["website"]

    annotated = json.loads(nlp.annotate(text))

    pieces = ['<Sentences>']
    # Sentence numbers start at 2, which matches the ids this function has
    # always produced (the first sentence is "<doc_id>_2").
    # NOTE(review): confirm whether numbering from 1 was intended.
    for sent_number, sent in enumerate(annotated["sentences"][:max_sentences], start=2):
        # Header for the <Sentence> element
        header = '\n\n<Sentence date = "{}" id = "{}_{}" source = "{}" sentence = "TRUE">'.format(
            _attr(date), _attr(doc_id), sent_number, _attr(source))

        # <Text>: the original tokens joined by single spaces. Escaped so that
        # characters such as '&' or '<' cannot break the markup.
        sentence_text = ' '.join(word['originalText'] for word in sent['tokens'])
        text_xml = "\n<Text>\n" + escape(sentence_text) + "\n</Text>"

        # <Parse>: the constituency parse string, escaped for the same reason
        parse_xml = "\n<Parse>\n" + escape(sent["parse"]) + "\n</Parse>\n\n</Sentence>"

        pieces.append(header + text_xml + parse_xml)

    # Close XML element
    pieces.append("\n\n</Sentences>")
    return ''.join(pieces)

In [None]:
# Load a document
df = pd.read_csv('Raw Data/Combined Data/taft_df.csv').iloc[:, 1:]
doc = df.iloc[2]
doc.to_json()

'{"text":"The Ambassador in Austria-Hungary ( Penfield ) to the Secretary of State American Embassy , Vienna , July 28, 1914, 7 p.m. [ Received July 29, 10:12 a.m. ] [Telegram] Minister of Foreign Affairs requests following declaration of war be cabled: To end subversive intrigues issuing from Belgrade directed against the territorial integrity Austro-Hungarian Monarchy, Imperial and Royal Government July 23 sent to Royal Servian Government note in which were formulated series of demands, for acceptation of which delay of forty-eight hours was granted Royal Government. Servian Government not having replied to note in a satisfactory manner, Imperial and Royal Government is under necessity in protecting its rights and interests, of having recourse to the force of arms. Austria-Hungary having just addressed to Servia formal declaration in conformity with Article 1 of the convention October 18, 1907, relative to opening of hostilities, therefore considers itself in a state of war with Serv

In [None]:
# Load a document
df = pd.read_csv('Raw Data/Combined Data/taft_df.csv').iloc[:, 1:]
doc = df.iloc[2]

# Parse document and get XML output
%time xml = create_xml(doc)

# Save output
with open("parsed/test.xml", "w") as f:
    f.write(xml)

# Print XML
print(xml)

CPU times: user 14.5 ms, sys: 2.26 ms, total: 16.8 ms
Wall time: 5.95 s
<Sentences>

<Sentence date = "20201019" id = "310043_2" source = "Taft" sentence = "TRUE">
<Text>
The Ambassador in Austria-Hungary ( Penfield ) to the Secretary of State American Embassy , Vienna , July 28 , 1914 , 7 p.m. [ Received July 29 , 10:12 a.m. ] [ Telegram ] Minister of Foreign Affairs requests following declaration of war be cabled : To end subversive intrigues issuing from Belgrade directed against the territorial integrity Austro-Hungarian Monarchy , Imperial and Royal Government July 23 sent to Royal Servian Government note in which were formulated series of demands , for acceptation of which delay of forty-eight hours was granted Royal Government .
</Text>
<Parse>
(ROOT
  (FRAG
    (NP
      (NP (DT The) (NNP Ambassador))
      (PP (IN in)
        (NP (NNP Austria-Hungary)))
      (PRN (-LRB- -LRB-)
        (NP (NNP Penfield))
        (-RRB- -RRB-)))
    (PP (TO to)
      (NP
        (NP (DT the) (

In [None]:
import xml.etree.ElementTree as ET

path = 'test.xml'
tree = ET.iterparse(path)
for event, elem in tree:
    print(event, elem)

end <Element 'Text' at 0x7f76bf679ef8>
end <Element 'Parse' at 0x7f76bf679f98>
end <Element 'Sentence' at 0x7f76bf7faea8>
end <Element 'Text' at 0x7f76bf683188>
end <Element 'Parse' at 0x7f76bf6830e8>
end <Element 'Sentence' at 0x7f76bf683138>
end <Element 'Text' at 0x7f76bf683368>
end <Element 'Parse' at 0x7f76bf683458>
end <Element 'Sentence' at 0x7f76bf683408>
end <Element 'Text' at 0x7f76bf683b38>
end <Element 'Parse' at 0x7f76bf683b88>
end <Element 'Sentence' at 0x7f76bf683ae8>
end <Element 'Text' at 0x7f76bf683db8>
end <Element 'Parse' at 0x7f76bf683e08>
end <Element 'Sentence' at 0x7f76bf683d68>
end <Element 'Text' at 0x7f76bf7d9048>
end <Element 'Parse' at 0x7f76bf7d9098>
end <Element 'Sentence' at 0x7f76bf683f98>
end <Element 'Sentences' at 0x7f76bf7fa818>


In [None]:
df.iloc[0].text

'The Ambassador in France ( Herrick ) to the Secretary of State American Embassy , Paris , July 28, 1914, 4 p.m. [ Received 7:30 p.m. ] [Telegram] To be communicated to the President: Situation in Europe is regarded here as the gravest in history. It is apprehended that civilization is threatened by demoralization which would follow a general conflagration. Demonstrations made against war here last night by laboring classes; it is said to be the first instance of its kind in France. It is felt that if Germany once mobilizes no backward step will be taken. France has strong reliance on her army but it is not giving way to undue excitement. There is faith and reliance on our high ideals and purposes, so that I believe expression from our nation would have great weight in this crisis. My opinion is encouraged at reception given utterances of British Minister for Foreign Affairs. I believe that a strong plea for delay and moderation from the President of the United States would meet with t

# Lambda process texts

In [None]:
cd ../..

/content/drive/.shortcut-targets-by-id/119/FRUS


## Send documents to SQS

In [None]:
ls "Raw Data/Combined Data/all_presidents_df.csv"

 all_presidents_df.csv       [0m[01;34m'Old Versions'[0m/   TaftFormatted.csv
 AllPresidentsFormatted.csv   taft_df.csv


Load all documents

In [None]:
all_presidents = pd.read_csv("Raw Data/Combined Data/all_presidents_df.csv").iloc[:, 1:]
all_presidents.head()

Unnamed: 0,id,text,title,url,date,website,language,decade,num_words,num_capitalized_words,perc_capitalized_words,batch
0,1,"Memorandum of Conversation, by the Officer in ...","[1] Memorandum of Conversation, by the Officer...",1,1952-02-20,Truman,English,1950,1232,186,0.150974,0
1,2,The Ambassador in the Union of South Africa ( ...,[2] The Ambassador in the Union of South Afric...,2,1952-03-14,Truman,English,1950,1152,202,0.175347,1
2,3,The Consul General at Salisbury ( Sims ) to th...,[3] The Consul General at Salisbury ( Sims ) t...,3,1952-03-18,Truman,English,1950,2530,306,0.120949,2
3,4,The Ambassador in Liberia ( Dudley ) to the De...,[4] The Ambassador in Liberia ( Dudley ) to th...,4,1952-01-03,Truman,English,1950,3782,411,0.108673,3
4,5,The Consul General at Salisbury ( Sims ) to th...,[5] The Consul General at Salisbury ( Sims ) t...,5,1952-06-16,Truman,English,1950,459,55,0.119826,4


Select only the documents from the presidents listed in `president_list` (here Johnson and Nixon-Ford), sorted by word count

In [None]:
president_list = ["Johnson", "Nixon-Ford"]
sub = all_presidents[all_presidents.website.isin(president_list)].sort_values(by="num_words")
print(sub.shape)
sub.head()

(29610, 12)


Unnamed: 0,id,text,title,url,date,website,language,decade,num_words,num_capitalized_words,perc_capitalized_words,batch
298499,540818,"32. Memorandum of Conversation Washington , Ja...",32. Memorandum of Conversation,540818,1965-01-22,Johnson,English,1960,27,8,0.296296,49
322958,177074,220. Telegram From the Central Intelligence Ag...,220. Telegram From the Central Intelligence Ag...,177074,1972-03-03,Nixon-Ford,English,1970,27,13,0.481481,8
322931,176804,190. Telegram From the Central Intelligence Ag...,190. Telegram From the Central Intelligence Ag...,176804,1970-02-10,Nixon-Ford,English,1970,28,15,0.535714,31
322942,1769110,201. Telegram From the Central Intelligence Ag...,201. Telegram From the Central Intelligence Ag...,1769110,1970-12-04,Nixon-Ford,English,1970,29,14,0.482759,42
298496,540518,"29. Memorandum of Conversation Washington , Ja...",29. Memorandum of Conversation,540518,1965-01-19,Johnson,English,1960,29,10,0.344828,46


In [None]:
# Loop over every document in the subset
# Push each row of the subset onto the SQS queue as its own message
for i in range(sub.shape[0]):
    # Progress line after every block of 1,000 documents
    if (i + 1) % 1000 == 0:
        print("Sending document {:,}/{:,}".format(i+1, sub.shape[0]))

    # Serialise row i to JSON and enqueue it
    doc = sub.iloc[i]
    response = sqs.send_message(QueueUrl=queue_url, MessageBody=doc.to_json())

    # Report any reply whose HTTP status is not 200
    if response["ResponseMetadata"]["HTTPStatusCode"] != 200:
        print("Error sending document {}".format(doc.id))

print("Done")

Sending document 1,000/29,610
Sending document 2,000/29,610
Sending document 3,000/29,610
Sending document 4,000/29,610
Sending document 5,000/29,610
Sending document 6,000/29,610
Sending document 7,000/29,610
Sending document 8,000/29,610
Sending document 9,000/29,610
Sending document 10,000/29,610
Sending document 11,000/29,610
Sending document 12,000/29,610
Sending document 13,000/29,610
Sending document 14,000/29,610
Sending document 15,000/29,610
Sending document 16,000/29,610
Sending document 17,000/29,610
Sending document 18,000/29,610
Sending document 19,000/29,610
Sending document 20,000/29,610
Sending document 21,000/29,610
Sending document 22,000/29,610
Sending document 23,000/29,610
Sending document 24,000/29,610
Sending document 25,000/29,610
Sending document 26,000/29,610
Sending document 27,000/29,610
Sending document 28,000/29,610
Sending document 29,000/29,610
Done


## Download document from SQS

In [None]:
resp = sqs.receive_message(QueueUrl = queue_url, AttributeNames=['All'])
resp

{'Messages': [{'Attributes': {'ApproximateFirstReceiveTimestamp': '1605732774212',
    'ApproximateReceiveCount': '1',
    'SenderId': 'AIDAZQVU4EXF2VQFAMFUP',
    'SentTimestamp': '1605730834136'},
   'Body': '{"id":210005,"text":"2. Memorandum From the Deputy Assistant Secretary of State for African Affairs ( Fredericks ) to Secretary of State Rusk 0 Washington , August 11, 1961 . SUBJECT Regional Conference\\u2014Nicosia 1 Although Governor Williams on his return will no doubt wish to report personally on the views of our ambassadors to Northern African countries expressed during the Nicosia conference, I believe that meanwhile you will find of great interest the records of that conference here attached. One is in the form of a signed memorandum from the ambassadors to Governor Williams highlighting the main points which they wished to have emphasized for the Department\'s benefit. The other is a summary of the discussions during the conference. 2 I would like to call your attention

In [None]:
doc = json.loads(resp["Messages"][0]["Body"])
msg_receipt = resp["Messages"][0]["ReceiptHandle"]
print(doc)

{'id': 210005, 'text': "2. Memorandum From the Deputy Assistant Secretary of State for African Affairs ( Fredericks ) to Secretary of State Rusk 0 Washington , August 11, 1961 . SUBJECT Regional Conference—Nicosia 1 Although Governor Williams on his return will no doubt wish to report personally on the views of our ambassadors to Northern African countries expressed during the Nicosia conference, I believe that meanwhile you will find of great interest the records of that conference here attached. One is in the form of a signed memorandum from the ambassadors to Governor Williams highlighting the main points which they wished to have emphasized for the Department's benefit. The other is a summary of the discussions during the conference. 2 I would like to call your attention to the following points in particular: 1. The consensus of the conference was that the AFN countries are not irrevocably committed to the Soviet bloc nor are they safely aligned with the Free World. 2. The two majo

In [None]:
def parse_doc(doc):
    """Parse one queued document and build the record to store in S3.

    Parameters
    ----------
    doc : mapping
        Must provide `id`, `text`, `title`, `url`, `decade`, `date`,
        `website` and `num_words`. A missing key raises `KeyError`.

    Returns
    -------
    tuple
        `(message, key)` on success, where `message` is the record dict
        (metadata plus JSON-encoded `parse_results` and `dependency_strings`)
        and `key` is `"parsed/<website>/<decade>/<id>.json"`.
        `('error', None)` if parsing or serialisation fails.
    """
    # Extract document information
    doc_id = doc["id"]
    text = doc["text"]
    title = doc["title"]
    url = doc["url"]
    decade = doc["decade"]
    date = doc["date"]
    president = doc["website"]

    # Only keep the first 3,000 words
    if doc["num_words"] > 3000:
        text = ' '.join(text.split(' ')[:3000])

    try:
        # Parse document text with the module-level `nlp` pipeline
        parsed = nlp(text)

        # Isolate sentence results and dependency strings
        parse_results = [sentence.to_dict() for sentence in parsed.sentences]
        dep_strs = [sentence.dependencies_string() for sentence in parsed.sentences]

        # Construct message
        message = {
            "id": doc_id,
            "text": text,
            "title": title,
            "url": url,
            "date": date,
            "website": president,

            "parse_results": json.dumps(parse_results),
            "dependency_strings": json.dumps(dep_strs),
        }

        # Create S3 key
        key = "parsed/{}/{}/{}.json".format(president, decade, doc_id)

        return message, key

    # Report the failure instead of hiding it. `Exception` (not a bare
    # `except`) lets KeyboardInterrupt / SystemExit propagate.
    except Exception as exc:
        print("Error parsing document {}: {!r}".format(doc_id, exc))
        return 'error', None

In [None]:
message, key = parse_doc(doc)
key

'parsed/Kennedy/1960/210005.json'

In [None]:
# Save output to s3
# Upload the parsed record to S3 under the key built by `parse_doc`
s3_resp = s3.put_object(
    Body=json.dumps(message),
    Bucket=bucket,
    Key=key
    )

# `doc` is the dict decoded from the SQS message body, so its id is read by
# key; attribute access (`doc.id`) would raise AttributeError here.
if s3_resp["ResponseMetadata"]["HTTPStatusCode"] != 200:
    print("Error saving document {}".format(doc["id"]))
else:
    print("Saved to s3 successfully.")

{'ETag': '"efcf73f12eeb04e997871e06ac63dded"',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
   'date': 'Wed, 18 Nov 2020 20:55:11 GMT',
   'etag': '"efcf73f12eeb04e997871e06ac63dded"',
   'server': 'AmazonS3',
   'x-amz-id-2': 'rld/iYMTxTZodgLZYUPndl8+dlWaIE+QaPIq3BA8gkTiUikG3RmpdjMFBoMJ6pqeE4LlEA/2cv4=',
   'x-amz-request-id': 'E29F619CCBA820B0'},
  'HTTPStatusCode': 200,
  'HostId': 'rld/iYMTxTZodgLZYUPndl8+dlWaIE+QaPIq3BA8gkTiUikG3RmpdjMFBoMJ6pqeE4LlEA/2cv4=',
  'RequestId': 'E29F619CCBA820B0',
  'RetryAttempts': 0}}

In [None]:
# Delete document from SQS
# Remove the processed message from the queue using its receipt handle
del_resp = sqs.delete_message(
    QueueUrl = queue_url,
    ReceiptHandle = msg_receipt
)

# `doc` is the dict decoded from the SQS message body, so its id is read by
# key; attribute access (`doc.id`) would raise AttributeError here.
if del_resp["ResponseMetadata"]["HTTPStatusCode"] != 200:
    print("Error deleteing document {}".format(doc["id"]))
else:
    print("Removed from SQS successfully.")

Removed from SQS successfully.


## Invoke Asynchronous Lambdas

In [None]:
VERSION = 32            # version qualifier appended to the function ARN
NUM_PROVISIONED = 10    # number of invocations to fire

# Fire NUM_PROVISIONED invocations of the Lambda function with an empty
# payload, using InvocationType='Event'.
for i in range(NUM_PROVISIONED):
    lambda_client.invoke(
        FunctionName='arn:aws:lambda:us-east-1:654288102859:function:frus-dev-parse:{}'.format(VERSION),
        LogType='None',
        Payload=json.dumps({}),
        InvocationType='Event'
    )

# Report the configured count rather than the loop variable, which is
# undefined (or stale from another cell) when NUM_PROVISIONED is 0.
print("{} lambda functions invoked.".format(NUM_PROVISIONED))

# Remember when this batch was launched; later cells report the elapsed time
last_time = time.time()
last_time_str = time.ctime()
print("Done.")

10 lambda functions invoked.
Done.


In [None]:
print("It has been {1:.0f}min {0:.0f}s since invoking the Lambda functions.".format((time.time() - last_time) % 60, (time.time() - last_time) // 60))

It has been 7min 13s since invoking the Lambda functions.


In [None]:
print("The last batch of Lambda functions were invoked at {}".format(last_time_str))

The last batch of Lambda functions were invoked at Fri Nov 27 19:47:56 2020


In [None]:
# Poll the queue up to 100 times and delete any message whose document
# belongs to "Kennedy"; other messages are left on the queue.
docs_deleted = 0
for i in range(100):
    # Fetch document from SQS
    resp = sqs.receive_message(QueueUrl=queue_url, AttributeNames=['All'])

    # A poll that returns nothing has no "Messages" key; skip it instead of
    # failing with KeyError.
    messages = resp.get("Messages", [])
    if not messages:
        continue

    doc = json.loads(messages[0]["Body"])
    msg_receipt = messages[0]["ReceiptHandle"]

    if doc["website"] == "Kennedy":
        print("Removing doc {}... ({} total)".format(doc["id"], docs_deleted))
        docs_deleted += 1
        del_resp = sqs.delete_message(
                    QueueUrl = queue_url,
                    ReceiptHandle = msg_receipt
                )

# Lambda function

In [6]:
# Put the EFS mount on the module search path before the remaining imports
import os, sys
sys.path.insert(1, '/mnt/efs/')
print("Mounted.")

# Disabled connectivity check, kept for reference:
# import pandas as pd
# pd.read_excel("https://github.com/DesiPilla/college-emails/blob/master/CollegeEmails.xlsx?raw=true")
# print("Connected to internet.")

import io
import json
import logging
import re
import time

import boto3
import pandas as pd
import psutil
print("Imported.")

# The first row of the credentials CSV holds the access key id and the secret key
credentials_row = pd.read_csv('DesiPilla_accessKeys.csv').values[0]
aws_id, aws_secret_key = credentials_row
print("Keys loaded.")

# Resource names are fixed here. Environment-variable alternative, kept for reference:
# bucket = os.environ["bucket"]
# queue_name = os.environ["queueName"]
# queue_url = os.environ["queueUrl"]
bucket = "frus-corenlp"
queue_name = "frus-unparsed"
queue_url = "https://sqs.us-east-1.amazonaws.com/654288102859/frus-unparsed"

# Every client/resource is created with the same explicit credentials and region.
# Alternative without explicit keys, kept for reference:
# s3 = boto3.client('s3')
# s3_resource = boto3.resource('s3')
aws_client_args = {
    "aws_access_key_id": aws_id,
    "aws_secret_access_key": aws_secret_key,
    "region_name": 'us-east-1',
}
s3 = boto3.client('s3', **aws_client_args)
s3_resource = boto3.resource('s3', **aws_client_args)
sqs = boto3.client('sqs', **aws_client_args)
print("s3 and sqs connected.")

Mounted.
Imported.
Keys loaded.
s3 and sqs connected.


In [8]:
# Load English neural pipeline
!pip install stanza
import stanza
nlp = stanza.Pipeline('en', processors="tokenize, pos, lemma, ner, depparse") 
print("CoreNLP loaded.")



Exception: Resources file not found at: C:\Users\desid\stanza_resources\resources.json. Try to download the model again.

In [11]:
def parse_doc(doc, words_to_keep=1600):
    """Parse one document with the global `nlp` pipeline and build its S3 payload.

    Parameters
    ----------
    doc : dict
        Document record. The keys "id", "text", "title", "url", "decade",
        "date", "website" and "num_words" are read.
    words_to_keep : int, optional
        Maximum number of space-separated words of `text` that are parsed
        (default 1600). Longer documents are cut at the matching space.

    Returns
    -------
    tuple
        `(message, key)` on success, where `message` is the dict to store and
        `key` is the S3 object key "parsed/<website>/<decade>/<id>.json".
        `('error', None)` if anything fails while reading or parsing the document.
    """
    try:
        # Extract document information
        doc_id = doc["id"]
        text = doc["text"]
        title = doc["title"]
        url = doc["url"]
        decade = doc["decade"]
        date = doc["date"]
        president = doc["website"]

        # Only keep the first `words_to_keep` words
        if doc["num_words"] > words_to_keep:
            space_positions = [match.start() for match in re.finditer(' ', text)]
            # "num_words" can exceed the number of literal spaces (e.g. words
            # separated by newlines), so only cut when the cut point exists.
            if 0 < words_to_keep <= len(space_positions):
                print("\tTruncating document")
                text = text[:space_positions[words_to_keep - 1]]

        # Parse document text
        parsed = nlp(text)

        # Isolate sentence results and dependency strings
        parse_results = [sentence.to_dict() for sentence in parsed.sentences]
        dep_strs = [sentence.dependencies_string() for sentence in parsed.sentences]

        # Construct message (the nested results are stored as JSON strings)
        message = {
            "id": doc_id,
            "text": text,
            "title": title,
            "url": url,
            "date": date,
            "website": president,

            "parse_results": json.dumps(parse_results),
            "dependency_strings": json.dumps(dep_strs),
        }

        # Create S3 key
        key = "parsed/{}/{}/{}.json".format(president, decade, doc_id)

        return message, key

    # Report the cause, then signal failure with the sentinel the caller checks for
    except Exception as exc:
        print("\tError while parsing document: {!r}".format(exc))
        return 'error', None




def _store_and_acknowledge(message, key, msg_receipt):
    """Save a parsed document to S3, then delete its message from SQS; return True on success."""
    try:
        # Save output to s3
        s3_resp = s3.put_object(
            Body=json.dumps(message),
            Bucket=bucket,
            Key=key
        )

        # Check that results were saved to s3 successfully
        if s3_resp["ResponseMetadata"]["HTTPStatusCode"] != 200:
            print(s3_resp)
            return False

        # Delete document from SQS only after the upload succeeded
        del_resp = sqs.delete_message(
            QueueUrl=queue_url,
            ReceiptHandle=msg_receipt
        )

        # Check that document was deleted from SQS successfully
        if del_resp["ResponseMetadata"]["HTTPStatusCode"] != 200:
            print(del_resp)
            return False

    # A raised client error counts as a failed document instead of ending the run
    except Exception as exc:
        print("\tError saving or acknowledging {}: {!r}".format(key, exc))
        return False

    return True


def handler(event, context):
    """Pull documents from SQS, parse them and store the results in S3.

    Documents are processed one at a time until 1800 seconds (30 minutes) have
    elapsed. A document counts as a success only when it was parsed, uploaded
    to S3 and its message deleted from the queue.

    Parameters
    ----------
    event, context
        Lambda-style arguments; neither is read by this function.

    Returns
    -------
    dict
        {"message": <summary of processed / successful / failed documents>}
    """
    max_runtime_seconds = 1800  # 30 minutes
    start_time = time.time()
    print("Enter handler.")

    # Initialize progress counters
    successes = 0
    errors = 0

    # Number of messages actually fetched (empty polls are not counted)
    counter = 0
    while time.time() - start_time < max_runtime_seconds:

        try:
            # Fetch document from SQS. WaitTimeSeconds enables long polling so an
            # empty queue does not turn this loop into a tight stream of requests.
            resp = sqs.receive_message(
                QueueUrl=queue_url,
                AttributeNames=['All'],
                WaitTimeSeconds=5
            )
        except Exception as exc:
            # A failed request is not an empty queue: report it and back off briefly
            print("Error fetching from SQS: {!r}".format(exc))
            time.sleep(1)
            continue

        messages = resp.get("Messages")
        if not messages:
            print("Empty message recevied.")
            continue

        counter += 1
        try:
            msg_receipt = messages[0]["ReceiptHandle"]
            doc = json.loads(messages[0]["Body"])
            print("Parsing doc {} ({} words, id = {})".format(counter, doc["num_words"], doc["url"]))
        except (ValueError, KeyError, TypeError) as exc:
            # Body is not valid JSON or lacks the expected fields
            print("Skipping malformed message: {!r}".format(exc))
            errors += 1
            continue

        # Parse document and get JSON output
        doc_start = time.time()
        try:
            message, key = parse_doc(doc)
        except Exception as exc:
            # Keep the run alive even if parsing raises instead of returning its sentinel
            message, key = "error: {!r}".format(exc), None
        print("\tParsed in {:.1f} seconds.".format(time.time() - doc_start))

        # Parsing failed: report what came back and move on
        if not key:
            print(key)
            print(message)
            errors += 1
            continue

        if _store_and_acknowledge(message, key, msg_receipt):
            successes += 1
        else:
            errors += 1

    return {
        "message": "{:,} documents were processed.\n{:,} were parsed successfuly and {:,} threw errors.".format(successes + errors, successes, errors)
    }

In [12]:
# Run the handler in the notebook with an empty event and an empty context
event = {}
context = {}
handler(event, context)

Enter handler.
Parsing doc 1 (1814 words, id = 921810)
	Truncating document
	Parsed in 16.1 seconds.
Parsing doc 2 (1840 words, id = 152006)
	Truncating document
	Parsed in 20.3 seconds.
Parsing doc 3 (1860 words, id = 85498)
	Truncating document
	Parsed in 19.5 seconds.
Parsing doc 4 (1719 words, id = 397316)
	Truncating document
	Parsed in 16.6 seconds.
Parsing doc 5 (1879 words, id = 87787)
	Truncating document
	Parsed in 22.8 seconds.
Parsing doc 6 (1820 words, id = 102088)
	Truncating document
	Parsed in 16.0 seconds.
Parsing doc 7 (1712 words, id = 105239)
	Truncating document
	Parsed in 16.2 seconds.
Parsing doc 8 (1865 words, id = 95940)
	Truncating document
	Parsed in 15.7 seconds.
Parsing doc 9 (1726 words, id = 157897)
	Truncating document
	Parsed in 21.4 seconds.
Parsing doc 10 (1865 words, id = 114029)
	Truncating document
	Parsed in 16.7 seconds.
Parsing doc 11 (1856 words, id = 700717)
	Truncating document
	Parsed in 31.3 seconds.
Parsing doc 12 (1874 words, id = 596819)

{'message': '95 documents were processed.\n95 were parsed successfuly and 0 threw errors.'}