## Part One

### Load Pickled Data

In [13]:
from pathlib import Path
import pandas as pd

# Location of the pickled frame, relative to the directory the kernel was started in.
pickle_path = Path.cwd() / "pickles" / "parsed.pickle"
# NOTE(review): unpickling executes whatever the file contains - only load pickles you created yourself.
df = pd.read_pickle(pickle_path)

### Scratch/Test scripts

In [None]:
## Get column names
# (not the last expression of the cell, so nothing is displayed for it)
df.columns
# Running tally: adjective lemma -> number of occurrences across all rows
adj_dict = {}
# Walk every row of the frame; the 'parsed' column holds the spaCy doc
for _, novel in df.iterrows():
    print("-"*30)
    print(novel['title'])
    # Keep only the lemmas of tokens tagged as adjectives in this doc
    adjective_lemmas = [tok.lemma_ for tok in novel['parsed'] if tok.pos_ == "ADJ"]
    for lemma in adjective_lemmas:
        adj_dict[lemma] = adj_dict.get(lemma, 0) + 1

# Convert the dictionary to a list of (lemma, count) tuples
adj_list = list(adj_dict.items())

print(adj_list)

#### Test subject by verb count

In [None]:
# Test subject_by_verb_count
import importlib
import PartOne as po

# Re-import PartOne so edits made to the module are picked up without restarting the kernel
importlib.reload(po)

# One-row frame for quick experiments.
# NOTE(review): the loop below still iterates the full `df`, not `df_mini` - confirm which was intended.
df_mini = df.iloc[[0]]

# For every row, show what subjects_by_verb_count returns for the verb 'run'
for i, row in df.iterrows():
    print(row['title'])
    run_pairs = po.subjects_by_verb_count(row['parsed'], 'run')
    for pair in run_pairs:
        print(f"\t {pair}")


### Developing Pointwise mutual information

Use only one row of the data frame during development.

In [3]:
import spacy

# Toy corpus used to develop the PMI computation on a text small enough to check by hand.
data = [{'title': "dummy_text", 'text': 'The cat hears a mouse. The dog hears a noise. The cat hears the dog.'}]

# Build the one-row frame under its own name. The earlier `df_mini = df.iloc[0]` was a dead
# store (overwritten before any use) and made this cell depend on `df` for no reason.
toy_df = pd.DataFrame(data)

# Load the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Add the parse: run the pipeline over the raw text column
toy_df['parsed'] = toy_df['text'].apply(nlp)

# `df_mini` is the single row (a Series), indexed by the following cells as
# df_mini['parsed'] and df_mini['title'].
df_mini = toy_df.iloc[0]

print(df_mini.head())


title                                            dummy_text
text      The cat hears a mouse. The dog hears a noise. ...
parsed    (The, cat, hears, a, mouse, ., The, dog, hears...
Name: 0, dtype: object


#### Create the Subject-Verb pairs

In [96]:
# Verb whose subjects are being counted; reused by the PPMI cell further down
verb = 'hear'
# Co-occurrence counts for (verb, subject) pairs in the development document
toy_doc = df_mini['parsed']
verb_subject = po.subjects_by_verb_count(toy_doc, verb)

print(df_mini['title'])
print(f"1. Counts of Verbs and Subjects:\n\t{verb_subject}\n")

dummy_text
1. Counts of Verbs and Subjects:
	[{('hear', 'cat'): 2}, {('hear', 'dog'): 1}]



#### Extract unique words from the S-V pairs

In [None]:
# Flatten every (verb, subject) key of every dict in `verb_subject` into one set of words
unique_w = {
    word
    for pair_counts in verb_subject
    for pair in pair_counts.keys()
    for word in pair
}

print(f"2. Extract unique words for total counts: \n\t{unique_w}\n")

2. Extract unique words for total counts: 
	{'cat', 'hear', 'dog'}



#### Count total words, and count of individual words

In [None]:
# Tally the alphabetic tokens of the document, and per-lemma counts for the words of interest
total_words = 0
# One zero-initialised counter per word collected from the S-V pairs
total_existing = dict.fromkeys(unique_w, 0)

for token in df_mini['parsed']:
    # Tokens whose text is not purely alphabetic (punctuation, numbers, ...) are not counted
    if not token.text.isalpha():
        continue
    total_words += 1
    # Matching is done on the lemma, while the filter above looks at the surface text
    word_lemma = token.lemma_
    if word_lemma in total_existing:
        total_existing[word_lemma] += 1
print(f"3. Total words and total count for existing words: \n\tTotal words: {total_words}\n\tTotal count for existing:\n\t{total_existing}")
       

the
cat
hear
a
mouse
the
dog
hear
a
noise
the
cat
hear
the
dog
3. Total words and total count for existing words: 
	Total words: 15
	Total count for existing:
	{'cat': 2, 'hear': 3, 'dog': 2}


#### Calculate PPMI for each of the top 10 S-V pairs

In [99]:
import math
# PPMI for every (verb, subject) pair in `verb_subject`:
#     PMI(w, c) = log2( P(w, c) / (P(w) * P(c)) ),   PPMI = max(PMI, 0)
# where c is the verb, w is the subject, and every probability is a count divided by
# `total_words`.
#
# Guards added over the original:
#   * `.get(..., 0)` - `verb` is only a key of `total_existing` when `verb_subject` is
#     non-empty, so a plain lookup could raise KeyError.
#   * zero checks   - a zero `total_words` or zero marginal count would raise
#     ZeroDivisionError, and log2 of a non-positive ratio raises ValueError.
# Pairs whose PPMI is undefined are reported and skipped instead of aborting the cell.

# Probability of the verb (context)
p_c = total_existing.get(verb, 0) / total_words if total_words else 0.0

# Scores are collected under their own name so `ppmi_dict` is only ever bound to the
# final sorted list (the original rebound it from a dict to a list).
ppmi_scores = {}
for d in verb_subject:
    for key, value in d.items():
        # Probability of the verb-subject pair
        p_wc = value / total_words if total_words else 0.0
        # Probability of the subject (word); key is (verb, subject)
        p_w = total_existing.get(key[1], 0) / total_words if total_words else 0.0

        if p_wc <= 0 or p_w <= 0 or p_c <= 0:
            print(f"Key: {key}, PPMI undefined (zero count) - skipped")
            continue

        pmi = math.log2(p_wc / (p_w * p_c))
        ppmi = max(pmi, 0)
        print(f"Key: {key}, PPMI: {round(ppmi,3)}")
        # Keep the first score seen for a key, as the original get()-based assignment did
        ppmi_scores.setdefault(key, round(ppmi, 3))

# Sort by score, highest first; the result is a list of ((verb, subject), ppmi) tuples
ppmi_dict = sorted(ppmi_scores.items(), key=lambda item: item[1], reverse=True)

print(ppmi_dict)

Key: ('hear', 'cat'), PPMI: 2.322
Key: ('hear', 'dog'), PPMI: 1.322
[(('hear', 'cat'), 2.322), (('hear', 'dog'), 1.322)]


### Counting all syntactic objects

Understanding dep_ tags

In [18]:
import spacy

# Load the small English pipeline and list the labels registered for its 'parser' component
nlp = spacy.load("en_core_web_sm")

parser_labels = nlp.pipe_labels['parser']
print(parser_labels)


['ROOT', 'acl', 'acomp', 'advcl', 'advmod', 'agent', 'amod', 'appos', 'attr', 'aux', 'auxpass', 'case', 'cc', 'ccomp', 'compound', 'conj', 'csubj', 'csubjpass', 'dative', 'dep', 'det', 'dobj', 'expl', 'intj', 'mark', 'meta', 'neg', 'nmod', 'npadvmod', 'nsubj', 'nsubjpass', 'nummod', 'oprd', 'parataxis', 'pcomp', 'pobj', 'poss', 'preconj', 'predet', 'prep', 'prt', 'punct', 'quantmod', 'relcl', 'xcomp']


In [14]:
# Every distinct dependency tag that appears on any token of any parsed document
synt_obj = {token.dep_ for doc in df['parsed'] for token in doc}

print('-----------')
print(synt_obj)


-----------
{'nsubjpass', 'nmod', 'compound', 'prep', 'oprd', 'dative', 'conj', 'attr', 'preconj', 'mark', 'relcl', 'ccomp', 'nsubj', 'det', 'appos', 'auxpass', 'pcomp', 'dep', 'parataxis', 'advmod', 'quantmod', 'advcl', 'neg', 'acl', 'punct', 'prt', 'intj', 'dobj', 'case', 'predet', 'acomp', 'pobj', 'npadvmod', 'xcomp', 'ROOT', 'csubj', 'aux', 'cc', 'csubjpass', 'agent', 'meta', 'nummod', 'expl', 'amod', 'poss'}


In [17]:
# Print spaCy's explanation of each dependency tag; tags whose name contains 'obj'
# are framed with a long row of asterisks, all others with a short row of dashes.
for t in synt_obj:
    exp = spacy.explain(t)
    rule = "*"*50 if 'obj' in t else "-"*20
    print(rule)
    print(f"{t}: {exp}")
    print(rule)

--------------------
nsubjpass: nominal subject (passive)
--------------------
--------------------
nmod: modifier of nominal
--------------------
--------------------
compound: compound
--------------------
--------------------
prep: prepositional modifier
--------------------
--------------------
oprd: object predicate
--------------------
--------------------
dative: dative
--------------------
--------------------
conj: conjunct
--------------------
--------------------
attr: attribute
--------------------
--------------------
preconj: pre-correlative conjunction
--------------------
--------------------
mark: marker
--------------------
--------------------
relcl: relative clause modifier
--------------------
--------------------
ccomp: clausal complement
--------------------
--------------------
nsubj: nominal subject
--------------------
--------------------
det: determiner
--------------------
--------------------
appos: appositional modifier
--------------------
--------------

Loop that counts all syntactic objects

In [None]:
# Lemma counts for tokens whose dependency explanation mentions "object"
syntatic_object = {}
# Lemma counts for tokens whose dependency tag has no explanation at all
none_count = {}
for doc in df['parsed']:
    for token in doc:
        # Human-readable description of the token's dependency tag (None if unknown)
        dep_explained = spacy.explain(token.dep_)
        # Decide which tally this token belongs to; all other tokens are ignored
        if dep_explained is None:
            tally = none_count
        elif "object" in dep_explained:
            tally = syntatic_object
        else:
            continue
        # Count by lemma
        word_lemma = token.lemma_
        tally[word_lemma] = tally.get(word_lemma, 0) + 1

print("========= With Dependencies =========")
for key, value in syntatic_object.items():
    print(f"Key {key} Value {value}")

print("\n========= Without Dependencies (None) =========")
for key, value in none_count.items():
    print(f"Key {key} Value {value}")

Test that the logic works

In [19]:
import importlib
import PartOne as po
import spacy
# Re-import PartOne so recent edits to the module are active
importlib.reload(po)

# Short sentence with a handful of objects that can be counted by eye
text = 'the cat chased the mouse and caught the mouse, but the dog chased the cat and found it in the garden.'

nlp = spacy.load("en_core_web_sm")
text_nlp = nlp(text)
# Dependency tags treated as syntactic objects
object_tags = ['dobj', 'iobj', 'oprd', 'obj', 'pobj']
syntatic_object = {}

# Lemmas of the tokens whose dependency tag is one of the object tags, in document order
object_lemmas = (token.lemma_ for token in text_nlp if token.dep_ in object_tags)
for word_lemma in object_lemmas:
    syntatic_object[word_lemma] = syntatic_object.get(word_lemma, 0) + 1

print(syntatic_object)

[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/enmanuelmoreno/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


{'mouse': 2, 'cat': 1, 'it': 1, 'garden': 1}


In [33]:
import importlib
import PartOne as po

# Re-import PartOne so recent edits to the module are active
importlib.reload(po)

# Print each title followed by what po.count_obj returns for that row's parsed document
for title, parsed_doc in zip(df['title'], df['parsed']):
    print(title)
    print(po.count_obj(parsed_doc))

[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/enmanuelmoreno/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


Sense_and_Sensibility
[{'it': 706}, {'she': 686}, {'he': 517}, {'they': 405}, {'I': 340}, {'you': 330}, {'which': 251}, {'what': 210}, {'time': 204}, {'herself': 192}]
North_and_South
[{'she': 884}, {'it': 878}, {'he': 815}, {'I': 587}, {'you': 443}, {'which': 415}, {'what': 389}, {'they': 370}, {'time': 319}, {'Margaret': 242}]
A_Tale_of_Two_Cities
[{'he': 898}, {'it': 859}, {'you': 436}, {'I': 430}, {'they': 371}, {'she': 349}, {'hand': 295}, {'which': 222}, {'time': 201}, {'himself': 186}]
Erewhon
[{'which': 415}, {'I': 373}, {'it': 358}, {'they': 307}, {'he': 185}, {'time': 135}, {'that': 121}, {'one': 109}, {'what': 108}, {'we': 101}]
The_American
[{'it': 876}, {'you': 721}, {'he': 712}, {'I': 657}, {'she': 565}, {'what': 326}, {'that': 289}, {'they': 275}, {'Newman': 237}, {'hand': 232}]
Dorian_Gray
[{'he': 607}, {'it': 472}, {'I': 439}, {'you': 356}, {'that': 261}, {'life': 193}, {'they': 187}, {'what': 180}, {'she': 178}, {'thing': 131}]
Tess_of_the_DUrbervilles
[{'she': 862}, 

In [10]:
import importlib
import PartOne as po
import spacy
# Re-import PartOne so recent edits to the module are active
importlib.reload(po)

# Same hand-countable sentence as in the inline logic check
text = 'the cat chased the mouse and caught the mouse, but the dog chased the cat and found it in the garden.'

# Parse the sentence with the small English pipeline
nlp = spacy.load("en_core_web_sm")
text_nlp = nlp(text)

# Run the module's implementation on the parsed sentence and show its result
count_dict = po.count_obj(text_nlp)
print(count_dict)

[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/enmanuelmoreno/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


[{'mouse': 2}, {'cat': 1}, {'it': 1}, {'garden': 1}]


## Part Two

### Load Data

In [None]:
import PartTwo as pt
import importlib
# Re-import PartTwo so recent edits to the module are active
importlib.reload(pt)

# Load the speeches through the module's reader
speeches_df = pt.read_csv()

# Last expression of the cell: rich display of the first two rows
speeches_df.head(2)


Unnamed: 0,speech,party,constituency,date,speech_class,major_heading,year,speakername
0,"Unemployment is soaring, uptake in benefits ha...",Labour,Portsmouth South,2020-09-14,Speech,Work and Pensions,2020,Stephen Morgan
1,I thank the hon. Gentleman for raising issues ...,Conservative,Mid Sussex,2020-09-14,Speech,Work and Pensions,2020,Mims Davies
