### Load Pickled Data

In [17]:
from pathlib import Path
import pandas as pd

# Load the pickled DataFrame from <current working directory>/pickles/parsed.pickle.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that were produced by this project.
df = pd.read_pickle(
    Path.cwd() / "pickles" / "parsed.pickle"
)

### Scratch/Test scripts

In [None]:
from collections import Counter

# Show the column names. A bare `df.columns` in the middle of a cell is never
# displayed (Jupyter only echoes a cell's last expression), so print it.
print(df.columns)

# Tally of adjective lemmas across all documents.
# Counter is a dict subclass, so adj_dict can still be used like a plain dict.
adj_dict = Counter()
# Loop through each row in the df and get its spaCy doc
for index, row in df.iterrows():
    print("-"*30)
    print(row['title'])
    doc = row['parsed']
    # Count the lemma of every token in the doc that is tagged as an adjective
    adj_dict.update(token.lemma_ for token in doc if token.pos_ == "ADJ")

# Convert the tally to a list of (lemma, count) tuples
adj_list = list(adj_dict.items())


print(adj_list)

#### Test subject by verb count

In [None]:
# Smoke-test po.subjects_by_verb_count on every document, using the verb 'run'
import importlib
import PartOne as po

# Re-import PartOne so edits made to the module since the first import are picked up
importlib.reload(po)

# One-row DataFrame (first row only), kept at hand for quick experiments
df_mini = df.iloc[[0]]

for _, record in df.iterrows():
    print(record['title'])
    run_subjects = po.subjects_by_verb_count(record['parsed'], 'run')
    # Print each returned item indented beneath its document title
    for entry in run_subjects:
        print(f"\t {entry}")


### Developing Pointwise mutual information

Use a single-row data frame built from a short dummy text during development, so the results can be checked by hand.

In [95]:
import spacy

# Tiny hand-written corpus so the counts and PMI arithmetic can be verified by hand
data = [{'title': "dummy_text", 'text': 'The cat hears a mouse. The dog hears a noise. The cat hears the dog.'}]

# Load the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Build the one-row frame, add the parsed spaCy doc as a 'parsed' column, and
# take that single row as a Series. df_mini is bound exactly once and is built
# only from `data`, so this cell does not depend on the pickled df.
df_mini = (
    pd.DataFrame(data)
    .assign(parsed=lambda frame: frame['text'].apply(nlp))
    .iloc[0]
)

print(df_mini.head())


title                                            dummy_text
text      The cat hears a mouse. The dog hears a noise. ...
parsed    (The, cat, hears, a, mouse, ., The, dog, hears...
Name: 0, dtype: object


#### Create the Subject-Verb pairs

In [96]:
# Verb lemma whose subjects are counted
verb = 'hear'
# Co-occurrence counts for the dummy document, as returned by PartOne
verb_subject = po.subjects_by_verb_count(df_mini['parsed'], verb)

# Title first, then the counts; one print call with a newline between the two
print(
    df_mini['title'],
    f"1. Counts of Verbs and Subjects:\n\t{verb_subject}\n",
    sep="\n",
)

dummy_text
1. Counts of Verbs and Subjects:
	[{('hear', 'cat'): 2}, {('hear', 'dog'): 1}]



#### Extract unique words from the S-V pairs

In [None]:
# Collect every distinct word that appears in any key of the pair-count dicts.
# Each key is iterated element by element, so all of its members are included.
unique_w = {
    word
    for pair_counts in verb_subject
    for pair in pair_counts
    for word in pair
}

print(f"2. Extract unique words for total counts: \n\t{unique_w}\n")

2. Extract unique words for total counts: 
	{'cat', 'hear', 'dog'}



#### Count total words, and count of individual words

In [None]:
# Tally the alphabetic tokens of the parsed document: overall, and per tracked lemma
total_words = 0
# Start every tracked word at zero so words that are never seen still get an entry
total_existing = {word: 0 for word in unique_w}

for token in df_mini['parsed']:
    # Skip anything that is not purely alphabetic (e.g. punctuation)
    if not token.text.isalpha():
        continue
    total_words += 1
    # Only lemmas that are already tracked are counted individually
    if token.lemma_ in total_existing:
        total_existing[token.lemma_] += 1
print(f"3. Total words and total count for existing words: \n\tTotal words: {total_words}\n\tTotal count for existing:\n\t{total_existing}")
       

the
cat
hear
a
mouse
the
dog
hear
a
noise
the
cat
hear
the
dog
3. Total words and total count for existing words: 
	Total words: 15
	Total count for existing:
	{'cat': 2, 'hear': 3, 'dog': 2}


#### Calculate PPMI for each of the top 10 S-V pairs

In [99]:
import math

# Probability of the verb (context); 0.0 if no words were counted at all
p_c = total_existing[verb] / total_words if total_words else 0.0
ppmi_dict = {}
# Each element of verb_subject is a dict mapping a (verb, subject) pair to its count
for d in verb_subject:
    for key, value in d.items():
        # Joint probability of the verb-subject pair
        p_wc = value / total_words if total_words else 0.0
        # Probability of the subject (word); key[1] is the subject of the pair
        p_w = total_existing[key[1]] / total_words if total_words else 0.0
        if p_wc > 0 and p_w > 0 and p_c > 0:
            # PMI = log2( P(w, c) / (P(w) * P(c)) )
            pmi = math.log2(p_wc / (p_w * p_c))
            # PPMI clips negative PMI values to zero
            ppmi = max(pmi, 0)
        else:
            # A zero count would divide by zero or take log2(0). This can happen
            # when a word of the pair was never tallied in total_existing
            # (its entry starts at 0). Report it instead of crashing and
            # record no positive association.
            print(f"Key: {key}, zero count encountered; PPMI set to 0")
            ppmi = 0
        print(f"Key: {key}, PPMI: {round(ppmi,3)}")
        # Store the rounded score for this pair
        ppmi_dict[key] = round(ppmi, 3)
# Sort by score, highest first. Note: ppmi_dict is rebound here from a dict to
# a list of ((verb, subject), ppmi) tuples.
ppmi_dict = sorted(ppmi_dict.items(), key = lambda item: item[1], reverse = True)

print(ppmi_dict)

Key: ('hear', 'cat'), PPMI: 2.322
Key: ('hear', 'dog'), PPMI: 1.322
[(('hear', 'cat'), 2.322), (('hear', 'dog'), 1.322)]
