In [40]:
import nltk
import spacy
from pathlib import Path
import pandas as pd
import os
from nltk.tokenize import word_tokenize, sent_tokenize
import string
import pickle
from collections import Counter
import math




nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2000000
                             
        
                             
    

In [41]:
def read_novels(path=Path.cwd() / "p1-texts" / "novels"):
    data = []
    for file in path.glob("*.txt"):
        title_name = file.stem
        parts = title_name.split("-")

        if len(parts) >= 3:
            title = ' '.join(part.replace('_', ' ') for part in parts[:-2])
            author = parts[-2].replace('_', ' ')
            year = parts[-1]

            with file.open(encoding='utf-8') as f:
                text = f.read()

            data.append({
                'text': text,
                'title': title,
                'author': author,
                'year': int(year)  
            })

    df = pd.DataFrame(data)

    df = df.sort_values('year').reset_index(drop=True)

    return df

In [42]:
def nltk_ttr(text):
    tokens = word_tokenize(text)

    words = [token.lower() for token in tokens if token.isalpha()]
    if not words:
        return 0

    types = set(words)
    ttr = len(types) / len(words)
    return ttr    

In [43]:
def get_ttrs(df):
    """helper function to add ttr to a dataframe"""
    results = {}
    for i, row in df.iterrows():
        results[row["title"]] = nltk_ttr(row["text"])
    return results

In [44]:
def count_syl(word, d):
    word = word.lower()
    if word in d:
        return len([pron for pron in d[word][0] if pron[-1].isdigit()])
    
    vowels = 'aeiouy'
    syllable_count = 0
    prev_letter_was_vowel = False

    for letter in word:
        if letter in vowels and not prev_letter_was_vowel:
            syllable_count += 1
            prev_letter_was_vowel = True
        else:
            prev_letter_was_vowel = False

    return max(1, syllable_count)

In [45]:
def fk_level(text, d):
    sentences = sent_tokenize(text)
    words = []
    for sentence in sentences:
        words.extend([word.lower() for word in word_tokenize(sentence) if word not in string.punctuation and word.isalpha()])
        
    if not words or not sentences:
        return 0

    total_syllables = sum(count_syl(word, d) for word in words)
    total_words = len(words)
    total_sentences = len(sentences)

    fk_score = (0.39 * (total_words / total_sentences) + 11.8 * (total_syllables / total_words) - 15.59)

    return fk_score

In [46]:
def get_fks(df):
    """helper function to add fk scores to a dataframe"""
    results = {}
    cmudict = nltk.corpus.cmudict.dict()
    for i, row in df.iterrows():
        results[row["title"]] = round(fk_level(row["text"], cmudict), 4)
    return results

In [47]:
def parse(df, store_path=Path.cwd() / "pickle", out_name="parsed.pickle"):
    store_path.mkdir(parents=True, exist_ok=True)
    output_file = store_path / out_name

    df['doc'] = df['text'].apply(lambda x: nlp(x))

    with open(output_file, 'wb') as f:
        pickle.dump(df, f)
    print(f"Saved parsed DataFrame to {output_file}")

    return df

In [51]:
print(os.listdir(path.cwd() / "pickle"))

['parsed.pickle']


In [53]:
df = pd.read_pickle(Path.cwd() / "pickle" / "parsed.pickle")
print(df.columns)

Index(['text', 'title', 'author', 'year', 'doc'], dtype='object')


In [54]:
def adjective_counts(doc):
    adjectives = [token.text.lower() for token in doc if token.pos_ == "ADJ"]

    return Counter(adjectives).most_common(10)

In [55]:
def subjects_by_verb_count(doc, verb):
    subject_counter = Counter()

    for token in doc:
        if token.lemma_.lower() == verb and token.pos_ == "VERB":
            subject_counter.update(
                child.lemma_lower()
                for child in token.children
                if child.dep_ == "nsubj"
            )

    return subject_counter.most_common(10)
        

In [56]:
def subjects_by_verb_pmi(doc, target_verb):
    verb = target_verb.lower()

    subj_verb_counts = Counter()
    subj_counts = Counter()
    verb_counts = 0

    for token in doc:
        if token.pos_ == "VERB" and token.lemma_.lower() == verb:
            verb_counts += 1
            for child in token.children:
                if child.dep_ == "nsubj":
                    subj = child.lemma_.lower()
                    subj_verb_counts[subj] += 1

    for token in doc:
        if token.pos_ in ("NOUN", "PROPN", "PRON"):
            subj_counts[token.lemma_.lower()] += 1

    total_tokens = len(doc)

    pmi_scores = {}
    for subj in subj_verb_counts:
        p_subj_verb = subj_verb_counts[subj] / total_tokens
        p_subj = subj_counts[subj] / total_tokens
        p_verb = verb_counts / total_tokens
        if p_subj > 0 and p_verb > 0 and p_subj_verb > 0:
            pmi = math.log2(p_subj_verb / (p_subj * p_verb))
            pmi_scores[subj] = pmi

    return sorted(pmi_scores.items(), key=lambda x: x[1], reverse=True)[:10]
    

In [57]:
if __name__ == "__main__":
    """
    uncomment the following lines to run the functions once you have completed them
    """
    path = Path.cwd() / "p1-texts" / "novels"
    print(path)
    df = read_novels(path) # this line will fail until you have completed the read_novels function above.
    print(df.head())
    nltk.download("cmudict")
    nltk.download("punkt")
    #parse(df)
    #print(df.head())
    print(get_ttrs(df))
    print(get_fks(df))
    df = pd.read_pickle(Path.cwd() / "pickle" /"parsed.pickle")
    
    for i, row in df.iterrows():
        print(row["title"])
        print(adjective_counts(row["doc"]))
        print("\n")       
    
    for i, row in df.iterrows():
        print(row["title"])
        print(subjects_by_verb_count(row["doc"], "hear"))
        print("\n")
    
    for i, row in df.iterrows():
        print(row["title"])
        print(subjects_by_verb_pmi(row["doc"], "hear"))
        print("\n")
    

C:\Users\zaman\OneDrive\nlp-coursework-2024-25-zamanfa-1-main\nlp-coursework-2024-25-zamanfa-1-1\p1-texts\novels
                                                text                  title  \
0  \nCHAPTER 1\n\nThe family of Dashwood had long...  Sense and Sensibility   
1  'Wooed and married and a'.'\n'Edith!' said Mar...        North and South   
2  Book the First--Recalled to Life\n\n\n\n\nI. T...   A Tale of Two Cities   
3  SAMUEL BUTLER.\nAugust 7, 1901\n\nCHAPTER I: W...                Erewhon   
4  THE AMERICAN\n\nby Henry James\n\n\n1877\n\n\n...           The American   

    author  year  
0   Austen  1811  
1  Gaskell  1855  
2  Dickens  1858  
3   Butler  1872  
4    James  1877  


[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\zaman\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zaman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'Sense and Sensibility': 0.052847302442989776, 'North and South': 0.0549040694681204, 'A Tale of Two Cities': 0.07072694469399422, 'Erewhon': 0.09151270564132943, 'The American': 0.06381607058523676, 'Dorian Gray': 0.08355234620193412, 'Tess of the DUrbervilles': 0.07778957979554696, 'The Golden Bowl': 0.047475476259872806, 'The Secret Garden': 0.05847231570812455, 'Portrait of the Artist': 0.10472745625841184, 'The Black Moth': 0.07866588875923765, 'Orlando': 0.1137245917497168, 'Blood Meridian': 0.08568897067593587}
{'Sense and Sensibility': 10.898, 'North and South': 6.6594, 'A Tale of Two Cities': 9.8466, 'Erewhon': 14.6827, 'The American': 7.9951, 'Dorian Gray': 4.9526, 'Tess of the DUrbervilles': 7.6353, 'The Golden Bowl': 12.4474, 'The Secret Garden': 4.6623, 'Portrait of the Artist': 6.454, 'The Black Moth': 4.2235, 'Orlando': 9.5473, 'Blood Meridian': 5.6416}
Sense and Sensibility
[('own', 267), ('such', 202), ('more', 196), ('other', 185), ('good', 153), ('great', 148), ('li

AttributeError: 'spacy.tokens.token.Token' object has no attribute 'lemma_lower'