In [None]:
#Re-assessment template 2025

# Note: The template functions here and the dataframe format for structuring your solution is a suggested but not mandatory approach. You can use a different approach if you like, as long as you clearly answer the questions and communicate your answers clearly.

import nltk
import spacy
import pandas as pd
import re
import pickle


from pathlib import Path
from nltk.tokenize import word_tokenize, sent_tokenize
from string import punctuation
from nltk.corpus import cmudict
from collections import Counter


nltk.download('punkt')
nltk.download('cmudict')

[nltk_data] Downloading package punkt to /Users/jimena/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package cmudict to /Users/jimena/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [50]:
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2000000

In [65]:
def read_novels(path=Path.cwd() / "p1-texts" / "novels"):
    """Reads texts from a directory of .txt files and returns a DataFrame with the text, title,
    author, and year"""
    novels_dataset = []

    for file in path.glob("*.txt"):
        name = file.stem
        title, author, year = name.split("-")

        with open(file, encoding="utf-8") as txt:
            text = txt.read()

        novels_dataset.append({
            "text": text,
            "title": title,
            "author": author,
            "year": year
        })
    df = pd.DataFrame(novels_dataset, columns=["text", "title", "author", "year"])

    df = df.sort_values(by="year").reset_index(drop=True)

    return df

# df = read_novels()
# print(df)

In [52]:
def nltk_ttr(text):
    """Calculates the type-token ratio of a text. Text is tokenized using nltk.word_tokenize."""
    ttr_dict = {}

    for _, row in df.iterrows():
        text = row['text']
        title = row['title']

        tokens = word_tokenize(text)

        tokens = [token.lower() for token in tokens if token not in punctuation]

        types = set(tokens)
        ttr = len(types) / len(tokens)

        ttr_dict[title] = ttr

    return ttr_dict

df = read_novels()
ttr_by_novel = nltk_ttr(df)
print(ttr_by_novel)
#TTR sorted in descending order. Shows grade of lexical diversity within a text.

sorted_ttr = dict(sorted(ttr_by_novel.items(), key=lambda item: item[1], reverse=True))
print(sorted_ttr)


{'Sense_and_Sensibility': 0.05850947122065384, 'North_and_South': 0.06580904022624899, 'A_Tale_of_Two_Cities': 0.07287318093056312, 'Erewhon': 0.09698006527621059, 'The_American': 0.06756581683792795, 'Dorian_Gray': 0.08498209005457125, 'Tess_of_the_DUrbervilles': 0.08015054009113641, 'The_Golden_Bowl': 0.04813433820231636, 'The_Secret_Garden': 0.056573232870154, 'Portrait_of_the_Artist': 0.10970434538631502, 'The_Black_Moth': 0.07621856866537717, 'Orlando': 0.11753228191839728, 'Blood_Meridian': 0.08562941459421598}
{'Orlando': 0.11753228191839728, 'Portrait_of_the_Artist': 0.10970434538631502, 'Erewhon': 0.09698006527621059, 'Blood_Meridian': 0.08562941459421598, 'Dorian_Gray': 0.08498209005457125, 'Tess_of_the_DUrbervilles': 0.08015054009113641, 'The_Black_Moth': 0.07621856866537717, 'A_Tale_of_Two_Cities': 0.07287318093056312, 'The_American': 0.06756581683792795, 'North_and_South': 0.06580904022624899, 'Sense_and_Sensibility': 0.05850947122065384, 'The_Secret_Garden': 0.05657323287

In [None]:
cmu_dict = cmudict.dict()

def dict_of_syl():
    """Returns a dictionary mapping words to their syllable counts based on the CMU dictionary"""
    cmu = cmudict.dict()
    syl_dict = {}

    for word, pronunciations in cmu.items():
        # Vowel sounds in CMU have numbers at the end. If that is True count 1 syllable. 
        syllable_counts = [sum(phoneme[-1].isdigit() for phoneme in pron)
        for pron in pronunciations]
        # A word may have multiple pronunciations. Get the simplest form.
        syl_dict[word.lower()] = min(syllable_counts)

    return syl_dict

d = dict_of_syl()

# print(d['novels'])


2


In [None]:
def count_syl(word, d):
    """Counts the number of syllables in a word given a dictionary of syllables per word.
    if the word is not in the dictionary, syllables are estimated by counting vowel clusters

    Args:
        word (str): The word to count syllables for.
        d (dict): A dictionary of syllables per word.

    Returns:
        int: The number of syllables in the word.
    """
    word = word.lower()

    if word in d:
        return d[word]

    # Syllables by counting vowel clusters. 'Y' can sometimes act as a vowel in English
    syl_by_vowels = len(re.findall(r'[aeiouy]+', word))

    return syl_by_vowels

In [68]:
def fk_level(text, d):
    """Returns the Flesch-Kincaid Grade Level of a text (higher grade is more difficult).
    Requires a dictionary of syllables per word.

    Args:
        text (str): The text to analyze.
        d (dict): A dictionary of syllables per word.

    Returns:
        float: The Flesch-Kincaid Grade Level of the text. (higher grade is more difficult)
    """
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    # Filter out punctuation tokens
    words = [word for word in words if re.search(r'\w', word)]

    num_sentences = len(sentences)
    num_words = len(words)
    # Uses the count_syl function
    num_syllables = sum(int(count_syl(word, d)) for word in words)


    # According to the Flesch-Kincaid Grade Level formula
    fk_grade = 0.39 * (num_words / num_sentences) + 11.8 * (num_syllables / num_words) - 15.59
    return fk_grade

# text = "This is an example of the Flesch-Kincaid Grade Level. It estimates the school grade needed to understand a text."
# fk = fk_level(text, d)
# print(fk)

In [56]:
def flesch_kincaid(df, d):
    """Returns a dictionary mapping novel titles to their Flesch-Kincaid grade level.
    Args:
        df: DataFrame including 'title' and 'text' columns.
        d: Dictionary of syllables per word.
    Returns:
        dict: {title: flesch_kincaid_grade}
    """
    fk_grade = {}

    for _, row in df.iterrows():
        title = row['title']
        text = row['text']
        fk_grade[title] = fk_level(text, d)

    return fk_grade

""""
Example:
df = read_novels()
d = dict_of_syl()
fk_grades = flesch_kincaid(df, d)
# Sorted by highest to lowest grade
for title, grade in sorted(fk_grades.items(), key=lambda x: x[1], reverse=True):
    print(f"{title}: {grade:.2f}")
"""

'"\nExample:\ndf = read_novels()\nd = dict_of_syl()\nfk_grades = flesch_kincaid(df, d)\n# Sorted by highest to lowest grade\nfor title, grade in sorted(fk_grades.items(), key=lambda x: x[1], reverse=True):\n    print(f"{title}: {grade:.2f}")\n'

In [57]:
# Check the longest text length in our dataset
df["text_length"] = df["text"].apply(len)
max_length = df["text_length"].max()
print(f"Maximun text length: {max_length} characters")


Maximun text length: 1158935 characters


In [58]:
def parse(df, store_path=Path.cwd() / "pickles", out_name="parsed.pickle"):
    """Parses the text of a DataFrame using spaCy, stores the parsed docs as a column and writes 
    the resulting  DataFrame to a pickle file"""

    nlp = spacy.load("en_core_web_sm")

    max_length = df["text"].apply(len).max()

    # Add nlp.max_length after cheking the longest text in the dataset (1158935 char) nlp.max_length default 1000000.
    if max_length > nlp.max_length:
        nlp.max_length = max_length + 500
    
    df["parsed"] = df["text"].apply(nlp)
    
    pickle_path = store_path / out_name
    with open(pickle_path, "wb") as f:
        pickle.dump(df, f)
    
    return df


""" Check printing for longest text
longest_text = df.loc[df["text_length"].idxmax(), "text"]
parsed_lt = nlp(longest_text)
print(f"Longest text number of tokens: {len(parsed_lt)}")
"""


' Check printing for longest text\nlongest_text = df.loc[df["text_length"].idxmax(), "text"]\nparsed_lt = nlp(longest_text)\nprint(f"Longest text number of tokens: {len(parsed_lt)}")\n'

In [59]:
store_path = Path.cwd() / "pickles"
df = parse(df)  
df = pd.read_pickle(store_path / "parsed.pickle")

In [None]:
def get_ttrs(df):
    """helper function to add ttr to a dataframe"""
    results = {}
    for i, row in df.iterrows():
        results[row["title"]] = nltk_ttr(row["text"])
    return results

In [78]:
d = dict_of_syl()

def get_fks(df):
    """helper function to add fk scores to a dataframe"""
    results = {}
    for i, row in df.iterrows():
        results[row["title"]] = round(fk_level(row["text"], d), 4)
    return results

In [None]:
def common_objects(doc, n=10):
    """returns a list of the ten most common syntactic objects overall in the text"""
    objects = [token.text.lower() for token in doc if token.dep_ == "dobj"]
    return Counter(objects).most_common(n)

# print(common_objects(df.loc[0, "parsed"], 10))

In [None]:
def subjects_by_verb_pmi(doc, target_verb):
    """Extracts the most common subjects of a given verb in a parsed document. Returns a list."""
    pass

In [None]:
def subjects_by_verb_count(doc, verb):
    """Extracts the most common subjects of a given verb in a parsed document. Returns a list."""
    pass


In [None]:
def adjective_counts(doc):
    """Extracts the most common adjectives in a parsed document. Returns a list of tuples."""
    pass

In [None]:
if __name__ == "__main__":
    """
    uncomment the following lines to run the functions once you have completed them
    """
    path = Path.cwd() / "p1-texts" / "novels"
    print(path)
    df = read_novels(path) # this line will fail until you have completed the read_novels function above.
    print(df.head())
    nltk.download("cmudict")
    parse(df)
    print(df.head())
    print(get_ttrs(df))
    print(get_fks(df))
    df = pd.read_pickle(Path.cwd() / "pickles" /"parsed.pickle")
    # print(adjective_counts(df))
    """ 
    for i, row in df.iterrows():
        print(row["title"])
        print(subjects_by_verb_count(row["parsed"], "hear"))
        print("\n")

    for i, row in df.iterrows():
        print(row["title"])
        print(subjects_by_verb_pmi(row["parsed"], "hear"))
        print("\n")
    """

/Users/jimena/NLP/nlp-coursework-2024-25-jimenajauregui-1/p1-texts/novels
                                                text                  title  \
0  \nCHAPTER 1\n\nThe family of Dashwood had long...  Sense_and_Sensibility   
1  'Wooed and married and a'.'\n'Edith!' said Mar...        North_and_South   
2  Book the First--Recalled to Life\n\n\n\n\nI. T...   A_Tale_of_Two_Cities   
3  SAMUEL BUTLER.\nAugust 7, 1901\n\nCHAPTER I: W...                Erewhon   
4  THE AMERICAN\n\nby Henry James\n\n\n1877\n\n\n...           The_American   

    author  year  
0   Austen  1811  
1  Gaskell  1855  
2  Dickens  1858  
3   Butler  1872  
4    James  1877  


[nltk_data] Downloading package cmudict to /Users/jimena/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


                                                text                  title  \
0  \nCHAPTER 1\n\nThe family of Dashwood had long...  Sense_and_Sensibility   
1  'Wooed and married and a'.'\n'Edith!' said Mar...        North_and_South   
2  Book the First--Recalled to Life\n\n\n\n\nI. T...   A_Tale_of_Two_Cities   
3  SAMUEL BUTLER.\nAugust 7, 1901\n\nCHAPTER I: W...                Erewhon   
4  THE AMERICAN\n\nby Henry James\n\n\n1877\n\n\n...           The_American   

    author  year                                             parsed  
0   Austen  1811  (\n, CHAPTER, 1, \n\n, The, family, of, Dashwo...  
1  Gaskell  1855  (', Wooed, and, married, and, a, ', ., ', \n, ...  
2  Dickens  1858  (Book, the, First, --, Recalled, to, Life, \n\...  
3   Butler  1872  (SAMUEL, BUTLER, ., \n, August, 7, ,, 1901, \n...  
4    James  1877  (THE, AMERICAN, \n\n, by, Henry, James, \n\n\n...  
{'Sense_and_Sensibility': {'Sense_and_Sensibility': 0.05850947122065384, 'North_and_South': 0.06580904022