In [None]:
#imported libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import cmudict
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
import spacy
from pathlib import Path
import pandas as pd
import string
import re
import pickle
from collections import Counter
# Load spaCy's small English pipeline once, at module level; parse() below
# applies this shared `nlp` object to every text.
nlp = spacy.load("en_core_web_sm")
# Raise the pipeline's max_length setting to 20,000,000 — presumably so that a
# whole novel can be passed to `nlp` in a single call (TODO confirm against
# spaCy's default limit and the size of the longest text in the corpus).
nlp.max_length = 20000000 


In [2]:
# Part A: Read novels
def read_novels(path=Path.cwd() / "texts" / "novels"):
    """Read every ``*.txt`` file in a directory into a DataFrame of novels.

    File names must follow ``<title>-<author>-<year>.txt``. Underscores in the
    title part are turned into spaces. The name is split from the right, so a
    title may itself contain hyphens (``Jude_the-Obscure-Hardy-1895.txt``).

    Args:
        path (Path | str): Directory that holds the ``.txt`` files. Only files
            directly inside it are read (no recursion).

    Returns:
        pd.DataFrame: One row per file with the columns ``text``, ``title``,
        ``author`` and ``year`` (int), sorted by ascending ``year`` with a
        fresh 0..n-1 index. An empty directory gives an empty frame that still
        has the four columns.

    Raises:
        ValueError: If a file name does not contain an author and an integer
            year separated by hyphens.
    """
    records = []

    # glob() yields files in a filesystem-dependent order; sorting them (and
    # using a stable sort below) keeps novels that share a year in a
    # reproducible order across machines.
    for file in sorted(Path(path).glob("*.txt")):
        try:
            # Split from the right: the last two fields are always author and
            # year, whatever hyphens the title contains.
            title, author, year = file.stem.rsplit("-", 2)
            year = int(year)
        except ValueError as exc:
            raise ValueError(
                f"Expected '<title>-<author>-<year>.txt', got {file.name!r}"
            ) from exc

        records.append({
            "text": file.read_text(encoding="utf-8"),
            "title": title.replace("_", " "),
            "author": author,
            "year": year,
        })

    # Passing `columns` keeps the schema intact even when no file was found.
    df = pd.DataFrame(records, columns=["text", "title", "author", "year"])
    return df.sort_values("year", kind="stable").reset_index(drop=True)

In [3]:
# Part B: type-token ratio (TTR)
def nltk_ttr(df):
    """Compute the type-token ratio (TTR) of every text in a DataFrame.

    Each text is tokenised with NLTK's ``word_tokenize``; only purely
    alphabetic tokens are kept and they are lower-cased before counting. The
    ratio is distinct tokens divided by total tokens, or 0 when a text has no
    alphabetic token at all.

    Args:
        df (pd.DataFrame): Frame with a ``text`` and a ``title`` column.

    Returns:
        dict: Maps each title to its TTR. Rows that share a title overwrite
        one another, the last row winning.
    """
    def _ratio(text):
        # Case-folded alphabetic tokens only: punctuation and numbers are out.
        alpha_tokens = [tok.lower() for tok in word_tokenize(text) if tok.isalpha()]
        if not alpha_tokens:
            return 0
        return len(set(alpha_tokens)) / len(alpha_tokens)

    return {record['title']: _ratio(record['text']) for _, record in df.iterrows()}

In [4]:
# Part C: Flesch-Kincaid Grade Level
def count_syl(word, d):
    """Count the syllables of a word, preferring a pronunciation dictionary.

    The word is lower-cased first. If it is a key of ``d``, the answer is the
    number of phonemes in its FIRST listed pronunciation whose last character
    is a digit (in CMUdict-style entries those are the stress-marked vowels).
    Otherwise a spelling heuristic is used: count runs of consecutive vowels
    (``a e i o u y``), drop one for a trailing ``e`` when more than one run was
    found, and never return less than 1.

    Args:
        word (str): The word to count syllables for.
        d (dict): Maps lower-case words to a list of pronunciations, each a
            list of phoneme strings.

    Returns:
        int: The syllable count (at least 1 on the heuristic path).
    """
    key = word.lower()

    # Dictionary hit: trust the first pronunciation.
    if key in d:
        return sum(1 for phoneme in d[key][0] if phoneme[-1].isdigit())

    # Fallback: a vowel run starts wherever a vowel follows a non-vowel. The
    # leading space supplies a non-vowel predecessor for the first character.
    vowel_set = frozenset("aeiouy")
    groups = sum(
        1
        for before, here in zip(" " + key, key)
        if here in vowel_set and before not in vowel_set
    )

    # Treat a final 'e' as silent, but only if another vowel run remains.
    if groups > 1 and key.endswith('e'):
        groups -= 1

    return max(groups, 1)

def flesch_kincaid(df):
    """Compute the Flesch-Kincaid Grade Level of every text in a DataFrame.

    For each row the text is split into sentences (``sent_tokenize``) and into
    alphabetic word tokens (``word_tokenize``), and syllables are counted with
    ``count_syl`` backed by the CMU pronouncing dictionary. The grade is

        0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59

    A higher grade means a harder text.

    Args:
        df (pd.DataFrame): Frame with a ``text`` and a ``title`` column.

    Returns:
        dict: Maps each title to its grade level; 0 is stored for a text with
        no alphabetic words or no sentences.
    """
    # Load the pronunciation dictionary once and share it across all rows.
    pronunciations = cmudict.dict()
    grades = {}

    for _, record in df.iterrows():
        body = record['text']
        sentence_total = len(sent_tokenize(body))
        alpha_words = [tok for tok in word_tokenize(body) if tok.isalpha()]
        word_total = len(alpha_words)
        syllable_total = sum(count_syl(tok, pronunciations) for tok in alpha_words)

        # Guard both divisions below against a zero denominator.
        if word_total <= 0 or sentence_total <= 0:
            grades[record['title']] = 0
            continue

        grades[record['title']] = (
            (0.39 * (word_total / sentence_total))
            + (11.8 * (syllable_total / word_total))
            - 15.59
        )

    return grades


In [5]:
# Part E: Parse text with spaCy
def parse(df, store_path=Path.cwd() / "pickles", out_name="parsed.pkl"):
    """Parse every text with spaCy, store the result in the frame and pickle it.

    The module-level ``nlp`` pipeline is applied to each value of the ``text``
    column and the results are stored in a new ``parsed`` column. The frame is
    modified IN PLACE and then written to ``store_path / out_name``.

    Args:
        df (pd.DataFrame): Frame with a ``text`` column; it gains a ``parsed``
            column as a side effect.
        store_path (Path | str): Directory for the pickle file. It is created
            (including parents) if it does not exist yet.
        out_name (str): File name of the pickle inside ``store_path``.

    Returns:
        pd.DataFrame: The same ``df`` object that was passed in, now carrying
        the ``parsed`` column.
    """
    # Prepare the output directory BEFORE parsing: parsing is the expensive
    # step, and a missing or uncreatable directory should not be discovered
    # only after all that work has been done.
    store_path = Path(store_path)
    store_path.mkdir(parents=True, exist_ok=True)

    df['parsed'] = df['text'].apply(nlp)

    with open(store_path / out_name, 'wb') as f:
        pickle.dump(df, f)

    return df

In [6]:
# Part E: Load parsed DataFrame from pickle
def load_parsed(store_path=Path.cwd() / "pickles", in_name="parsed.pkl"):
    """Unpickle and return the object stored at ``store_path / in_name``.

    Args:
        store_path (Path): Directory that contains the pickle file.
        in_name (str): File name of the pickle inside ``store_path``.

    Returns:
        Whatever object the file holds (the parsed DataFrame when the file
        was written by ``parse``).
    """
    pickle_file = store_path / in_name
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(pickle_file, 'rb') as source:
        return pickle.load(source)

In [None]:

# Part F: Working with parses: 
def objects_most_common(doc):
    """Return the ten most frequent syntactic objects of one parsed text.

    A token counts as an object when its dependency label is ``dobj`` or
    ``iobj``; tokens are counted by their lower-cased lemma.

    Args:
        doc: Iterable of tokens exposing ``dep_`` and ``lemma_`` (e.g. a spaCy
            ``Doc``).

    Returns:
        list[tuple[str, int]]: Up to ten ``(lemma, count)`` pairs, most
        frequent first.
    """
    object_deps = ('dobj', 'iobj')
    tally = Counter(tok.lemma_.lower() for tok in doc if tok.dep_ in object_deps)
    return tally.most_common(10)


def subjects_by_verb_count(doc, verb):
    """Return the ten most frequent syntactic subjects of a given verb.

    A token counts when its dependency label is ``nsubj`` or ``nsubjpass`` and
    the lower-cased lemma of its head equals ``verb`` (compared
    case-insensitively, so every inflected form of the verb matches through
    its lemma). Subjects are counted by their lower-cased lemma.

    Args:
        doc: Iterable of tokens exposing ``dep_``, ``lemma_`` and ``head``
            (e.g. a spaCy ``Doc``).
        verb (str): Lemma of the verb of interest, e.g. ``"hear"``.

    Returns:
        list[tuple[str, int]]: Up to ten ``(subject_lemma, count)`` pairs,
        ordered by descending frequency.
    """
    tally = Counter()
    for tok in doc:
        # Only grammatical subjects (active or passive) are of interest.
        if tok.dep_ not in ('nsubj', 'nsubjpass'):
            continue
        # ... and only those governed by the requested verb.
        if tok.head.lemma_.lower() != verb.lower():
            continue
        tally[tok.lemma_.lower()] += 1
    return tally.most_common(10)
    

def subjects_by_verb_pmi(doc, target_verb):
    """Return up to ten subjects of a verb, taken from a PMI-scored pair list.

    Every token labelled ``nsubj`` or ``nsubjpass`` whose head is tagged
    ``VERB`` contributes one ``(subject_lemma, verb_lemma)`` pair (both
    lower-cased). The pairs are handed to NLTK's ``BigramCollocationFinder``
    as two-token documents and scored with ``BigramAssocMeasures.pmi``. The
    scored list is then filtered down to the pairs whose verb equals
    ``target_verb`` (case-insensitive) and the first ten are returned.

    Args:
        doc: Iterable of tokens exposing ``dep_``, ``lemma_`` and ``head``
            (e.g. a spaCy ``Doc``).
        target_verb (str): Lemma of the verb of interest, e.g. ``"hear"``.

    Returns:
        list[tuple[str, float]]: Up to ten ``(subject_lemma, pmi_score)``
        pairs in the order produced by ``score_ngrams``; an empty list when
        the text has no subject-verb pair at all.
    """
    subject_deps = ('nsubj', 'nsubjpass')
    pairs = [
        (tok.lemma_.lower(), tok.head.lemma_.lower())
        for tok in doc
        if tok.dep_ in subject_deps and tok.head.pos_ == 'VERB'
    ]
    # Nothing to score: bail out before touching the collocation finder.
    if len(pairs) == 0:
        return []

    # Each pair is its own tiny "document", so the only bigrams the finder
    # sees are genuine (subject, verb) pairs.
    finder = BigramCollocationFinder.from_documents(pairs)
    ranked = finder.score_ngrams(BigramAssocMeasures.pmi)

    # Keep only the pairs governed by the requested verb.
    wanted = target_verb.lower()
    matches = [(subj, score) for (subj, verb), score in ranked if verb == wanted]
    return matches[:10]


In [None]:

# Locate the corpus and load it into a DataFrame ordered by publication year.
path = Path.cwd() / "p1-texts" / "novels"
print(path)
df = read_novels(path)
print(df.head())

# parse() adds the 'parsed' column and pickles the frame under cwd/pickles.
# Bind its return value explicitly instead of relying on the in-place mutation.
df = parse(df)
print(df.head())
print(nltk_ttr(df))
print(flesch_kincaid(df))

# Reload through the notebook's own loader; its defaults point at the same
# cwd/pickles/parsed.pkl file that parse() has just written.
df = load_parsed()

# Run the three parse-based analyses one after another. Each analysis makes a
# full pass over the novels, printing the title, the result and a blank gap.
for analysis in (
    objects_most_common,
    lambda doc: subjects_by_verb_count(doc, "hear"),
    lambda doc: subjects_by_verb_pmi(doc, "hear"),
):
    for _, row in df.iterrows():
        print(row["title"])
        print(analysis(row["parsed"]))
        print("\n")



c:\Users\Baleid\Desktop\BirkBeck Study\NLP\Coursework\nlp-coursework-2024-25-N-PolarStar\p1-texts\novels
                                                text                  title  \
0  \nCHAPTER 1\n\nThe family of Dashwood had long...  Sense and Sensibility   
1  'Wooed and married and a'.'\n'Edith!' said Mar...        North and South   
2  Book the First--Recalled to Life\n\n\n\n\nI. T...   A Tale of Two Cities   
3  SAMUEL BUTLER.\nAugust 7, 1901\n\nCHAPTER I: W...                Erewhon   
4  THE AMERICAN\n\nby Henry James\n\n\n1877\n\n\n...           The American   

    author  year  
0   Austen  1811  
1  Gaskell  1855  
2  Dickens  1858  
3   Butler  1872  
4    James  1877  
                                                text                  title  \
0  \nCHAPTER 1\n\nThe family of Dashwood had long...  Sense and Sensibility   
1  'Wooed and married and a'.'\n'Edith!' said Mar...        North and South   
2  Book the First--Recalled to Life\n\n\n\n\nI. T...   A Tale of Two 