In [1]:
import nltk
import spacy
from pathlib import Path
import pandas as pd
import os
from nltk.tokenize import word_tokenize, sent_tokenize
import string
import pickle
from collections import Counter
import math




# Shared spaCy pipeline (small English model), loaded once and reused by parse().
nlp = spacy.load("en_core_web_sm")
# Allow texts of up to 2,000,000 characters per call
# (presumably so whole novels fit in one document -- confirm against corpus sizes).
nlp.max_length = 2_000_000
                             
        
                             
    

In [2]:
def read_novels(path=Path.cwd() / "p1-texts" / "novels"):
    """Load every ``*.txt`` novel in ``path`` into a DataFrame sorted by year.

    File stems are expected to look like ``Title_Words-Author_Name-1850``:
    the stem is split on ``-``; the last piece is the year, the one before it
    the author, and everything earlier the title. Underscores become spaces.
    Files whose stem has fewer than three ``-``-separated pieces are skipped.

    Args:
        path: directory (``pathlib.Path``) containing the text files.

    Returns:
        DataFrame with columns ``text``, ``title``, ``author``, ``year``
        (``year`` as int), ordered by ascending year with a fresh 0..n-1
        index. Novels sharing a year keep filename order. If no file
        qualifies, an empty DataFrame with those four columns is returned.

    Raises:
        ValueError: if the year piece of a file name is not an integer.
    """
    columns = ['text', 'title', 'author', 'year']
    data = []

    # glob() yields files in arbitrary order; sorting keeps the result
    # reproducible across runs and platforms.
    for file in sorted(path.glob("*.txt")):
        parts = file.stem.split("-")
        if len(parts) < 3:
            continue

        title = ' '.join(part.replace('_', ' ') for part in parts[:-2])
        author = parts[-2].replace('_', ' ')
        year = int(parts[-1])

        with file.open(encoding='utf-8') as f:
            text = f.read()

        data.append({
            'text': text,
            'title': title,
            'author': author,
            'year': year,
        })

    # Passing the columns explicitly keeps the 'year' column present even
    # when no file matched, so the sort below cannot raise KeyError.
    df = pd.DataFrame(data, columns=columns)

    # A stable sort preserves filename order among novels of the same year.
    return df.sort_values('year', kind='stable').reset_index(drop=True)

In [6]:
def nltk_ttr(text):
    """Return the type-token ratio of ``text``.

    The text is tokenized with NLTK's ``word_tokenize``; only tokens for
    which ``str.isalpha()`` is true are kept, and they are lowercased so the
    comparison is case-insensitive. The ratio is
    (number of distinct kept tokens) / (number of kept tokens).
    Returns 0 when no token qualifies.
    """
    lowered = []
    for tok in word_tokenize(text):
        if tok.isalpha():
            lowered.append(tok.lower())

    if len(lowered) == 0:
        return 0

    return len(set(lowered)) / len(lowered)

In [7]:
def get_ttrs(df):
    """Map each row's ``title`` to the type-token ratio of its ``text``.

    Returns a plain dict (the DataFrame itself is not modified). If two rows
    share a title, the later row's value wins.
    """
    return {
        record["title"]: nltk_ttr(record["text"])
        for _, record in df.iterrows()
    }

In [8]:
def count_syl(word, d):
    """Estimate the number of syllables in ``word``.

    Args:
        word: the word to measure; it is lowercased before any lookup.
        d: mapping from lowercase word to a list of pronunciations, each a
            sequence of phoneme strings (e.g. the dict returned by
            ``nltk.corpus.cmudict.dict()``).

    Returns:
        If ``word`` is in ``d``: the number of phonemes in its first
        pronunciation that end in a digit. Otherwise: the number of maximal
        runs of consecutive vowel letters (``aeiouy``), but never less
        than 1.
    """
    word = word.lower()
    if word in d:
        return sum(1 for phoneme in d[word][0] if phoneme[-1].isdigit())

    vowels = 'aeiouy'
    syllable_count = 0
    prev_letter_was_vowel = False

    for letter in word:
        is_vowel = letter in vowels
        # Count only the first letter of each vowel run.
        if is_vowel and not prev_letter_was_vowel:
            syllable_count += 1
        # Track the actual vowel status so that a run of three or more
        # vowels ("eau", "ueue") is still treated as a single cluster.
        prev_letter_was_vowel = is_vowel

    return max(1, syllable_count)

In [9]:
def fk_level(text, d):
    """Compute the Flesch-Kincaid grade level of ``text``.

    Sentences come from NLTK's ``sent_tokenize``; words are the lowercased
    ``word_tokenize`` tokens of each sentence for which ``str.isalpha()`` is
    true. Syllables are counted per word with ``count_syl(word, d)``.

    Args:
        text: the raw text to score.
        d: pronunciation dictionary forwarded to ``count_syl``.

    Returns:
        ``0.39 * words/sentences + 11.8 * syllables/words - 15.59``,
        or 0 when the text yields no words.
    """
    sentences = sent_tokenize(text)

    # isalpha() also filters out punctuation tokens and empty strings.
    words = [
        token.lower()
        for sentence in sentences
        for token in word_tokenize(sentence)
        if token.isalpha()
    ]

    if not (words and sentences):
        return 0

    n_words = len(words)
    n_sentences = len(sentences)

    n_syllables = 0
    for w in words:
        n_syllables += count_syl(w, d)

    words_per_sentence = n_words / n_sentences
    syllables_per_word = n_syllables / n_words

    return 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

In [10]:
def get_fks(df):
    """Map each row's ``title`` to the Flesch-Kincaid level of its ``text``.

    The CMU pronouncing dictionary is loaded once via
    ``nltk.corpus.cmudict.dict()`` and shared across all rows. Scores are
    rounded to 4 decimal places. Returns a plain dict; the DataFrame itself
    is not modified.
    """
    pronunciations = nltk.corpus.cmudict.dict()
    return {
        record["title"]: round(fk_level(record["text"], pronunciations), 4)
        for _, record in df.iterrows()
    }

In [11]:
def parse(df, store_path=Path.cwd() / "pickle", out_name="parsed.pickle"):
    """Run the spaCy pipeline over every text and pickle the result.

    Adds a ``doc`` column to ``df`` (the passed-in frame is modified in
    place) holding ``nlp(text)`` for each row's ``text``, then pickles the
    whole DataFrame to ``store_path / out_name``, creating ``store_path``
    if it does not exist.

    Args:
        df: DataFrame with a ``text`` column.
        store_path: directory (``pathlib.Path``) for the pickle file.
        out_name: file name of the pickle inside ``store_path``.

    Returns:
        The same DataFrame, now including the ``doc`` column.
    """
    def _to_doc(raw_text):
        # Thin wrapper so the module-level pipeline is applied per row.
        return nlp(raw_text)

    store_path.mkdir(parents=True, exist_ok=True)
    df['doc'] = df['text'].apply(_to_doc)

    output_file = store_path / out_name
    with output_file.open('wb') as sink:
        pickle.dump(df, sink)
    print(f"Saved parsed DataFrame to {output_file}")

    return df

In [4]:
if __name__ == "__main__":
    # Driver: load the corpus, print the readability statistics, then run
    # the analyses that need spaCy-parsed documents.
    path = Path.cwd() / "p1-texts" / "novels"
    print(path)
    df = read_novels(path)
    print(df.head())

    # Resources used by word_tokenize / sent_tokenize and by get_fks.
    nltk.download("cmudict")
    nltk.download("punkt")
    # Newer NLTK releases look up "punkt_tab" instead of "punkt" for tokenizing.
    nltk.download("punkt_tab")

    print(get_ttrs(df))
    print(get_fks(df))

    # Reuse a previously parsed corpus when one exists; otherwise parse now
    # and cache it. One path is used for both writing and reading (it matches
    # parse()'s default location), so the load cannot miss the saved file.
    parsed_file = Path.cwd() / "pickle" / "parsed.pickle"
    if parsed_file.exists():
        # NOTE: only unpickle files this notebook produced itself.
        df = pd.read_pickle(parsed_file)
    else:
        df = parse(df, store_path=parsed_file.parent, out_name=parsed_file.name)

    for i, row in df.iterrows():
        print(row["title"])
        print(adjective_counts(row["doc"]))
        print("\n")

    for i, row in df.iterrows():
        print(row["title"])
        print(subjects_by_verb_count(row["doc"], "hear"))
        print("\n")

    for i, row in df.iterrows():
        print(row["title"])
        print(subjects_by_verb_pmi(row["doc"], "hear"))
        print("\n")

NameError: name 'novels_df' is not defined