In [58]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import cmudict
import spacy
from pathlib import Path
import pandas as pd
import string
import re

# Load spaCy's small English pipeline once; `nlp` is not used by any cell visible
# here, so it is presumably consumed by later parsing cells (TODO confirm).
nlp = spacy.load("en_core_web_sm")
# Raise spaCy's per-text length limit to 2,000,000 characters, presumably so a
# whole novel can be passed to `nlp` in a single call (TODO confirm corpus sizes).
nlp.max_length = 2000000


In [None]:
# Part A: Read novels
def read_novels(path=Path.cwd() / "texts" / "novels"):
    """Load every ``.txt`` novel in a directory into a DataFrame.

    File names must follow ``<title>-<author>-<year>.txt``, with underscores
    standing in for spaces in the title, e.g.
    ``Sense_and_Sensibility-Austen-1811.txt``. The title itself may contain
    hyphens (``Catch-22-Heller-1961.txt``); only the last two hyphens are
    treated as separators.

    Args:
        path (Path | str): Directory to scan (non-recursively) for ``.txt`` files.

    Returns:
        pd.DataFrame: One row per file with columns ``text``, ``title``,
        ``author`` and ``year`` (int), sorted by ``year`` ascending and
        re-indexed from 0. Files sharing a year keep file-name order.

    Raises:
        ValueError: If a file name does not have the ``title-author-year``
            shape or its year component is not an integer.
    """
    columns = ["text", "title", "author", "year"]
    records = []

    # Path.glob yields entries in arbitrary, OS-dependent order; sorting makes
    # the result reproducible across machines and runs.
    for file in sorted(Path(path).glob("*.txt")):
        # Split from the right so hyphens inside the title are preserved.
        parts = file.stem.rsplit("-", 2)
        try:
            title, author, year = parts
            year = int(year)
        except ValueError as exc:
            raise ValueError(
                f"Cannot parse '{file.name}': expected '<title>-<author>-<year>.txt'"
            ) from exc

        records.append({
            "text": file.read_text(encoding="utf-8"),
            "title": title.replace('_', ' '),
            "author": author,
            "year": year,
        })

    # Passing `columns` keeps the expected schema even when no files were found.
    df = pd.DataFrame(records, columns=columns)

    # A stable sort keeps same-year novels in the (sorted) file-name order.
    return df.sort_values('year', kind="stable").reset_index(drop=True)

In [18]:
# Load the corpus from ./texts/novels (the same location as the function's
# default argument) and display the resulting DataFrame as the cell output.
read_novels(path=Path.cwd() / "texts" / "novels") 

Unnamed: 0,text,title,author,year
0,\nCHAPTER 1\n\nThe family of Dashwood had long...,Sense and Sensibility,Austen,1811
1,'Wooed and married and a'.'\n'Edith!' said Mar...,North and South,Gaskell,1855
2,Book the First--Recalled to Life\n\n\n\n\nI. T...,A Tale of Two Cities,Dickens,1858
3,"SAMUEL BUTLER.\nAugust 7, 1901\n\nCHAPTER I: W...",Erewhon,Butler,1872
4,THE AMERICAN\n\nby Henry James\n\n\n1877\n\n\n...,The American,James,1877
5,\nThe Picture of Dorian Gray\n\nby\n\nOscar Wi...,Dorian Gray,Wilde,1890
6,Phase the First: The Maiden\n\n\nI\n\n\nOn an ...,Tess of the DUrbervilles,Hardy,1891
7,BOOK FIRST: THE PRINCE\n\n\n\n\nPART FIRST\n\n...,The Golden Bowl,James,1904
8,THE SECRET GARDEN\n\nBY FRANCES HODGSON BURNET...,The Secret Garden,Burnett,1911
9,Chapter 1\n\nOnce upon a time and a very good ...,Portrait of the Artist,Joyce,1916


In [None]:
def nltk_ttr(df):
    """Compute the type-token ratio (TTR) of every text in a DataFrame.

    Each text is tokenized with NLTK's ``word_tokenize``; only purely
    alphabetic tokens are kept, and they are lowercased before counting, so
    punctuation and numbers are ignored and case does not create new types.

    Args:
        df (pd.DataFrame): Frame with a ``text`` column and a ``title`` column.

    Returns:
        dict: Maps each title to ``distinct words / total words`` for its
        text, or ``0`` when the text contains no alphabetic tokens.
    """
    def ratio_for(passage):
        # Lowercased alphabetic tokens are the "tokens"; their set is the "types".
        vocabulary_stream = [
            token.lower() for token in word_tokenize(passage) if token.isalpha()
        ]
        if len(vocabulary_stream) > 0:
            return len(set(vocabulary_stream)) / len(vocabulary_stream)
        return 0

    ratios = {}
    for _, record in df.iterrows():
        ratios[record['title']] = ratio_for(record['text'])
    return ratios

In [66]:
def count_syl(word, d):
    """Count the syllables in a word.

    The lowercased word is looked up in ``d`` first. If present, the first
    listed pronunciation is used and every phone ending in a digit (a stress
    marker) counts as one syllable. Otherwise a spelling heuristic is applied:
    each run of consecutive vowels (``a e i o u y``) counts as one syllable,
    a trailing ``e`` is discounted when more than one run was found, and the
    result is never less than 1.

    Args:
        word (str): The word to count syllables for.
        d (dict): Maps lowercase words to a list of pronunciations, each a
            list of phone strings (the shape returned by ``cmudict.dict()``).

    Returns:
        int: The number of syllables in the word.
    """
    key = word.lower()

    if key in d:
        first_pronunciation = d[key][0]
        return sum(1 for phone in first_pronunciation if phone[-1].isdigit())

    # Fallback: one syllable per maximal run of vowel letters.
    vowel_runs = len(re.findall(r"[aeiouy]+", key))

    # A final 'e' is usually silent, but keep at least the one run we found.
    if vowel_runs > 1 and key.endswith('e'):
        vowel_runs -= 1

    return max(vowel_runs, 1)

def flesch_kincaid(df):
    """Compute the Flesch-Kincaid Grade Level of every text in a DataFrame.

    A higher grade means a harder text. The CMU pronouncing dictionary is
    loaded inside this function and handed to ``count_syl`` for syllable
    counts. Sentences come from ``sent_tokenize``; words are the purely
    alphabetic tokens from ``word_tokenize``.

    Args:
        df (pd.DataFrame): Frame with a ``text`` column and a ``title`` column.

    Returns:
        dict: Maps each title to its grade level,
        ``0.39 * words/sentences + 11.8 * syllables/words - 15.59``,
        or ``0`` when the text has no words or no sentences.
    """
    pronunciations = cmudict.dict()
    grades = {}

    for _, record in df.iterrows():
        passage = record['text']
        sentence_total = len(sent_tokenize(passage))
        alpha_tokens = [token for token in word_tokenize(passage) if token.isalpha()]
        word_total = len(alpha_tokens)
        syllable_total = sum(count_syl(token, pronunciations) for token in alpha_tokens)

        # Guard against division by zero for empty or punctuation-only texts.
        if not (word_total > 0 and sentence_total > 0):
            grades[record['title']] = 0
            continue

        words_per_sentence = word_total / sentence_total
        syllables_per_word = syllable_total / word_total
        grades[record['title']] = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59

    return grades
