In [78]:
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.stem.porter import PorterStemmer
from nltk import FreqDist  # Importing FreqDist

In [79]:
def load_corpus(corpus_path="./corpus/"):
    """Create an NLTK plaintext corpus reader for a directory of text files.

    Args:
        corpus_path: Directory that holds the corpus files. Defaults to the
            ``./corpus/`` folder, resolved relative to the current working
            directory, so the notebook does not depend on one machine's
            absolute path.

    Returns:
        A ``PlaintextCorpusReader`` rooted at ``corpus_path``.
    """
    # ".*" is handed to the reader as its file-id pattern, so no file name
    # under corpus_path is filtered out here.
    return PlaintextCorpusReader(corpus_path, ".*")

In [80]:
def get_length(subcorpus):
    """Return how many items ``subcorpus`` contains, as reported by ``len``."""
    item_count = len(subcorpus)
    return item_count

In [81]:
def get_lexical_diversity(subcorpus):
    """Return the ratio of distinct items to total items in ``subcorpus``.

    Items are compared exactly as given (no case folding or filtering), so
    "The" and "the" count as two distinct items.

    Args:
        subcorpus: A sized iterable of hashable tokens.

    Returns:
        ``len(set(subcorpus)) / len(subcorpus)`` as a float, or ``0.0`` when
        ``subcorpus`` is empty (instead of raising ``ZeroDivisionError``).
    """
    total_tokens = len(subcorpus)
    if total_tokens == 0:
        # An empty text has no vocabulary; avoid dividing by zero.
        return 0.0
    return len(set(subcorpus)) / total_tokens

In [82]:
from nltk import FreqDist 

def get_top_ten_most_frequent_words(subcorpus):
    """Count the purely alphabetic tokens and return the ten most frequent.

    Tokens for which ``isalpha()`` is false (punctuation, numbers, mixed
    tokens) are ignored. Counting is case-sensitive.

    Returns:
        A dict mapping each of the (at most) ten most common words to its
        number of occurrences.
    """
    counts = FreqDist(token for token in subcorpus if token.isalpha())
    return {word: occurrences for word, occurrences in counts.most_common(10)}

In [83]:
from nltk import FreqDist

def get_words_with_at_least_10_characters(subcorpus):
    """Return the ten most frequent tokens that are 10+ characters long.

    Only the ten most common long tokens are kept, because the full list
    of long words is too large to be useful when printed.

    Returns:
        A dict mapping each selected token to its number of occurrences.
    """
    minimum_length = 10
    long_word_counts = FreqDist(
        token for token in subcorpus if len(token) >= minimum_length
    )
    top_long_words = {}
    for token, occurrences in long_word_counts.most_common(10):
        top_long_words[token] = occurrences
    return top_long_words

In [84]:
def get_longest_sentence(corpus: "PlaintextCorpusReader", subcorpus_filename: str):
    """Return the sentence with the most tokens in one file of the corpus.

    Args:
        corpus: Reader whose ``sents(fileid)`` yields sentences as token lists.
        subcorpus_filename: File id, within the corpus, to scan.

    Returns:
        The longest sentence as a list of tokens. If several sentences share
        the maximum length, the first one encountered is returned. An empty
        list is returned when the file contains no sentences (instead of
        ``max`` raising ``ValueError``).
    """
    sentences = corpus.sents(subcorpus_filename)
    # default=[] keeps an empty file from crashing the whole analysis.
    return max(sentences, key=len, default=[])

In [85]:
def get_sentence_length(longest_sentence):
    """Return the number of tokens in ``longest_sentence``."""
    token_count = len(longest_sentence)
    return token_count

In [86]:
def get_sentence_stemmed(sentence: list[str]):
    """Stem every token of ``sentence`` with a fresh ``PorterStemmer``.

    Returns:
        A new list holding the stem of each token, in the original order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, sentence))

In [87]:
def analyze(subcorpus_filename: str):
    """Print a text-statistics report for one file of the corpus.

    The report lists, in order: token count, lexical diversity, the ten most
    frequent alphabetic words, the ten most frequent words of 10+ characters,
    the longest sentence with its length, and that sentence after stemming.
    Headings are wrapped in ANSI bold escape codes.

    Args:
        subcorpus_filename: File id, within the loaded corpus, to analyze.
    """
    reader = load_corpus()
    tokens = reader.words(subcorpus_filename)

    print("\033[1mAnalysis for\033[0m", subcorpus_filename)
    print("")

    token_total = get_length(tokens)
    print("\033[1mLength:\033[0m", token_total)
    print("")

    diversity = get_lexical_diversity(tokens)
    print("\033[1mLexical diversity:\033[0m", diversity)
    print("")

    frequent_words = get_top_ten_most_frequent_words(tokens)
    print("\033[1mTop 10 most frequent words:\033[0m")
    for word, occurrences in frequent_words.items():
        print(word, occurrences)
    print("")

    long_words = get_words_with_at_least_10_characters(tokens)
    print("\033[1mWords with at least 10 characters (top 10):\033[0m")
    for word, occurrences in long_words.items():
        print(word, occurrences)
    print("")

    # Fetch the sentence once; it is reused for display, length and stemming.
    longest = get_longest_sentence(reader, subcorpus_filename)
    longest_text = " ".join(longest)
    longest_size = get_sentence_length(longest)
    print("\033[1mLongest sentence:\033[0m", longest_text)
    print("\033[1mThe longest sentence is\033[0m", longest_size, "words long")
    print("")

    stemmed_text = " ".join(get_sentence_stemmed(longest))
    print("\033[1mLongest sentence stemmed:\033[0m", stemmed_text)


In [88]:
# Subcorpus files to report on, in order; edit this tuple to analyze others.
SUBCORPUS_FILENAMES = ("text_spok.txt", "text_fic.txt", "text_news.txt")
REPORT_SEPARATOR = "============================================================================================================================================"

for position, filename in enumerate(SUBCORPUS_FILENAMES):
    # The separator goes between reports only, never before the first one.
    if position > 0:
        print(REPORT_SEPARATOR)
    analyze(filename)

[1mAnalysis for[0m text_spok.txt

[1mLength:[0m 1294949

[1mLexical diversity:[0m 0.025391733574063536

[1mTop 10 most frequent words:[0m
the 38047
to 25177
of 19615
a 18867
and 17712
that 17657
I 17457
you 13997
in 13810
s 13559

[1mWords with at least 10 characters:[0m
government 586
Washington 412
Republican 333
everything 320
Republicans 301
understand 293
administration 290
UNIDENTIFIED 247
information 195
investigation 179

[1mLongest sentence:[0m They ' ve allowed Missouri to say that for incompetent persons who have not left absolutely clear and explicit instructions that Missouri may say that the only thing that matters is that they live longer and that as long as we can keep them living longer , we must , even if it means suffering , even if it means travail for the @ @ @ @ @ @ @ @ @ @ terrible pain , it does n ' t matter , Missouri may say that preservation of life is it , and then they can make a procedural standard that guarantees that almost everyone ends up i