In [15]:
"""
Name: Rishi Raj Das
Student Number: 165212234
Description: This program reads a text file, processes its contents,
counts how often each word appears using a dictionary (hash table),
and prints the 20 most frequent words in a table format.
It also implements a bonus feature that computes bigram frequencies
(pairs of consecutive words) and prints the 10 most frequent bigrams.
"""
def read_text(filename, encoding="utf-8"):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file contents, or an empty string if the file is missing or
        cannot be read. An error message is printed in either failure case.
    """
    try:
        with open(filename, "r", encoding=encoding) as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: The file {filename} was not found.")
        return ""
    except Exception as e:
        # Kept broad on purpose: callers rely on this function never raising
        # and treat an empty string as "nothing to process".
        print(f"Unexpected error:{e}")
        return ""

def preprocess(text):
    """Lowercase the text, strip non-letters, and split it into words.

    Every character that is neither a letter nor whitespace (punctuation,
    digits, symbols) is replaced by a space before splitting. Apostrophes are
    therefore treated as separators: "it's" becomes "it" and "s".

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of lowercase words in their original order; empty if the text
        contains no letters.
    """
    # Build the cleaned string in one join instead of repeated `+=`, which
    # may copy the growing string on every character.
    cleaned_text = "".join(
        ch if ch.isalpha() or ch.isspace() else " " for ch in text.lower()
    )
    return cleaned_text.split()
# Alternative approach (not used): replace each punctuation character in turn.
# It is less efficient because it makes one pass over the text per character,
# and I am not sure whether 'import string' is allowed for this project.
#     punctuations = ".,!?;:\"'()[]{}<>-_/\\|@#$%^&*~`+="
#     for p in punctuations:
#         text = text.replace(p, " ")
#     words = text.split()
def count_words(words):
    """Return a dictionary mapping each word to the number of times it occurs.

    The dictionary serves as the hash table; keys appear in the order in
    which each word is first seen.
    """
    tally = {}
    for token in words:
        # A missing key counts as zero, so first and repeat sightings share one path.
        tally[token] = tally.get(token, 0) + 1
    return tally

def get_top(word_freq, n=20):
    """Return the first n (key, count) pairs ordered by descending count.

    Pairs with equal counts keep the order they have in the dictionary.
    """
    ranked = list(word_freq.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

def display_func(top_words):
    """Print the word-frequency table.

    Args:
        top_words: List of (word, frequency) pairs, already sorted in the
            order they should be displayed.

    The header and every data row share the same column layout (a 15-wide
    left-aligned word, one space, a 10-wide right-aligned count), and the
    horizontal rules span exactly that width so the columns line up.
    """
    word_col, freq_col = 15, 10
    rule = "-" * (word_col + 1 + freq_col)

    # Report the number of rows actually shown rather than assuming 20.
    print(f"Top {len(top_words)} Word Frequencies")
    print(rule)
    print(f"{'Word':<{word_col}} {'Frequency':>{freq_col}}")
    print(rule)

    for word, freq in top_words:
        print(f"{word:<{word_col}} {freq:>{freq_col}}")

#Bonus Task: Bigram Frequency
def identify_bigram(words):
    """Return the list of consecutive word pairs, each pair as a tuple.

    Fewer than two words yields an empty list.
    """
    # Pairing the sequence with itself shifted by one gives every adjacent pair.
    return list(zip(words, words[1:]))

def count_bigrams(bigrams):
    """Return a dictionary mapping each bigram to the number of times it occurs.

    Keys appear in the order in which each bigram is first seen.
    """
    tally = {}
    for pair in bigrams:
        # A missing key counts as zero, so first and repeat sightings share one path.
        tally[pair] = tally.get(pair, 0) + 1
    return tally

def display_func_bigram(top_bigrams):
    """Print the bigram-frequency table, preceded by a blank line.

    Args:
        top_bigrams: List of ((word1, word2), frequency) pairs, already
            sorted in the order they should be displayed.

    Both horizontal rules span the full table width (a 25-wide left-aligned
    bigram, one space, a 10-wide right-aligned count).
    """
    bigram_col, freq_col = 25, 10
    rule = "-" * (bigram_col + 1 + freq_col)

    # Report the number of rows actually shown rather than assuming 10.
    print(f"\nTop {len(top_bigrams)} Bigram Frequencies")
    print(rule)
    print(f"{'Bigram':<{bigram_col}} {'Frequency':>{freq_col}}")
    print(rule)

    for (w1, w2), freq in top_bigrams:
        bigram_str = f"{w1} {w2}"  # join the tuple into readable text
        print(f"{bigram_str:<{bigram_col}} {freq:>{freq_col}}")
    
def main():
    """Run the word and bigram frequency analysis on the configured file.

    Reads the text, tokenizes it, and prints the 20 most frequent words
    followed by the 10 most frequent bigrams. If the file yields no words
    (missing, unreadable, or empty), a short notice is printed instead of
    two empty tables.
    """
    filename = "adventures_of_huckleberry_finn.txt"
    text = read_text(filename)

    words = preprocess(text)
    if not words:
        # read_text has already reported any I/O error; avoid empty tables.
        print(f"No words found in {filename}; nothing to report.")
        return

    word_frequency = count_words(words)
    top_words = get_top(word_frequency, n=20)
    print()
    display_func(top_words)

    # Bonus task: bigram frequencies.
    bigrams = identify_bigram(words)
    bigram_freq = count_bigrams(bigrams)
    top_bigrams = get_top(bigram_freq, n=10)  # get_top works for any key type
    print()
    display_func_bigram(top_bigrams)

# Run the analysis only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()


Top 20 Word Frequencies
---------------------------
Word             Frequency
---------------------------
the                  1574
and                  1061
of                    726
a                     702
to                    675
it                    543
in                    523
he                    493
was                   427
his                   418
i                     365
scrooge               363
that                  349
with                  267
you                   245
s                     237
as                    228
said                  221
had                   206
him                   198


Top 10 Bigram Frequencies
--------------------------
Bigram                     Frequency
-------------------------------------
in the                           144
of the                           106
it was                            98
said scrooge                      96
the ghost                         81
original manuscript               71
manuscript of       