# Exercise 9
Reimplementation of Exercise 5

In [1]:
import csv
import os

## Traverse Directory

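This function takes the path of a directory and returns a list
with the paths of all the regular files that lie directly within it
(subdirectories are skipped, and files are not filtered by extension).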

In [2]:
def traverse_directory(path, extension=None):
    """Return the paths of the regular files located directly inside ``path``.

    Subdirectories are skipped and are not descended into.

    Args:
        path: Directory to list.
        extension: Optional file-name suffix (e.g. ``".txt"``). When given,
            only files whose name ends with it are returned. The default
            ``None`` keeps every regular file.

    Returns:
        A list of ``os.path.join(path, name)`` strings, sorted by file name.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``path`` does not exist
            or is not a directory).
    """
    file_paths = []
    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # (and everything derived from it) reproducible across runs and machines.
    for name in sorted(os.listdir(path)):
        if extension is not None and not name.endswith(extension):
            continue
        full_path = os.path.join(path, name)
        if os.path.isfile(full_path):
            file_paths.append(full_path)
    return file_paths

## Tokenize File
This function takes a path, opens the file,
reads its contents and tokenizes them. It returns
a list of normalized tokens (lowercased, with surrounding punctuation stripped).

In [3]:
def tokenize_file(path, encoding=None):
    """Read a text file and return its normalized, non-empty tokens.

    The content is split on whitespace; each token is lowercased and has any
    leading/trailing characters from ``,.!?[]()=-`` stripped. Tokens that end
    up empty (e.g. a lone ``--``) are dropped.

    Args:
        path: Path of the file to read.
        encoding: Text encoding passed to ``open``. The default ``None`` uses
            the platform's default encoding.

    Returns:
        A list of normalized token strings, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content does not match the encoding.
    """
    # The context manager guarantees the handle is closed, even on errors.
    with open(path, encoding=encoding) as f:
        content = f.read()

    stripped = (token.lower().strip(",.!?[]()=-") for token in content.split())
    # Filter into a new list instead of calling remove() on the list being
    # iterated: removing during iteration skips the following element, so
    # consecutive empty tokens would survive.
    return [token for token in stripped if token]

## Compute Counts
This function takes a list of paths, and for every
file it calls tokenize_file. Then it populates a
dictionary that for every token lists how many
times it occurs in the entire corpus, so { word : count }

In [4]:
def compute_counts(pathlist):
    """Tally how often each token occurs across all files in ``pathlist``.

    Every path is handed to ``tokenize_file`` and the tokens it returns are
    accumulated into a single dictionary.

    Args:
        pathlist: Iterable of file paths.

    Returns:
        A dict mapping each token to its total number of occurrences,
        i.e. ``{word: count}``.
    """
    tally = {}
    for file_path in pathlist:
        for word in tokenize_file(file_path):
            # A word not seen before starts from 0.
            tally[word] = tally.get(word, 0) + 1
    return tally

## Sort Counts
Dictionaries are great, but they are not sorted by their values.
{ key1 : value1, key2 : value2 } compares equal to
{ key2 : value2, key1 : value1 }
So if we want to rank the entries of a dictionary, we need to resort to
lists again, which are ordered (using indexes).
This function takes a dictionary like the one
compute_counts() creates, and returns a list of (word, count) tuples
sorted from the highest count to the lowest.

In [5]:
def sort_counts(counts):
    """Return the items of ``counts`` ordered by descending count.

    Args:
        counts: Dict mapping a word to its count.

    Returns:
        A list of ``(word, count)`` tuples, highest count first.
    """
    pairs = list(counts.items())
    # The sort is stable, so words with equal counts keep the order in which
    # the dictionary yields them.
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

## Write Frequencies
This function takes a list of (word, count) tuples as is produced by
sort_counts(), opens a new file handle and
writes one CSV row per word to that file: rank, word, count, relative frequency

In [6]:
def write_frequencies(frequencies, path):
    """Write a ranked frequency table to ``path`` in CSV format.

    Each row holds ``rank, word, count, relative frequency``, where the rank
    starts at 1 and follows the order of ``frequencies``, and the relative
    frequency is the count divided by the sum of all counts.

    Args:
        frequencies: Sequence of ``(word, count)`` entries, as returned by
            ``sort_counts``. It is iterated twice, so it must not be a
            one-shot iterator.
        path: Destination file; it is created or overwritten.

    Returns:
        None.

    Raises:
        OSError: If the file cannot be opened for writing.
        ZeroDivisionError: If entries exist but their counts sum to zero.
    """
    total = sum(entry[1] for entry in frequencies)
    # newline="" hands line-ending control to the csv writer, as the csv
    # module documentation recommends.
    with open(path, "w", newline="") as f:
        # csv.writer quotes words that contain commas or quotes, which plain
        # string concatenation would turn into malformed rows.
        writer = csv.writer(f, lineterminator="\n")
        for rank, entry in enumerate(frequencies, start=1):
            writer.writerow([rank, entry[0], entry[1], entry[1] / total])

## Call the functions
Next we have to call all the functions individually to generate our output file with the frequencies within.

In [7]:
# The corpus lives in the sibling exercise-5 folder, relative to this notebook.
corpus_dir = os.path.join("..", "exercise-5", "corpus")
files = traverse_directory(corpus_dir)

In [8]:
# Count every token across all corpus files: {word: count}.
counts = compute_counts(files)

In [9]:
# Rank the (word, count) pairs from most to least frequent.
sorted_counts = sort_counts(counts)

In [10]:
# Persist the ranked table next to the notebook (path is relative to the working directory).
write_frequencies(sorted_counts, 'frequencies.csv')

## Print 100 most frequent words
This function prints the 100 most frequent words

In [11]:
def top100(frequencies, limit=100):
    """Print a tab-separated table of the most frequent words.

    A header line is printed first, then one line per entry with its rank
    (starting at 1), word, count and relative frequency (count divided by the
    sum of all counts in ``frequencies``, not only of the printed ones).
    Every printed line is followed by an empty line.

    Args:
        frequencies: Sequence of ``(word, count)`` entries, expected to be
            ordered by descending count (e.g. the result of ``sort_counts``).
            It is iterated twice, so it must not be a one-shot iterator.
        limit: Maximum number of entries to print. Defaults to 100.

    Returns:
        None.
    """
    total = sum(entry[1] for entry in frequencies)

    print("Rank\tWord\tCount\tFrequency\n")
    for rank, entry in enumerate(frequencies, start=1):
        if rank > limit:
            break
        word, count = entry[0], entry[1]
        # The trailing "\n" plus print's own newline leaves a blank line
        # between rows.
        print(f"{rank}\t{word}\t{count}\t{count / total}\n")

## Show the final output

In [12]:
top100(sorted_counts)

Rank	Word	Count	Frequency

1	the	29236	0.03041915295415173

2	and	28282	0.029426545486705407

3	to	21904	0.022790433927614567

4	i	21122	0.021976787135640746

5	of	18427	0.01917272306355705

6	a	15612	0.016243802706259983

7	you	14493	0.015079517846645268

8	my	13055	0.013583323362171667

9	in	11907	0.012388864900297053

10	that	11684	0.012156840303608868

11	is	9841	0.010239255856540128

12	not	8911	0.009271619646136478

13	with	8608	0.008956357525972708

14	for	8087	0.008414273154337976

15	it	8022	0.008346642666514064

16	me	8009	0.008333116568949282

17	he	7925	0.008245717169299921

18	his	7626	0.00793461692530993

19	be	7250	0.007543400564974691

20	your	7040	0.007324902065851286

21	this	6991	0.007273919082722491

22	as	6952	0.007233340790028145

23	but	6668	0.006937847581689826

24	have	6229	0.006481081671617565

25	thou	5864	0.0061013104707602185

26	him	5445	0.005665353941556854

27	so	5310	0.005524890620691808

28	will	5233	0.00544477450434656

29	what	4652	0.0048402619901051