# Exercise 9: Find 100 most frequent word forms in corpus


In [4]:
import os

## Get corpus files

In [5]:
corpus_path = "exercise-5/corpus"
print(corpus_path)

exercise-5/corpus


## List the files in the path

In [6]:
def traverse_directory(path):
  return [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]

files = traverse_directory(corpus_path)
print(files[:15])

['exercise-5/corpus/henry-vi-part-2_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/a-midsummer-nights-dream_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/henry-viii_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/hamlet_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/henry-iv-part-1_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/loves-labors-lost_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/twelfth-night_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/the-two-noble-kinsmen_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/alls-well-that-ends-well_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/richard-ii_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/romeo-and-juliet_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/the-merchant-of-venice_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/as-you-like-it_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/the-taming-of-the-shrew_TXT_FolgerShakespeare.txt', 'exercise-5/corpus/henry-iv-part-2_TXT_FolgerShakespeare.txt']


## Reading files, tokenizing its content and returning a list of normalized tokens

In [7]:
def tokenize_file(path):
  tokens = ""
  with open(path, 'r') as f:
    tokens = f.read()
    
  tokens_as_list = tokens.lower().split()
  normalized_tokens = []
  
  for token in tokens_as_list:
    normalized_token = ''.join(char for char in token if char.isalnum())
    if normalized_token != "":
      normalized_tokens.append(normalized_token)
  
  return normalized_tokens

## Compute counts

In [8]:
def compute_counts(pathlist):
  counts = {}
  for path in pathlist:
    tokens = tokenize_file(path)
    for token in tokens:
      if token in counts:
        counts[token] = counts[token] + 1
      else:
        counts[token] = 1 
  return counts

counts = compute_counts(files)

print(list(counts.items())[:15])

[('henry', 621), ('vi', 9), ('part', 485), ('2', 219), ('by', 4161), ('william', 127), ('shakespeare', 88), ('edited', 42), ('barbara', 42), ('a', 15666), ('mowat', 42), ('and', 28303), ('paul', 47), ('werstine', 42), ('with', 8616)]


## Sort counts

In [9]:
def sort_counts(counts):
  sorted_tuples = sorted(counts.items(), key=lambda item: item[1], reverse=True)
  return sorted_tuples

sorted_counts = sort_counts(counts)
print(sorted_counts[:15])

[('the', 29311), ('and', 28303), ('to', 21931), ('i', 21599), ('of', 18434), ('a', 15666), ('you', 14599), ('my', 13080), ('in', 11929), ('that', 11706), ('is', 9874), ('not', 8982), ('with', 8616), ('me', 8170), ('for', 8106)]


## Write counts to CSV


In [10]:
def write_frequencies(frequencies, path):
  rank = 0
  sum = 0
  with open(path, 'w') as f:
    for word_type in frequencies:
      sum = sum + word_type[1]
  # TODO: open the file at path in write mode
    for word_type in frequencies:
      rank = rank + 1
      f.write(str(rank) + "," + str(word_type[0]) + "," + str(word_type[1]) + "," + str(word_type[1]/sum) + "\n") 

  return

write_frequencies(sorted_counts, 'frequencies.csv')

## Print result

In [30]:
def print_top_100(frequencies):
    rank = 1
    sum_all_counts = 0
    for word_type in frequencies:
        sum_all_counts = sum_all_counts + word_type[1]

    print("Rank\t" + "Word_form\t" + "Count\t" + "Frequency\n")
    for word_type in frequencies:
        frequency = word_type[1]/sum_all_counts
        print(str(rank) + "\t" + word_type[0] + "\t" + str(word_type[1]) + "\t" + str(frequency) + "\n")
        rank +=1
        if rank == 101:
            break
    return


print_top_100(sorted_counts)

Rank	Word_form	Count	Frequency

1	the	29311	0.030497283326847723

2	and	28303	0.02944848725733585

3	to	21931	0.02281859781792151

4	i	21599	0.02247316101724895

5	of	18434	0.01918006621565661

6	a	15666	0.01630003891366369

7	you	14599	0.015189854978972055

8	my	13080	0.013609377568665969

9	in	11929	0.01241179396151501

10	that	11706	0.012179768640581333

11	is	9874	0.010273623403135151

12	not	8982	0.009345522119400438

13	with	8616	0.008964709260827675

14	me	8170	0.008500658618960318

15	for	8106	0.008434068392324644

16	it	8099	0.008426785086286368

17	he	7946	0.008267592825735458

18	his	7649	0.00795857255525428

19	be	7293	0.0075881644195933415

20	this	7107	0.007394636573433413

21	your	7046	0.0073311677636712855

22	as	6967	0.007248970452667875

23	but	6682	0.006952435849680887

24	have	6243	0.006495668513851808

25	thou	5881	0.006119017544443774

26	him	5561	0.005786066411265401

27	so	5388	0.005606064704890844

28	will	5273	0.005486410391404867

29	what	4677	0.0048662889058