In [1]:
import nltk
from nltk.probability import FreqDist
from nltk import bigrams

# Load the raw SUBTLEX-US text. The handle is released as soon as the lines
# have been read; everything below works on in-memory data only.
# NOTE(review): the file is decoded as ISO-8859-2 (Central European). Confirm
# this matches the file's real encoding; a mismatch would silently remap
# accented characters rather than raise.
with open('./frequency_data/SubtlexUS/Subtlex.US.txt', encoding='iso8859_2') as f:
    lines = list(f)

# Tokenise: whitespace-split every line.
split_lines = list(map(str.split, lines))

# Normalise each token lazily: lowercase it, then trim punctuation from both
# ends (interior characters are left untouched).
normalised = (
    token.lower().strip("-,.:;\"'?!()*‘’")
    for tokens in split_lines
    for token in tokens
)

# Keep only tokens that are purely alphabetic once hyphens are ignored.
# Tokens that were trimmed down to the empty string are dropped here as well,
# because "".isalpha() is False.
words = [token for token in normalised if token.replace("-", "").isalpha()]

# Count single words and adjacent word pairs.
# NOTE(review): pairs are formed over the flattened, filtered token list, so a
# pair can straddle a line break or a token that was filtered out above.
frequency_dist = FreqDist(words)
bigram_frequency_dist = FreqDist(bigrams(words))



In [2]:
# Report which characters actually occur in the cleaned corpus tokens.
print("Set of all characters in the corpus:")

# Gather every distinct character appearing in any token of `words`.
characters = {character for word in words for character in word}

# Display the inventory (a set has no defined order, so the listing may vary
# between runs).
print(characters)


Set of all characters in the corpus:
{'ş', 'ď', 'ž', 'ń', 'ć', 'ř', 'á', 'g', 'c', 'ĺ', 'é', 'ŕ', 'ą', 't', 'y', 'ç', 'ý', 'ź', 'd', 'ü', 'đ', 'ë', 'b', 'u', 'n', 'ő', 'x', 'ľ', 's', 'â', 'ß', 'ţ', 'e', 'l', 'k', 'š', 'ű', 'm', 'î', 'a', 'ô', 'j', 'ö', 'ś', 'ă', 'h', 'ä', '-', 'w', 'ň', 'ú', 'ę', 'ť', 'ˇ', 'r', 'ł', 'p', 'č', 'ů', 'v', 'í', 'i', 'ż', 'z', 'o', 'q', 'f', 'ó', 'ě'}


In [3]:
def get_bigram_freq(word1, word2):
    """Return the stored count for the ordered word pair ``(word1, word2)``.

    The value is read from the module-level ``bigram_frequency_dist`` using
    the tuple ``(word1, word2)`` as the key, so argument order matters.
    """
    pair = (word1, word2)
    return bigram_frequency_dist[pair]

def get_freq(word):
    """Return the stored count for ``word``.

    The value is read from the module-level ``frequency_dist``, keyed by the
    word exactly as given.
    """
    count = frequency_dist[word]
    return count

In [4]:
# Spot-check the two lookup helpers on a known word and a known word pair.
print("Apple freq:", get_freq('apple'))
# The label names the pair that is actually queried ('apple', 'trees'), and has
# no trailing space because print() already inserts one between arguments.
print("Apple trees:", get_bigram_freq('apple', 'trees'))

Apple freq: 1168
Apple tree:  10


In [5]:
# Export the unigram table as "word,count" rows (UTF-8, no header row).
with open('./frequency_data/word_freq.csv', 'w', encoding="utf-8") as f:
    for word, count in frequency_dist.items():
        f.write(','.join((word, str(count))) + '\n')

# Export the bigram table as "first,second,count" rows (UTF-8, no header row).
with open('./frequency_data/bigram_freq.csv', 'w', encoding="utf-8") as f:
    for (first, second), count in bigram_frequency_dist.items():
        f.write(','.join((first, second, str(count))) + '\n')