# Part 1: Preprocessing

In [7]:
import string

text = ""

for filename in ['bible.txt', 'quran.txt']:
    with open(filename, 'r', encoding = 'utf-8') as file:
        text += file.read() + ' '

# Case folding
text = text.lower()

# Tokenization
def tokenizer(text):
    text = text.translate(str.maketrans('','',string.punctuation))
    token = text.split()
    return token

tokens = tokenizer(text)

In [13]:
# Calculate term frequency
term_freq = {}
for term in tokens:
    if term in term_freq:
        term_freq[term] += 1
    else:
        term_freq[term] = 1

term_freq_desc = sorted(term_freq.items(), key=lambda x: x[1], reverse=True)
term_freq_desc

[('the', 72143),
 ('and', 57540),
 ('of', 38509),
 ('to', 17242),
 ('in', 14958),
 ('that', 14943),
 ('he', 12735),
 ('shall', 11121),
 ('for', 10699),
 ('a', 10439),
 ('they', 10202),
 ('is', 9840),
 ('i', 9507),
 ('his', 9418),
 ('unto', 9000),
 ('lord', 8780),
 ('them', 8569),
 ('be', 8327),
 ('not', 8277),
 ('it', 7760),
 ('him', 7714),
 ('with', 7215),
 ('you', 7073),
 ('all', 6129),
 ('thou', 5474),
 ('will', 5392),
 ('have', 5285),
 ('their', 5213),
 ('which', 5039),
 ('was', 4928),
 ('but', 4902),
 ('my', 4827),
 ('from', 4683),
 ('said', 4613),
 ('thy', 4600),
 ('god', 4552),
 ('me', 4505),
 ('are', 4352),
 ('as', 4112),
 ('ye', 3983),
 ('thee', 3826),
 ('we', 3673),
 ('when', 3672),
 ('this', 3350),
 ('were', 3260),
 ('upon', 3120),
 ('by', 3084),
 ('your', 2957),
 ('out', 2903),
 ('there', 2786),
 ('allah', 2739),
 ('then', 2685),
 ('who', 2666),
 ('man', 2649),
 ('israel', 2608),
 ('on', 2567),
 ('people', 2505),
 ('up', 2466),
 ('son', 2435),
 ('do', 2430),
 ('had', 2342),

In [11]:
import matplotlib.pyplot as plt

plt.plot(term_freq_desc.keys(), term_freq_desc.values())
plt.title("Unique terms with frequency")
plt.xlabel("Terms")
plt.ylabel("Frequency")

plt.loglog()
plt.show()


AttributeError: 'list' object has no attribute 'keys'

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

# Stopping - Remove English stop words
stop_words = set(stopwords.words('english'))
filtered_tokens = []
for word in tokens:
    if word not in stop_words:
        filtered_tokens.append(word)

# Normalization - Perform Porter stemming
porter = PorterStemmer()
stemmed_tokens = []
for word in filtered_tokens:
    stemmed_tokens.append(porter.stem(word))
    
print(stemmed_tokens[:200])

In [None]:
# Save preprocessed tokens to new files
with open('bible_preprocessed.txt', 'w', encoding='utf-8') as bible_file:
    bible_file.write(" ".join(stemmed_tokens))

with open('quran_preprocessed.txt', 'w', encoding='utf-8') as quran_file:
    quran_file.write(" ".join(stemmed_tokens))

Compare the processed file to the new file. Are there any surprises? Discuss
what kind of modifications in preprocessing could be applied. For example:
- Additional words/terms to be filtered out
- Special tokenization
- Additional normalization to some terms

In [9]:
example = "politician: politicians, politics policy"
tokensEx = tokenizer(example)
tokensEx

['politician', 'politicians', 'politics', 'policy']

# Part 2: Text Laws

In [4]:
import matplotlib.pyplot as plt

# Calculate term frequency
term_freq = {}
for term in stemmed_tokens:
    if term in term_freq:
        term_freq[term] += 1
    else:
        term_freq[term] = 1