# Part 1: Preprocessing

In [3]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

text = ""

for filename in ['bible.txt', 'quran.txt']:
    with open(filename, 'r', encoding = 'utf-8') as file:
        text += file.read() + ' '

# Case folding
text = text.lower()

# Tokenization
def tokenizer(text):
    text = text.translate(str.maketrans('','',string.punctuation))
    token = text.split()
    return token

tokens = tokenizer(text)

# Stopping - Remove English stop words
stop_words = set(stopwords.words('english'))
filtered_tokens = []
for word in tokens:
    if word not in stop_words:
        filtered_tokens.append(word)

# Normalization - Perform Porter stemming
porter = PorterStemmer()
stemmed_tokens = []
for word in filtered_tokens:
    stemmed_tokens.append(porter.stem(word))
    
print(stemmed_tokens[:100])

# Save preprocessed tokens to new files
with open('bible_preprocessed.txt', 'w', encoding='utf-8') as bible_file:
    bible_file.write(" ".join(stemmed_tokens))

with open('quran_preprocessed.txt', 'w', encoding='utf-8') as quran_file:
    quran_file.write(" ".join(stemmed_tokens))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\2533a\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['\ufeffthe', 'project', 'gutenberg', 'ebook', 'king', 'jame', 'version', 'bibl', 'ebook', 'use', 'anyon', 'anywher', 'unit', 'state', 'part', 'world', 'cost', 'almost', 'restrict', 'whatsoev', 'may', 'copi', 'give', 'away', 'reus', 'term', 'project', 'gutenberg', 'licens', 'includ', 'ebook', 'onlin', 'wwwgutenbergorg', 'locat', 'unit', 'state', 'check', 'law', 'countri', 'locat', 'use', 'ebook', 'titl', 'king', 'jame', 'version', 'bibl', 'releas', 'date', 'august', '1', '1989', 'ebook', '10', 'recent', 'updat', 'may', '1', '2023', 'languag', 'english', 'start', 'project', 'gutenberg', 'ebook', 'king', 'jame', 'version', 'bibl', 'old', 'testament', 'king', 'jame', 'version', 'bibl', 'first', 'book', 'mose', 'call', 'genesi', 'second', 'book', 'mose', 'call', 'exodu', 'third', 'book', 'mose', 'call', 'leviticu', 'fourth', 'book', 'mose', 'call', 'number', 'fifth', 'book', 'mose', 'call', 'deuteronomi']


Compare the processed file to the new file. Are there any surprises? Discuss
what kind of modifications in preprocessing could be applied. For example:
- Additional words/terms to be filtered out
- Special tokenization
- Additional normalization to some terms

# Part 2: Text Laws