In [81]:
import os
import re
import nltk
import openpyxl # type: ignore
from nltk.stem import PorterStemmer

In [82]:
DATA_URL  = "../data/documents/Cranfield collection"

Create Stemmer


In [83]:
stemmer = PorterStemmer()

In [84]:
def load_stopwords(file_path):
    try:
        with open(file_path, "r") as stop_file:
            content = stop_file.read()
            stopwords = content.split(",")
            return stopwords
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []

stopwords_file_path = "../data/stopword/words.txt"
stopwords = load_stopwords(stopwords_file_path)


In [85]:
def tokenization_and_stemming(text):
    tokens = per_tokenize(text)
    filtered_tokens = remove_stopwords(tokens)
    stems = [stemmer.stem(token) for token in filtered_tokens]
    return " ".join(stems)

def per_tokenize(text):
    text = re.sub(r"[^\w\s]", "", text)
    text = text.lower()
    tokens = text.split()
    return tokens


def remove_stopwords(tokens):
    stop_words = set(stopwords)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

Read Documents


In [86]:
import os
def parse_text(directory=DATA_URL):
    documents = []
    for filename in os.listdir(directory):
        if not filename.startswith("cran.all") or filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), "r") as file:
            text = file.read()
            documents = text.split(".I")
    return documents
documents = parse_text()
documents.remove(documents[0])

In [87]:
stem_data = []
for document in documents:
    stem_data.append(tokenization_and_stemming(document))
per_tokenize(stem_data[0])

['1',
 'experiment',
 'investig',
 'aerodynam',
 'wing',
 'slipstream',
 'brenckmanm',
 'sc',
 '25',
 '1958',
 '324',
 'experiment',
 'investig',
 'aerodynam',
 'wing',
 'slipstream',
 'experiment',
 'studi',
 'wing',
 'propel',
 'slipstream',
 'order',
 'determin',
 'spanwis',
 'distribut',
 'lift',
 'increas',
 'slipstream',
 'angl',
 'attack',
 'wing',
 'free',
 'stream',
 'slipstream',
 'veloc',
 'ratio',
 'intend',
 'evalu',
 'basi',
 'theoret',
 'treatment',
 'problem',
 'compar',
 'span',
 'load',
 'curv',
 'support',
 'evid',
 'substanti',
 'lift',
 'increment',
 'produc',
 'slipstream',
 'destal',
 'boundarylayercontrol',
 'integr',
 'remain',
 'lift',
 'increment',
 'subtract',
 'destal',
 'lift',
 'agre',
 'potenti',
 'flow',
 'theori',
 'empir',
 'evalu',
 'destal',
 'effect',
 'specif',
 'configur',
 'experi']

In [88]:
def create_word_count_list():
    word_count_list = []
    for document in stem_data:
        word_count = {}
        tokens = document.split()
        for token in tokens:
            token = token.strip('.,?!";:')
            if token in word_count:
                word_count[token] += 1
            else:
                word_count[token] = 1
        if word_count:
            del word_count[next(iter(word_count))]
        word_count_list.append(word_count)
    return word_count_list

In [89]:
def unique_words(word_count_list):
    unique_words = []
    for word_count in word_count_list:
        unique_words.extend(word_count)
    return set(unique_words)

In [90]:
import openpyxl

def write_to_excel(word_count_list, file_name):
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = "Word Count Data"
    all_words = set()
    for word_count in word_count_list:
        all_words.update(word_count.keys())
    all_words = sorted(all_words)
    sheet.cell(row=1, column=1).value = "Words"
    for col, document in enumerate(range(2, len(word_count_list) + 2), start=2):
        sheet.cell(row=1, column=col).value = f"Document {document - 1}"
    for row, word in enumerate(all_words, start=2):
        sheet.cell(row=row, column=1).value = word
        for col, word_count in enumerate(word_count_list, start=2):
            count = word_count.get(word, 0)  # If word not found, default to 0
            sheet.cell(row=row, column=col).value = count

    wb.save(file_name)

In [91]:

word_count_list = create_word_count_list()
unique_words_list = unique_words(word_count_list)
print_dic(word_count_list)


Document 1:
experiment: 3
investig: 2
aerodynam: 2
wing: 4
slipstream: 6
brenckmanm: 1
sc: 1
25: 1
1958: 1
324: 1
studi: 1
propel: 1
order: 1
determin: 1
spanwis: 1
distribut: 1
lift: 4
increas: 1
angl: 1
attack: 1
free: 1
stream: 1
veloc: 1
ratio: 1
intend: 1
evalu: 2
basi: 1
theoret: 1
treatment: 1
problem: 1
compar: 1
span: 1
load: 1
curv: 1
support: 1
evid: 1
substanti: 1
increment: 2
produc: 1
destal: 3
boundarylayercontrol: 1
integr: 1
remain: 1
subtract: 1
agre: 1
potenti: 1
flow: 1
theori: 1
empir: 1
effect: 1
specif: 1
configur: 1
experi: 1


In [92]:
print_dic(word_count_list)
write_to_excel(word_count_list, "word_count_data.xlsx")
word_count_list[0]
print("Number of unique words in all documents:", len(unique_words_list))


Document 1:
experiment: 3
investig: 2
aerodynam: 2
wing: 4
slipstream: 6
brenckmanm: 1
sc: 1
25: 1
1958: 1
324: 1
studi: 1
propel: 1
order: 1
determin: 1
spanwis: 1
distribut: 1
lift: 4
increas: 1
angl: 1
attack: 1
free: 1
stream: 1
veloc: 1
ratio: 1
intend: 1
evalu: 2
basi: 1
theoret: 1
treatment: 1
problem: 1
compar: 1
span: 1
load: 1
curv: 1
support: 1
evid: 1
substanti: 1
increment: 2
produc: 1
destal: 3
boundarylayercontrol: 1
integr: 1
remain: 1
subtract: 1
agre: 1
potenti: 1
flow: 1
theori: 1
empir: 1
effect: 1
specif: 1
configur: 1
experi: 1


In [None]:
print("Number of unique words in all documents:", len(unique_words_list))

Number of unique words in all documents: 8626
