Library Imports


In [1]:
import os
import re
import nltk
import numpy as np
import pandas as pd
import openpyxl # type: ignore
from nltk.stem import PorterStemmer

`DATA_URL` contains the file path to the Cranfield collection documents.


In [2]:
DATA_URL  = "../data/documents/Cranfield collection"

loads stopwords from a file


In [3]:
def load_stopwords(file_path):
    try:
        with open(file_path, "r") as stop_file:
            return stop_file.read().split(",")
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []

stopwords_file_path = "../data/stopword/words.txt"
stopwords = load_stopwords(stopwords_file_path)


### Tokenization and Stemming

tokenizes the input text, removes stopwords, and then stems each token.
Returns:
A string containing the stemmed tokens.


In [4]:
import re
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
def tokenization_and_stemming(text):
    tokens = per_tokenize(text)
    filtered_tokens = remove_stopwords(tokens)
    stems = [stemmer.stem(token) for token in filtered_tokens]
    return " ".join(stems)

def per_tokenize(text):
    text = re.sub(r"[^\w\s]", " ", text)
    # text = ''.join(char if char.isalpha() else ' ' for char in text)
    text = text.lower()
    tokens = text.split()
    return tokens

def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if token not in stopwords]
    return filtered_tokens


parses text documents located in the specified directory.


In [5]:
def parse_text(directory=DATA_URL):
    documents = []
    for filename in os.listdir(directory):
        if not filename.startswith("cran.all") or filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), "r") as file:
            text = file.read()
            documents = text.split(".I")
    return documents[1:]
documents = parse_text()
# documents.remove(documents[0])

stem the data and then tokenize the first document


In [6]:
stem_data = []
for document in documents:
    stem_data.append(tokenization_and_stemming(document))

creates a list of word count dictionaries from a list of stemmed documents.


In [7]:
def create_word_count_list():
    word_count_list = []
    for document in stem_data:
        word_count = {}
        tokens = document.split()
        for token in tokens:
            token = token.strip('.,?!";:')
            word_count[token] = word_count.get(token, 0) + 1
        if word_count:
            del word_count[next(iter(word_count))]
        word_count_list.append(word_count)
    return word_count_list

takes a list of word count tuples and returns a set of unique words.


In [8]:
def unique_words(word_count_list):
    unique_words = set()
    for word_count in word_count_list:
        unique_words.update(word_count.keys())
    return unique_words

creates a word count matrix from a list of word count dictionaries.


In [9]:
# def write_to_excel(word_count_list, file_name):
#         wb = openpyxl.Workbook()
#         sheet = wb.active
#         sheet.title = "Word Count Data"
#         all_words = set()
#         for word_count in word_count_list:
#             all_words.update(word_count.keys())

#         all_words = sorted(all_words)
#         sheet.cell(row=1, column=1).value = Words
#         for col, document in enumerate(range(2, len(word_count_list) + 2), start=2):
#             sheet.cell(row=1, column=col).value = f"Document {document - 1}"
#         for row, word in enumerate(all_words, start=2):
#             sheet.cell(row=row, column=1).value = word
#             for col, word_count in enumerate(word_count_list, start=2):
#                 count = word_count.get(word, 0)  # If word not found, default to 0
#                 sheet.cell(row=row, column=col).value = count

#         wb.save(file_name)


In [10]:
def generate_matrix(word_doc_list):
    # Create a set of all unique words across all documents
    all_words = set(unique_words(word_doc_list))
    # Determine the dimensions of the matrix
    num_rows = len(all_words) + 1  # +1 for the header row
    num_cols = len(word_doc_list) + 1  # +1 for the "Words" column
    # Create the matrix
    matrix = [[0] * num_cols for _ in range(num_rows)]
    # Populate the header row
    matrix[0][0] = "Words"
    for col in range(1, num_cols):
        matrix[0][col] = f"Document {col}"
    # Populate the word counts
    for row, word in enumerate(all_words, start=1):
        matrix[row][0] = word
        for col, word_count in enumerate(word_doc_list, start=1):
            count = word_count.get(word, 0)
            matrix[row][col] = count
    return matrix

### Generating Word Count Matrix and DataFrame

generate a word count matrix and create a DataFrame from it.


In [11]:
word_count_list = create_word_count_list()
unique_words_list = unique_words(word_count_list)
word_tf_matrix = generate_matrix(word_count_list)
print("Number of unique words in all documents:", len(unique_words_list))
df = pd.DataFrame(word_tf_matrix[1:], columns=word_tf_matrix[0])
filtered_df = df[df['Words'].str.contains('experiment')]
# filtered_df.to_excel("data.xlsx", index=False)
filtered_df

Number of unique words in all documents: 6315


Unnamed: 0,Words,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7,Document 8,Document 9,...,Document 1391,Document 1392,Document 1393,Document 1394,Document 1395,Document 1396,Document 1397,Document 1398,Document 1399,Document 1400
1595,experiment,3,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0,0


In [12]:
import math

def calculate_log_tf(tf):
    if tf == 0:
        return 0  # Logarithm of 0 is undefined
    else:
        return 1 + math.log(tf)


In [13]:
for i in range(1,len(word_tf_matrix)):
    for j in range(1,len(word_tf_matrix[i])):
        tf = word_tf_matrix[i][j]
        word_tf_matrix[i][j] = calculate_log_tf(tf)