In [1]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

import re
import numpy as np
import os

from collections import Counter
import math

nltk.download('stopwords')
from nltk.corpus import stopwords as sw

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yfr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yfr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
class TextProcessor:
    def __init__(self, contraction_mapping):
        self.lemmatizer = WordNetLemmatizer()
        self.contraction_mapping = contraction_mapping

    def preprocess_text(self, text):
        text = self._to_lower(text)
        text = self._expand_contractions(text)
        text = self._remove_punctuation(text)
        tokens = self._tokenize(text)
        tokens = self._lemmatize_tokens(tokens)
        return tokens

    def _to_lower(self, text):
        return text.lower()

    def _expand_contractions(self, text):
        for word, new_word in self.contraction_mapping.items():
            text = text.replace(word, new_word)
        return text

    def _remove_punctuation(self, text):
        return re.sub(r'[^\w\s]', '', text)

    def _tokenize(self, text):
        return word_tokenize(text)
    
    def _lemmatize_tokens(self, tokens):
        return [self.lemmatizer.lemmatize(token) for token in tokens]

In [4]:
# list top 50 common words
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

directory_path = "C:/Users/yfr/Downloads/Capstone/data/tender_raw"
os.chdir(directory_path)


# read "all_file_content.txt" 
with open("all_file_content.txt", "r", encoding="utf-8") as input_file:
    text_data = input_file.readlines()


count_vectorizer = CountVectorizer(
    max_features=50,  
    stop_words="english",  
    token_pattern=r"(?u)\b\w\w+\b", 
)


word_counts = count_vectorizer.fit_transform(text_data)


feature_names = count_vectorizer.get_feature_names_out()


word_counts_df = pd.DataFrame(data=word_counts.toarray(), columns=feature_names)


top_50_common_words = word_counts_df.sum(axis=0).sort_values(ascending=False).head(50)
print("Top 50 Common Words:")
print(top_50_common_words)


Top 50 Common Words:
shall             1179313
contractor         967271
contract           675879
general            556152
provide            546267
works              511310
work               496976
tender             492112
site               478219
services           449063
required           440274
requirements       401751
principal          381564
including          340190
equipment          332259
clause             324563
building           317133
10                 314273
following          310297
wa                 297076
superintendent     283813
date               280514
drawings           267129
construction       266744
time               257678
project            257517
tenderer           255049
information        253219
mm                 252430
page               251416
materials          249724
specification      239940
authority          238911
use                236412
type               232379
water              232223
accordance         229518
material         

In [5]:
# remove common words

processed_text_data = []

for line in text_data:
    words = line.split()
    # filter words thats not in common_words
    filtered_words = [word for word in words if word not in top_50_common_words.index]
    processed_line = " ".join(filtered_words)
    processed_text_data.append(processed_line)

# save processed text in "all_file_content.txt"
with open("all_file_content_processed.txt", "w", encoding="utf-8") as output_file:
    output_file.writelines(processed_text_data)


In [7]:
# TF-IDF after common words removed

from sklearn.feature_extraction.text import TfidfVectorizer

with open("all_file_content_processed.txt", "r", encoding="utf-8") as input_file:
    text_data_processed = input_file.readlines()


tfidf_vectorizer_processed = TfidfVectorizer(
    max_features=50,  
    stop_words=None,  # no stop words needed, common words alr removed
    token_pattern=r"(?u)\b\w\w+\b",
)


tfidf_matrix_processed = tfidf_vectorizer_processed.fit_transform(text_data_processed)

feature_names_processed = tfidf_vectorizer_processed.get_feature_names_out()

tfidf_df_processed = pd.DataFrame(data=tfidf_matrix_processed.toarray(), columns=feature_names_processed)

top_50_tfidf_words_processed = tfidf_df_processed.sum(axis=0).sort_values(ascending=False).head(50)
print("Top 50 Words by TF-IDF in Processed Text:")
print(top_50_tfidf_words_processed)


MemoryError: 