In [8]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

import re
import numpy as np
import os

from collections import Counter
import math

nltk.download('stopwords')
from nltk.corpus import stopwords as sw
import pandas as pd

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yfr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yfr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
from nltk.corpus import stopwords

class TextProcessor:
    def __init__(self, contraction_mapping):
        self.lemmatizer = WordNetLemmatizer()
        self.contraction_mapping = contraction_mapping
        self.stop_words = set(stopwords.words("english"))

    def preprocess_text(self, text):
        if text is None:
            # 处理文本为 None 的情况，例如返回一个空字符串
            return ""

        text = self._to_lower(text)
        text = self._expand_contractions(text)
        text = self._remove_punctuation(text)
        tokens = self._tokenize(text)
        tokens = self._lemmatize_tokens(tokens)
        tokens = self._remove_stop_words(tokens)
        return tokens

    def _to_lower(self, text):
        return text.lower()

    def _expand_contractions(self, text):
        for word, new_word in self.contraction_mapping.items():
            text = text.replace(word, new_word)
        return text

    def _remove_punctuation(self, text):
        return re.sub(r'[^\w\s]', '', text)

    def _tokenize(self, text):
        return word_tokenize(text)
    
    def _lemmatize_tokens(self, tokens):
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def _remove_stop_words(self, tokens):
        return [token for token in tokens if token not in self.stop_words]


In [14]:
import pickle
import re

class Tender:
    def __init__(self, reference):
        self.reference = reference
        self.file_map = {}
        
    def clean_text(self, text):
    # convert from binary string if needed
        if type(text) == bytes:
            text = text.decode("utf-8")
        text = re.sub("[^a-zA-z0-9.,]", " ", text)
        text = re.sub("\\\\", " ", text) 
        text = re.sub("\s+", " ", text)
        text = re.sub("\.+", ".", text)
        return text   
    
    def add(self, file_name, content):
        if content == None:
            print(f"Warning: None content for ref:{self.reference}, fname:{file_name}")
        else:
            content = self.clean_text(content)
            
        if file_name in self.file_map:
            # hopefully wont happen
            print(f"Warning: duplicate file name added for ref:{self.reference} fname:{file_name}")
        else:
            self.file_map[file_name] = content
    
    def save(self):
        with open(f"{self.reference}.pickle", 'wb') as file_handle:
            pickle.dump(self.file_map, file_handle, protocol=pickle.HIGHEST_PROTOCOL)
           
    @staticmethod
    def __correct_handle(reference):
        if ".pickle" in reference: # assume its a fpath, dont change
            return reference
        else: # try ref.pickle
            return f"{reference}.pickle"
        
    @staticmethod
    def exists(reference):
        return os.path.exists(Tender._Tender__correct_handle(reference))
            
    @staticmethod
    def load(reference):
        if Tender.exists(reference):
            with open(Tender._Tender__correct_handle(reference), 'rb') as file_handle:
                t = Tender(reference)
                t.file_map = pickle.load(file_handle)
                return t
        else:
            return None

In [11]:
directory_path = "C:/Users/yfr/Downloads/Capstone/data/tender_raw"
os.chdir(directory_path)

In [26]:
# list top 50 common words
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# read "all_file_content.txt" 
with open("all_file_content.txt", "r", encoding="utf-8") as input_file:
    text_data = input_file.readlines()


count_vectorizer = CountVectorizer(
    max_features=50,  
    stop_words="english",  
    token_pattern=r"(?u)\b\w\w+\b", 
)


word_counts = count_vectorizer.fit_transform(text_data)


feature_names = count_vectorizer.get_feature_names_out()


word_counts_df = pd.DataFrame(data=word_counts.toarray(), columns=feature_names)


top_50_common_words = word_counts_df.sum(axis=0).sort_values(ascending=False).head(50)

Top 50 Common Words:


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

file_names = os.listdir(directory_path)

def calculate_tfidf(file_names, top_50_common_words):
    text_processor = TextProcessor(contraction_mapping={})
    text_data_processed = []

    for file_name in file_names:
        if file_name.endswith('.txt'):
            with open(file_name, "r", encoding="utf-8") as txt_file:
                content = txt_file.read()
                
                # Text preprocessing
                preprocessed_content = " ".join(text_processor.preprocess_text(content))
                
                # Remove common words
                for common_word in top_50_common_words.index:
                    preprocessed_content = preprocessed_content.replace(common_word, "")
                
                text_data_processed.append(preprocessed_content)

    # Combine into one corpus
    corpus = "\n".join(text_data_processed)

    tfidf_vectorizer = TfidfVectorizer(max_features=30)

    tfidf_matrix = tfidf_vectorizer.fit_transform([corpus])

    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Save TF-IDF values
    tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)

    return tfidf_df

In [35]:
result_df = calculate_tfidf(file_names, top_50_common_words)
print("Top 30 TF-IDF Terms:")
print(result_df)

Top 30 TF-IDF Terms:
       area  condition     cost    detail  document      door   drawing  \
0  0.171187   0.173271  0.12886  0.169471  0.145646  0.151471  0.214484   

         er   finish      form  ...   product  requirement  schedule  \
0  0.202853  0.13389  0.129162  ...  0.140816     0.344155  0.142554   

   standard       sub   surface    system       ter      test    within  
0  0.163235  0.168438  0.164832  0.306657  0.145444  0.133112  0.153049  

[1 rows x 30 columns]
