In [25]:
# Imports and downloads
import nltk
# Fetch the NLTK resources this script relies on: the stopword list, the Punkt
# sentence-tokenizer data and WordNet (used by the lemmatizer).
# Recent NLTK releases (3.8.2 and later) look up the "punkt_tab" resource in
# sent_tokenize/word_tokenize, while older releases read "punkt", so both are
# requested. nltk.download() reports a package it cannot fetch and returns
# False instead of raising, so the extra request is harmless on old installs.
for _resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(_resource)

import os, re, string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Constants
# Folder that holds the article corpus; load_documents() reads its .txt files.
FOLDER_PATH = r"C:\Users\Acer\Desktop\articles"
# English stopword list, kept in a set so membership tests are cheap.
STOPWORDS = set(stopwords.words("english"))
# One shared WordNet lemmatizer instance, reused for every token.
LEMMATIZER = WordNetLemmatizer()

# Preprocessing
# Translation table mapping every ASCII punctuation character to a space.
# Built once at import time instead of recompiling a regex on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def preprocess(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps: lowercase, replace ASCII punctuation with spaces, tokenize with
    NLTK's ``word_tokenize``, drop stopwords and single-character tokens, and
    lemmatize what remains.

    Punctuation is replaced by a space rather than deleted. Deleting it fuses
    words that are only separated by punctuation ("cloud-based" would become
    "cloudbased", "end.Next" would become "endnext") and turns contractions
    into tokens the stopword list does not know ("don't" -> "dont"). With a
    space, "don't" splits into "don" and "t", which are removed by the
    stopword and length filters.

    Args:
        text: The raw document or query string.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filters.
    """
    spaced = text.lower().translate(_PUNCT_TO_SPACE)
    tokens = [
        LEMMATIZER.lemmatize(token)
        for token in word_tokenize(spaced)
        if token not in STOPWORDS and len(token) > 1
    ]
    return " ".join(tokens)

# Load documents (ignore query/result files)
def load_documents(folder_path):
    """Read every ``.txt`` article in *folder_path* into memory.

    Files named ``queries.txt``, ``query_results.txt`` and
    ``similarity_results.txt`` are skipped, as are directory entries that are
    not regular files. File names are processed in sorted order so the doc ids
    assigned here are the same on every run and every platform.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A ``(docs, doc_id_to_filename)`` tuple. ``docs`` maps consecutive
        integer doc ids (starting at 0) to file contents, and
        ``doc_id_to_filename`` maps the same ids to the file names.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    ignore_files = {"queries.txt", "query_results.txt", "similarity_results.txt"}
    docs = {}
    doc_id_to_filename = {}

    print(f"Scanning folder: {folder_path}")
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # doc id <-> file assignment reproducible.
    candidates = sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith(".txt") and name not in ignore_files
    )
    for name in candidates:
        path = os.path.join(folder_path, name)
        # A directory can also be called "something.txt"; open() would fail on it.
        if not os.path.isfile(path):
            continue
        doc_id = len(docs)  # ids stay contiguous even when entries are skipped
        with open(path, "r", encoding="utf-8") as f:
            docs[doc_id] = f.read()
        doc_id_to_filename[doc_id] = name
        print(f"Loaded Doc {doc_id}: {name}")
    print(f"Total documents loaded: {len(docs)}\n")
    return docs, doc_id_to_filename

# Extract queries from documents (first sentence of each doc as query)
def generate_queries_from_docs(docs):
    """Build one query per document from the document's first sentence.

    Args:
        docs: Mapping of doc id to raw document text. Only the values are used.

    Returns:
        A list of query strings in the iteration order of ``docs``. Documents
        for which ``sent_tokenize`` returns no sentence contribute no query,
        so the list can be shorter than ``docs``.
    """
    queries = []
    for content in docs.values():
        sentences = sent_tokenize(content)
        if sentences:
            # The sentence tokenizer does not treat a line break as a sentence
            # boundary, so a headline without end punctuation comes back glued
            # to the following sentence with the newline still inside.
            # Collapse all whitespace so each query is a single line.
            queries.append(" ".join(sentences[0].split()))
    print(f"Generated {len(queries)} queries from your documents.\n")
    return queries

# Compute TF-IDF and cosine similarity (console only)
def compute_similarity(docs, queries, doc_id_to_filename):
    """Rank every document against every query and print the rankings.

    Documents are preprocessed and vectorized with TF-IDF. Each query is
    preprocessed the same way, projected into that vector space, and compared
    with all documents by cosine similarity. Results go to the console only.

    Args:
        docs: Mapping of doc id to raw document text.
        queries: Iterable of raw query strings.
        doc_id_to_filename: Mapping of doc id to the file name to display.

    Returns:
        None.
    """
    if not docs:
        # TfidfVectorizer.fit_transform raises ValueError on an empty corpus.
        print("No documents to rank.")
        return

    # Record which doc id sits at each row of the TF-IDF matrix, so the ranking
    # does not depend on the ids happening to be 0..n-1 in insertion order.
    doc_ids = list(docs)
    cleaned_docs = [preprocess(docs[doc_id]) for doc_id in doc_ids]
    vectorizer = TfidfVectorizer()
    doc_vectors = vectorizer.fit_transform(cleaned_docs)

    for query in queries:
        query_vector = vectorizer.transform([preprocess(query)])
        scores = cosine_similarity(query_vector, doc_vectors)[0]
        # sorted() is stable, so equally scored documents keep corpus order.
        ranking = sorted(zip(doc_ids, scores), key=lambda pair: pair[1], reverse=True)

        print(f"\nQuery: {query}")
        print("-"*60)
        for rank, (doc_id, score) in enumerate(ranking, start=1):
            filename = doc_id_to_filename[doc_id]
            print(f"Rank {rank}: {filename}  |  Similarity = {score:.4f}")

# Main
def main():
    """Load the corpus, derive one query per document, and print the rankings."""
    documents, filenames = load_documents(FOLDER_PATH)
    compute_similarity(documents, generate_queries_from_docs(documents), filenames)


# Run the pipeline only when executed as a script or notebook cell, not on import.
if __name__ == "__main__":
    main()


Scanning folder: C:\Users\Acer\Desktop\articles
Loaded Doc 0: 1_weforum_iot.txt
Loaded Doc 1: 2_aws_iaas.txt
Loaded Doc 2: 3_trm_ransomware_2024.txt
Loaded Doc 3: 4_alephium_sharding.txt
Loaded Doc 4: 5_stockholm_climate_action_2030.txt
Loaded Doc 5: 6_cybersecuritydive_cisa.txt
Total documents loaded: 6

Generated 6 queries from your documents.


Query: 6 ways the Internet of Things is improving our lives
It's a small world.
------------------------------------------------------------
Rank 1: 1_weforum_iot.txt  |  Similarity = 0.3007
Rank 2: 2_aws_iaas.txt  |  Similarity = 0.0224
Rank 3: 3_trm_ransomware_2024.txt  |  Similarity = 0.0135
Rank 4: 5_stockholm_climate_action_2030.txt  |  Similarity = 0.0083
Rank 5: 4_alephium_sharding.txt  |  Similarity = 0.0000
Rank 6: 6_cybersecuritydive_cisa.txt  |  Similarity = 0.0000

Query: What is IaaS (Infrastructure as a Service)?
------------------------------------------------------------
Rank 1: 2_aws_iaas.txt  |  Similarity = 0.6136
Rank 2: 6

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
