In [12]:
import glob
import pickle
import os
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from processquery import process


In [13]:
def retrieve_similar_documents(query_id, query, qrels, similarity_scores, tfidf_matrix, docs, vectorizer,
                               threshold=0, min_returned=5):
    """Rank documents by similarity score and evaluate the top results against the qrels.

    Documents are visited from the highest to the lowest score.  Iteration stops
    as soon as at least ``min_returned`` documents have been returned and the
    running precision is ``>= threshold`` (with the defaults this means "stop
    after five documents").

    Parameters
    ----------
    query_id : hashable
        Key into ``qrels``.  A missing key raises ``KeyError``.
    query, tfidf_matrix, vectorizer
        Accepted for call compatibility; not used by this function.
    qrels : dict
        Maps a query ID to a container of judged document IDs (only membership
        and ``len`` are used).
    similarity_scores : array-like
        One score per entry of ``docs``, in the iteration order of ``docs``.
    docs : dict
        Maps document ID to document content; only the key order is used.
    threshold : float, optional
        Minimum running precision required to stop early.
    min_returned : int, optional
        Minimum number of documents returned before stopping early.

    Returns
    -------
    tuple
        ``(num_relevant, num_returned, precision, Total, relevant_documents)``
        where ``precision`` is the precision at the last returned rank, ``Total``
        is the number of judged documents for the query, and
        ``relevant_documents`` lists the returned IDs found in the qrels.
        With no scores at all the result is ``(0, 0, 0.0, Total, [])``.
    """
    # Same ordering as before (argsort over a 1 x n row, reversed); np.asarray
    # additionally lets plain lists and scalars through.
    scores = np.asarray(similarity_scores).reshape(1, -1)
    sorted_indices = np.argsort(scores, axis=1)[0, ::-1]

    judged = qrels[query_id]
    # Built once: rebuilding the key list for every rank made the loop quadratic.
    doc_ids = list(docs.keys())

    num_relevant = 0
    precision = 0.0  # defined up front so that empty input no longer raises
    returned_docs = []

    for rank, idx in enumerate(sorted_indices, start=1):
        doc_id = doc_ids[idx]
        # NOTE(review): read_qrels keeps only the part of a document ID before
        # the first "_", while the notebook output shows docs keys such as
        # "1000063_5" -- confirm both sides use the same ID form, otherwise
        # nothing is ever counted as relevant here.
        if doc_id in judged:
            num_relevant += 1
        precision = num_relevant / rank
        returned_docs.append(doc_id)
        if precision >= threshold and rank >= min_returned:
            break

    num_returned = len(returned_docs)
    relevant_documents = [doc_id for doc_id in returned_docs if doc_id in judged]
    Total = len(judged)

    return num_relevant, num_returned, precision, Total, relevant_documents

def read_docs_from_folder(folder_path):
    """Read every regular file in ``folder_path`` into a ``{doc_id: text}`` dict.

    The document ID is the file name without its extension, so two files that
    differ only by extension share an ID and the later one wins.  Files are
    decoded as UTF-8 with undecodable bytes dropped.  Sub-directories are
    skipped.  A file that cannot be opened or read is reported on stdout and
    skipped instead of aborting the whole load.

    Parameters
    ----------
    folder_path : str or path-like
        Directory to scan (not recursive).

    Returns
    -------
    dict
        Document ID -> file content, in sorted file-name order.
    """
    docs = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the document
    # order (and anything indexed by position in this dict) reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                doc_text = file.read()
        except OSError as e:
            # errors='ignore' means decoding can never raise UnicodeDecodeError,
            # so the previous handler was dead code; I/O failures are what can
            # actually happen here.
            print(f"Error reading {file_name}: {e}")
            continue
        doc_id = os.path.splitext(file_name)[0]  # file name without extension
        docs[doc_id] = doc_text
    return docs


In [14]:
# def calculate_tfidf_matrix(docs):
#     vectorizer = TfidfVectorizer()
#     tfidf_matrix = vectorizer.fit_transform(docs.values())
#     return tfidf_matrix, vectorizer

In [15]:
def read_qrels(qrels_file_paths):
    """Load relevance judgments from one or more qrels files.

    Each non-blank line must consist of four whitespace-separated fields:
    ``query_id  <ignored>  doc_id  relevance``.  Spaces, tabs and runs of
    either are accepted as separators.  Lines with a different field count or a
    non-integer query ID are reported on stdout and skipped.

    Only the part of ``doc_id`` before the first ``_`` is kept as the key.

    Parameters
    ----------
    qrels_file_paths : iterable of paths, or a single path
        Files to read.  A single ``str`` / path-like is treated as one file.

    Returns
    -------
    dict
        ``{query_id (int): {doc_id (str): relevance (str)}}``.  When the same
        (query, document) pair occurs more than once the last value wins.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(qrels_file_paths, (str, os.PathLike)):
        qrels_file_paths = [qrels_file_paths]

    qrels = {}
    for qrels_file_path in qrels_file_paths:
        with open(qrels_file_path, 'r') as qrels_file:
            for line in qrels_file:
                # split() with no argument handles tabs and repeated spaces,
                # which split(' ') rejected as malformed.
                line_parts = line.split()
                if not line_parts:
                    continue  # blank line: nothing to report
                if len(line_parts) != 4:
                    print("Invalid line format:", line)
                    continue

                raw_query_id, _, doc_id, relevance = line_parts
                try:
                    query_id = int(raw_query_id)
                except ValueError:
                    # e.g. a header row; treat like any other malformed line
                    print("Invalid line format:", line)
                    continue

                base_doc_id = doc_id.split('_')[0]  # drop the "_<suffix>" part
                qrels.setdefault(query_id, {})[base_doc_id] = relevance

    return qrels


def read_queries(queries_file_path):
    """Parse a queries file into a ``{query_id: processed_query}`` dict.

    Every line is expected to look like ``<integer id><whitespace><query text>``.
    Lines that do not contain both parts are ignored.  The query text is passed
    through ``process`` (imported from ``processquery``) before being stored.
    """
    parsed = {}
    with open(queries_file_path, 'r') as handle:
        for raw_line in handle:
            # Split once on the first run of whitespace: id, then the rest.
            fields = raw_line.strip().split(None, 1)
            if len(fields) >= 2:
                # Convert the ID before processing the text, as the original did.
                key = int(fields[0])
                parsed[key] = process(fields[1])

    return parsed

In [16]:
def save_relevant_num(relevant_num_file_path, query_id, query, num_relevant, num_returned, precision, Total, relevant_documents, reciprocal_ranks=None):
    """Append one query's evaluation summary to a text report.

    The record consists of six ``label: value`` lines, followed by one line per
    relevant document ID and a blank separator line.  The file is created if it
    does not exist; existing content is kept.

    Parameters
    ----------
    relevant_num_file_path : str or path-like
        Report file to append to.
    query_id, query, num_relevant, num_returned, precision, Total
        Values written verbatim into the record.
    relevant_documents : iterable
        Document IDs, written one per line.
    reciprocal_ranks : optional
        Not written to the file; returned unchanged.  Defaults to ``None`` so
        that callers that do not track reciprocal ranks can omit it.

    Returns
    -------
    The ``reciprocal_ranks`` argument, untouched.
    """
    summary_lines = [
        f"Query ID: {query_id}",
        f"Query: {query}",
        f"Relevant Documents: {num_relevant}",
        f"Total Returned Documents: {num_returned}",
        f"Precision: {precision}",
        f"Total: {Total}",
    ]
    # Append-only: nothing is read back, so 'a' is sufficient.
    with open(relevant_num_file_path, 'a') as relevant_num_file:
        relevant_num_file.write("\n".join(summary_lines) + "\n")
        relevant_num_file.write("\n".join(str(doc_id) for doc_id in relevant_documents))
        relevant_num_file.write("\n\n")

    return reciprocal_ranks

In [17]:
# Path of the text report that save_relevant_num appends per-query results to.
# Nothing is created here; the file is opened in append mode when results are saved.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
relevant_num_file_path = r"C:\Users\user\Documents\IRSystem1\cluster_relevant_num_queries.txt"

In [18]:
# Read relevance judgments (qrels).
# glob.glob returns a list of matching paths; the pattern has no wildcard, so
# the list holds this one path if the file exists and is empty otherwise -- in
# which case read_qrels returns an empty dict without raising.
qrels_file_paths = glob.glob(r"C:\Users\user\Documents\IRSystem1\qrel1.txt")
qrels = read_qrels(qrels_file_paths)


In [19]:
# Read queries from file.
# read_queries returns {query_id (int): query text after process()}.
queries_file_path = r"C:\Users\user\Documents\IRSystem1\queries1.txt"
queries = read_queries(queries_file_path)

In [20]:
# Load the document collection from a pickle.
# `docs` is used as a mapping further down (docs.keys(), docs.items()).
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open(r'C:\Users\user\Documents\IRSystem1\docs12.pkl', 'rb') as file:
    docs = pickle.load(file)

In [21]:
# Convert the documents into their TF-IDF representation using TfidfVectorizer.
# The in-notebook computation is disabled (kept below for reference); the
# pickled (tfidf_matrix, vectorizer) pair is loaded instead.
# tfidf_matrix, vectorizer = calculate_tfidf_matrix(docs)
# NOTE(review): presumably row k of tfidf_matrix corresponds to the k-th key of
# `docs` -- later cells index both by position; confirm the two pickles match.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open(r'C:\Users\user\Documents\IRSystem1\tfidf_matrix_new.pkl', 'rb') as file:
    tfidf_matrix, vectorizer = pickle.load(file)

In [22]:
# Number of clusters
num_clusters = 4

# Fixed seed: KMeans initialisation is random, so without it the cluster
# assignments (and every per-cluster result below) change from run to run.
kmeans_random_state = 42

# Create the KMeans model with an explicit n_init parameter and fit it on the
# TF-IDF matrix; labels_ and cluster_centers_ are used by the retrieval loop.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=kmeans_random_state)
kmeans.fit(tfidf_matrix)


In [None]:
# Cluster-based retrieval and evaluation.
#
# For every query that has relevance judgments:
#   1. compare the query with every cluster centre and print the similarity;
#   2. print each document of each cluster with its similarity to the query;
#   3. rank the documents of the cluster whose centre is most similar to the
#      query, evaluate that ranking against the qrels and append the result
#      to the report file.
#
# Previously step 3 ran once per *document* with a single scalar score, so the
# ranking always consisted of the first key of `docs`, only the last call's
# result was kept, and the save call was missing its final argument.

# Document IDs by position.  Built once: list(docs.keys())[j] inside the loops
# rebuilt the whole key list for every document.
# NOTE(review): assumes row k of tfidf_matrix belongs to the k-th key of docs.
all_doc_ids = list(docs.keys())

for query_id, query in queries.items():
    print("Query ID:", query_id)
    print("Query:", query)

    if query_id not in qrels:
        print('*********************************************')
        print("No relevance judgments found for this query.")
        continue

    query_vector = vectorizer.transform([query])

    # One similarity per cluster centre, computed in a single call.
    center_similarities = cosine_similarity(query_vector, kmeans.cluster_centers_)[0]

    # (centre similarity, document row indices, per-document scores, sub-matrix)
    best_cluster = None

    for i, similarity_score in enumerate(center_similarities):
        # Print cluster ID and similarity score
        print("Cluster ID: {}".format(i))
        print("Similarity Score between Query and Cluster_center-{}: {}".format(i, similarity_score))

        # Row indices (ascending) of the documents assigned to this cluster
        doc_indices = np.where(kmeans.labels_ == i)[0]
        if doc_indices.size == 0:
            continue

        cluster_matrix = tfidf_matrix[doc_indices]
        # Similarity of every document in the cluster to the query, in one call
        cluster_scores = cosine_similarity(cluster_matrix, query_vector)[:, 0]

        for j, doc_similarity_score in zip(doc_indices, cluster_scores):
            doc_id = all_doc_ids[j]
            doc_content = docs[doc_id]
            # Print document ID, similarity score, and document content
            print("Document ID in Cluster-{}, Document-{}: {}".format(i, j, doc_id))
            print("Similarity Score between Document-{} in Cluster-{} and the query: {}".format(j, i, doc_similarity_score))
            print("Document Content in Cluster-{}, Document-{}: {}".format(i, j, doc_content))
            print("\n")

        if best_cluster is None or similarity_score > best_cluster[0]:
            best_cluster = (similarity_score, doc_indices, cluster_scores, cluster_matrix)

    if best_cluster is None:
        # No cluster contained any document: nothing to rank or save.
        continue

    _, best_indices, best_scores, best_matrix = best_cluster
    # retrieve_similar_documents maps score positions to the keys of the dict
    # it is given, so hand it only the best cluster's documents, in the same
    # order as best_scores.
    best_cluster_docs = {all_doc_ids[j]: docs[all_doc_ids[j]] for j in best_indices}

    # Retrieve similar documents
    num_relevant, num_returned, precision, Total, relevant_documents = retrieve_similar_documents(
        query_id, query, qrels, best_scores, best_matrix, best_cluster_docs, vectorizer)

    # Save the results (reciprocal ranks are not tracked here, hence None)
    save_relevant_num(relevant_num_file_path, query_id, query, num_relevant, num_returned, precision, Total, relevant_documents, None)


Query ID: 3990512
Query: get concentr someth
Cluster ID: 0
Similarity Score between Query and Cluster_center-0: 0.3054591723609424
Document ID in Cluster-0, Document-14: 1000063_5
Similarity Score between Document-14 in Cluster-0 and the query: 0.04814849990587663
Document Content in Cluster-0, Document-14: complet bull world alway go warm cooler phase right get warm actual cooler long time ago reason vike abl get greenland easili world warmer less ice world might warm littl bit fault noth bad come


Document ID in Cluster-0, Document-33: 1000189_1
Similarity Score between Document-33 in Cluster-0 and the query: 0.028161211406828937
Document Content in Cluster-0, Document-33: co-sign could purchas car buy get loan lend intuit current hold titl would pay loan name place new one name co-sign anoth way reinforc first answer


Document ID in Cluster-0, Document-35: 1000236_1
Similarity Score between Document-35 in Cluster-0 and the query: 0.028686900670819783
Document Content in Cluster-0,