In [None]:
pip install pandas openpyxl

In [1]:
#1 Preprocessing

import json
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import pandas as pd

# Download NLTK data files (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')


def preprocess(text):
    """
    Preprocess the input text by tokenizing, removing non-alphanumeric tokens, 
    converting to lowercase, removing stopwords, and stemming.
    """
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove non-alphanumeric and convert to lowercase
    tokens = [re.sub(r'\W+', '', token).lower() for token in tokens if token.isalnum()]
    
    # Stopword removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    return tokens

# Read and preprocess documents
corpus = {}
try:
    with open("corpus.jsonl", "r") as f:
        for line in f:
            # Parse the JSON line
            doc = json.loads(line)
            
            # Extract document ID and text (instead of abstract)
            if '_id' in doc and 'text' in doc:
                doc_id = doc['_id']  # Use '_id' as the document identifier
                text = doc['text']  # Use 'text' as the content to preprocess
                
                # Preprocess and store the result
                corpus[doc_id] = preprocess(text)
            else:
                print(f"Skipping document due to missing keys: {doc}")
except FileNotFoundError:
    print("Error: The file 'corpus.jsonl' was not found.")
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")

# Convert the processed corpus into a list of dictionaries for easy saving into an Excel file
data = []
for doc_id, tokens in corpus.items():
    data.append({"Document ID": doc_id, "Tokens": " ".join(tokens)})

# Create a DataFrame
df = pd.DataFrame(data)

# Save to Excel
excel_filename = "preprocessed_corpus.xlsx"
df.to_excel(excel_filename, index=False)

print(f"Preprocessed data has been saved to {excel_filename}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Anas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Preprocessed data has been saved to preprocessed_corpus.xlsx


In [2]:
#2 Inverted indexing
import json
from collections import defaultdict

def invertIndex(corpus):
    
    inverted_index = defaultdict(set)  
    
    # Iterate over corpus to add each word to set
    for id, tokens in corpus.items():
        for token in tokens:
            inverted_index[token].add(id)
    
    return inverted_index


inverted_index = invertIndex(corpus)


# Output 
json_data = []
for word, doc_ids in inverted_index.items():
    print(f"{word}: {sorted(doc_ids)}")
    json_data.append({
        "word": word,
        "ids": ", ".join(map(str, sorted(doc_ids)))
    })

with open('inverted_index.json', 'w') as f:
    json.dump(json_data, f, indent=4)


alter: ['10071552', '10190462', '10314816', '1032372', '10354110', '10485142', '10559501', '10641715', '10697096', '10698739', '1084345', '10883736', '10906636', '11117679', '11156883', '11181416', '11271123', '11289247', '11344428', '11359243', '1156322', '11569583', '11578459', '11581157', '11674288', '11861374', '11992632', '12009265', '12039953', '1281769', '12903921', '13189693', '13256155', '13293033', '13400643', '13441537', '13466622', '13515165', '13573143', '1371440', '13765757', '1383826', '13906892', '14180565', '14188138', '1428830', '14311986', '14367469', '14402338', '14446279', '14475235', '1449692', '14501880', '14610165', '146653163', '1472815', '14782049', '14803797', '14848619', '14923462', '14924526', '15215393', '15425958', '15435343', '15521377', '15535511', '1554348', '15669393', '15685921', '15721252', '15727984', '1590744', '15975146', '16086778', '1617327', '16201748', '16389141', '16527698', '16546131', '167944455', '17123316', '17150648', '1727493', '174506

In [33]:
import json
import math
from collections import defaultdict
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

RUN_TAG = "run_name"

def load_inverted_index(filename):
    with open(filename, 'r') as file:
        data = json.load(file)
    inverted_index = {}
    for entry in data:
        inverted_index[entry["word"]] = [int(doc.strip()) for doc in entry["ids"].split(",") if doc.strip().isdigit()]
    return inverted_index

def load_queries(filename):
    queries = []
    with open(filename, 'r') as file:
        for line in file:
            entry = json.loads(line.strip())
            if int(entry["_id"]) % 2 == 1:
                queries.append((entry["_id"], entry["text"].lower()))
    return queries

def load_corpus(filename):
    df = pd.read_excel(filename, engine='openpyxl')
    corpus = {str(row["Document ID"]): row["Tokens"] for _, row in df.iterrows()}  # Convert IDs to strings
    return corpus

def load_test_queries(filename):
    test_df = pd.read_csv(filename, sep="\t")
    test_queries = test_df[["query-id", "corpus-id"]].astype(str).values.tolist()
    return test_queries

def compute_cosine_similarity(query, inverted_index, corpus):
    relevant_docs = set()
    for term in query.split():
        if term in inverted_index:
            relevant_docs.update(map(str, inverted_index[term]))
    
    relevant_docs = list(relevant_docs)
    if not relevant_docs:
        return []
    
    doc_texts = [corpus[doc_id] for doc_id in relevant_docs if doc_id in corpus]
    doc_ids = [doc_id for doc_id in relevant_docs if doc_id in corpus]
    
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([query] + doc_texts)
    query_vector = vectors[0]
    doc_vectors = vectors[1:]
    similarities = cosine_similarity(query_vector, doc_vectors)[0]
    
    ranked_results = sorted(zip(doc_ids, similarities), key=lambda x: x[1], reverse=True)[:100]
    return ranked_results

def main():
    inverted_index = load_inverted_index("inverted_index.json")
    queries = load_queries("queries.jsonl")
    corpus = load_corpus("preprocessed_corpus.xlsx")
    test_queries = load_test_queries(r"qrels\test.tsv")

    with open("ranked_results.txt", "w") as ranked_file:
        for query_id, query in queries:
            ranked_docs = compute_cosine_similarity(query, inverted_index, corpus)
            for rank, (doc_id, score) in enumerate(ranked_docs, start=1):
                ranked_file.write(f"{query_id} Q0 {doc_id} {rank} {score:.4f} {RUN_TAG}\n")
    print("Ranked results saved to 'ranked_results.txt'")

    with open("Results", "w") as results_file:
        for query_id, doc_id in test_queries:
            query_text = next((q_text for q_id, q_text in queries if q_id == query_id), None)
            if query_text:
                ranked_docs = compute_cosine_similarity(query_text, inverted_index, corpus)
                for rank, (retrieved_doc_id, score) in enumerate(ranked_docs, start=1):
                    results_file.write(f"{query_id} Q0 {retrieved_doc_id} {rank} {score:.4f} {RUN_TAG}\n")
    print("Results saved to file 'Results'")

if __name__ == "__main__":
    main()


Ranked results saved to 'ranked_results.txt'
Results saved to file 'Results'
