In [11]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import sent_tokenize
import nltk

In [12]:
def setup_nltk_data(custom_nltk_path=None):
    """Download the NLTK sentence-tokenizer data and register where it lives.

    Downloads the ``punkt`` and ``punkt_tab`` packages into a directory and
    makes sure that directory is on ``nltk.data.path``.

    Args:
        custom_nltk_path: Directory to download into. When omitted, defaults to
            ``<sys.prefix>/nltk_data``, i.e. the ``nltk_data`` folder inside the
            active Python environment (for a virtual environment this is the
            environment's own root), so no machine-specific absolute path is
            baked into the notebook. Pass an explicit directory if the
            environment root is not writable.
    """
    import sys  # local import: only needed to locate the active environment

    if custom_nltk_path is None:
        custom_nltk_path = os.path.join(sys.prefix, "nltk_data")

    os.makedirs(custom_nltk_path, exist_ok=True)
    for package in ('punkt', 'punkt_tab'):
        nltk.download(package, download_dir=custom_nltk_path)

    # Register the root nltk_data directory (not the tokenizers/ subfolder);
    # the membership check keeps re-runs of this cell from adding duplicates.
    if custom_nltk_path not in nltk.data.path:
        nltk.data.path.append(custom_nltk_path)


# Fetch the punkt tokenizer data up front; sent_tokenize is used in the cells below.
setup_nltk_data()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aadhi\Desktop\University 4th Year\Statistical
[nltk_data]     NLP\SNLP_Project\.venv\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\aadhi\Desktop\University 4th Year\Statistical
[nltk_data]     NLP\SNLP_Project\.venv\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [13]:
# Load the sentence-embedding model once; remove_similar_sentences reads it as a module-level global.
model = SentenceTransformer('all-MiniLM-L6-v2')

def remove_similar_sentences(context, needle, threshold=0.8):
    """Drop the sentences of ``context`` that are semantically close to ``needle``.

    The context is split into sentences with ``sent_tokenize``. The needle and
    every sentence are embedded with the module-level ``model`` and compared by
    cosine similarity.

    Args:
        context: Text to filter.
        needle: Reference text; sentences similar to it are removed.
        threshold: A sentence is removed when its cosine similarity to
            ``needle`` is greater than or equal to this value.

    Returns:
        A ``(cleaned_context, removed)`` tuple: the kept sentences joined with
        single spaces, and the list of removed sentences in their original
        order. Empty or whitespace-only input yields ``('', [])``.
    """
    # Nothing to compare: return early so an empty batch never reaches the encoder.
    if not context or not context.strip():
        return '', []

    sentences = sent_tokenize(context)
    if not sentences:
        return '', []

    needle_embedding = model.encode(needle, convert_to_tensor=True)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    # cos_sim returns a (1, n_sentences) matrix here; take its single row.
    cosine_scores = util.cos_sim(needle_embedding, sentence_embeddings)[0]

    removed = []
    kept = []
    for sent, score in zip(sentences, cosine_scores):
        bucket = removed if score >= threshold else kept
        bucket.append(sent)

    return ' '.join(kept), removed

def clean_files(df, column_name):
    """Remove needle-like sentences from every context file referenced in ``df``.

    For each row, the file named in ``row[column_name]`` is read, passed through
    ``remove_similar_sentences`` together with ``row['needle']``, and then
    overwritten in place with the cleaned text. Removed sentences are printed.

    Args:
        df: DataFrame with a ``'needle'`` column and a ``column_name`` column
            holding file names.
        column_name: Column containing the file names. ``'context_relevant'``
            resolves files under ``../haystack/relevant``; any other value
            resolves them under ``../haystack/irrelevant``.

    Rows whose file name or needle is missing (NaN/None) and rows whose file
    does not exist are reported and skipped rather than aborting the whole run.
    """
    # The target directory depends only on the column, so resolve it once.
    subdir = "relevant" if column_name == 'context_relevant' else "irrelevant"
    context_dir = os.path.join("..", "haystack", subdir)

    for _, row in df.iterrows():
        file_name = row[column_name]
        needle = row['needle']

        # A missing value can neither be joined into a path nor be embedded.
        if pd.isna(file_name) or pd.isna(needle):
            print(f"Skipping row with missing '{column_name}' or 'needle' value")
            continue

        file_path = os.path.join(context_dir, file_name)

        # Check file exists
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        # Read the context from file
        with open(file_path, 'r', encoding='utf-8') as f:
            context = f.read()

        # Clean context
        cleaned_context, removed_sentences = remove_similar_sentences(context, needle)

        # Print removed sentences
        print(f"\n🧹 File: {file_path}")
        for s in removed_sentences:
            print(f"Removed: {s}")

        # Overwrite the file with cleaned context
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_context)


In [14]:
# Load the dataset spreadsheet; clean_files reads the 'needle' column plus the
# context column it is given ('context_relevant' / 'context_irrelevant') from it.
df = pd.read_excel("FreshQADataset_with_misleading.xlsx")

In [15]:
# NOTE: clean_files overwrites the haystack files on disk in place, so re-running
# this cell operates on text that has already been cleaned.
clean_files(df, 'context_relevant')
clean_files(df, 'context_irrelevant')


🧹 File: ..\haystack\relevant\43.txt

🧹 File: ..\haystack\relevant\44.txt

🧹 File: ..\haystack\relevant\92.txt

🧹 File: ..\haystack\relevant\95.txt
Removed: No candidate received an outright majority of the vote.

🧹 File: ..\haystack\relevant\96.txt

🧹 File: ..\haystack\relevant\121.txt

🧹 File: ..\haystack\relevant\122.txt

🧹 File: ..\haystack\relevant\123.txt

🧹 File: ..\haystack\relevant\124.txt

🧹 File: ..\haystack\relevant\155.txt

🧹 File: ..\haystack\relevant\156.txt

🧹 File: ..\haystack\relevant\157.txt

🧹 File: ..\haystack\relevant\160.txt

🧹 File: ..\haystack\relevant\162.txt

🧹 File: ..\haystack\relevant\163.txt

🧹 File: ..\haystack\relevant\164.txt

🧹 File: ..\haystack\relevant\165.txt

🧹 File: ..\haystack\relevant\166.txt

🧹 File: ..\haystack\relevant\167.txt

🧹 File: ..\haystack\relevant\168.txt

🧹 File: ..\haystack\relevant\170.txt

🧹 File: ..\haystack\relevant\172.txt

🧹 File: ..\haystack\relevant\173.txt

🧹 File: ..\haystack\relevant\175.txt

🧹 File: ..\haystack\relevan

In [16]:
# Export the DataFrame rows as a JSON array of records; force_ascii=False keeps
# non-ASCII characters as-is, and the file is written as UTF-8.
# NOTE(review): the clean_files function visible in this notebook only reads from
# df, so this looks like an export of the spreadsheet rows as loaded — confirm
# nothing else is expected here.
json_output = df.to_json(orient="records", indent=4, force_ascii=False)
with open("context.json", "w", encoding="utf-8") as f:
    f.write(json_output)