In [None]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import sent_tokenize
import nltk

# sent_tokenize relies on NLTK's Punkt sentence-tokenizer data. Recent NLTK
# releases (3.9+) look the data up under 'punkt_tab', while older releases use
# 'punkt', so fetch both to keep the notebook runnable on either.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

In [None]:
# Sentence-embedding model used by remove_similar_sentences to score similarity.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def remove_similar_sentences(context, needle, threshold=0.8):
    """Remove the sentences of ``context`` that are too similar to ``needle``.

    The context is split into sentences with ``sent_tokenize``; each sentence
    and the needle are embedded with the module-level ``model``, and every
    sentence whose cosine similarity to the needle is at least ``threshold``
    is dropped.

    Args:
        context: Text to clean. ``None``, empty, or whitespace-only text is
            treated as having no sentences.
        needle: Text whose near-duplicates should be removed from ``context``.
        threshold: Minimum cosine similarity (inclusive) at which a sentence
            is removed. Defaults to 0.8.

    Returns:
        A ``(cleaned_context, removed)`` tuple: the kept sentences joined by a
        single space, and the list of removed sentences in original order.
    """
    # With no text there is nothing to score; bail out before encoding, since
    # scoring an empty batch of sentence embeddings can raise.
    if not context or not context.strip():
        return '', []

    sentences = sent_tokenize(context)
    if not sentences:
        return '', []

    needle_embedding = model.encode(needle, convert_to_tensor=True)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    # cos_sim yields a similarity matrix; row 0 belongs to the single needle
    # embedding and holds one score per sentence.
    cosine_scores = util.cos_sim(needle_embedding, sentence_embeddings)[0]

    removed = []
    kept = []
    for sent, score in zip(sentences, cosine_scores):
        target = removed if score >= threshold else kept
        target.append(sent)

    return ' '.join(kept), removed

def clean_files(df, column_name):
    """Strip needle-like sentences from every context file referenced in ``df``.

    For each row, the file named in ``row[column_name]`` is read, passed
    through ``remove_similar_sentences`` together with ``row['needle']``, and
    then overwritten in place with the cleaned text. The sentences removed
    from each file are printed.

    Rows are skipped (with a printed message) when the file name or the needle
    is not a string, when the file does not exist, or when the file is empty.

    Args:
        df: DataFrame providing a ``needle`` column and a ``column_name``
            column of file names.
        column_name: Column holding the file names. ``'context_relevant'``
            resolves files under ``../haystack/relevant``; any other value
            resolves them under ``../haystack/irrelevant``.

    Returns:
        None. The files on disk are modified as a side effect.
    """
    # The directory depends only on the column, so resolve it once up front.
    if column_name == 'context_relevant':
        context_dir = os.path.join("..", "haystack", "relevant")
    else:
        context_dir = os.path.join("..", "haystack", "irrelevant")

    for idx, row in df.iterrows():
        file_name = row[column_name]  # file name relative to context_dir
        needle = row['needle']

        # Missing values (e.g. NaN for a blank cell) are not strings and would
        # make os.path.join or the encoder fail, so skip such rows explicitly.
        if not isinstance(file_name, str):
            print(f"Skipping row {idx}: no file name in '{column_name}'")
            continue
        if not isinstance(needle, str):
            print(f"Skipping row {idx}: no needle")
            continue

        file_path = os.path.join(context_dir, file_name)

        # isfile (rather than exists) also rejects directories, which open()
        # could not read anyway.
        if not os.path.isfile(file_path):
            print(f"File not found: {file_path}")
            continue

        # Read the context from file
        with open(file_path, 'r', encoding='utf-8') as f:
            context = f.read()

        # An empty file has no sentences to compare against the needle.
        if not context.strip():
            print(f"Empty file, nothing to clean: {file_path}")
            continue

        # Clean context
        cleaned_context, removed_sentences = remove_similar_sentences(context, needle)

        # Print removed sentences
        print(f"\n🧹 File: {file_path}")
        for s in removed_sentences:
            print(f"Removed: {s}")

        # Overwrite the file with cleaned context
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_context)


In [None]:
# Spreadsheet whose rows carry the needle and the context file names.
DATASET_PATH = "FreshQADataset_with_misleading.xlsx"
df = pd.read_excel(DATASET_PATH)

In [None]:
# Clean the relevant context files first, then the irrelevant ones.
for context_column in ('context_relevant', 'context_irrelevant'):
    clean_files(df, context_column)