In [19]:
def process_question(text, new_text=None, dictionary=False, temp_dict=None):
    lines = text.strip().split('\n')
    question = lines[0]  # The question line
    choices = lines[1:]  # The choice lines
    
    parsed_choices = []
    for choice in choices:
        parts = choice.split('] ')
        label = parts[0][1:]  # Remove '[' for dictionary keys
        value = parts[1].replace("[ANSWER]", "").strip()
        is_answer = "[ANSWER]" in parts[1]
        parsed_choices.append([label, value, is_answer])
    
    # Identify duplicates
    seen = {}
    duplicates = set()
    for _, value, _ in parsed_choices:
        if value in seen:
            duplicates.add(value)
        else:
            seen[value] = True
    
    # Replace the first non-answer duplicate if new_text is provided
    if new_text:
        for value in duplicates:
            for i, (label, choice_value, is_answer) in enumerate(parsed_choices):
                if value == choice_value and not is_answer:
                    parsed_choices[i][1] = new_text
                    break  # Stop after replacing the first non-answer duplicate for this value

    # Construct the new choices text and optionally create a dictionary
    if dictionary:
        # Initialize or use the provided dictionary
        question_dict = temp_dict if temp_dict is not None else {}
        question_dict["question"] = question
        for label, value, is_answer in parsed_choices:
            if is_answer:
                value += " [ANSWER]"
            question_dict[label] = value
        return question_dict
    else:
        new_choices = [f"[{label}] {value}{' [ANSWER]' if is_answer else ''}" for label, value, is_answer in parsed_choices]
        updated_text = question + "\n" + "\n".join(new_choices)
        return updated_text if new_text else (list(duplicates) if duplicates else [])




In [None]:
# Connect to MongoDB
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Access an environment variable
password = os.getenv('MONGO')

uri = f"mongodb+srv://baderalotaibi3:{password}@cluster0.od393y9.mongodb.net/?retryWrites=true&w=majority"
# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))
db = client["WIKIQUIZ"]
collection = db["WikiQuizEnApi"]
FinalQuestions = db["FinalQuestions"]

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")

except Exception as e:
    print(e)


In [20]:

def removeDublication():
    completed_docs = FinalQuestions.count_documents({})
    All_docs = collection.count_documents({})
    for index,i in enumerate(collection.find().skip(completed_docs)):
        
        percentage_processed = ( index / All_docs) * 100
        print(f"Processed {index+completed_docs} out of {All_docs} documents.")
        print(f"Percentage of documents processed: {percentage_processed:.2f}%")
        question = i['question']
        print("Question")
        print(question)
        response = process_question(question)
        print("Response")
        print(response)
        if not response:
            print("No duplicates found")
            response = process_question(question, dictionary=True, temp_dict=i)
            print(response)
            FinalQuestions.insert_one(i)
        else:
            print("Duplicates found")
            new_question = process_question(question, new_text="None Of The Above", dictionary=True, temp_dict=i)
            print(new_question)
            FinalQuestions.insert_one(new_question)
            

removeDublication()

Processed 475 out of 2440 documents.
Percentage of documents processed: 0.00%
Question
Question: What was the first 3D film seen in HD?
[A] The Diamond Wizard [ANSWER]
[B] Cease Fire
[C] The Little Shop of Horrors
[D] American Honey
process_question took 0.00 ms
Response
[]
No duplicates found
process_question took 0.00 ms
{'_id': ObjectId('65bec6c9b60456adae102c1d'), 'page_title': '3-D Film Preservation Fund', 'summary': 'The 3-D Film Preservation Fund (or 3DFPF) is a 501(c)(3) nonprofit corporation, dedicated to the preservation of stereoscopic motion pictures.  It was formed in 2006 by Jeff Joseph of Sabucat Productions, Robert Furmanek (brother of Ron, and Daniel Symmes of Dimension-3.In September 2006, the 3DFPF hosted the second World 3-D Exposition in Hollywood, California as part of a ten-day festival of 3-D movies, held at Grauman\'s Egyptian Theatre.Along with the favorites of the previous exposition were newly discovered features and shorts, and like the previous Expo, guest