In [None]:
# Install the notebook's third-party dependencies. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel, not whichever `pip`
# happens to be first on the shell PATH.
%pip install transformers torch pandas rank-bm25 sentence-transformers playsound gtts pickle-mixin


In [3]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
from rank_bm25 import BM25Okapi
import pickle
from gtts import gTTS
from playsound import playsound
import time

# Helper function for progress updates
def update_checkpoint(message):
    """Print ``message`` to stdout, prefixed with the local time as ``[HH:MM:SS]``."""
    stamp = time.strftime('%H:%M:%S')
    print("[{}] {}".format(stamp, message))


In [4]:
def build_or_load_bm25(dataset_path, cache_path="bm25_cache.pkl"):
    """Return a BM25 index over the dataset's titles, plus the dataset itself.

    The ``(bm25, df)`` pair is loaded from the pickle at ``cache_path`` when
    that file exists and is readable. Otherwise the CSV at ``dataset_path`` is
    read, an index is built over its 'Title' column, and the pair is written
    to ``cache_path`` for the next run.

    Args:
        dataset_path: Path of a CSV file with a 'Title' column.
        cache_path: Path of the pickle cache to read from / write to.

    Returns:
        Tuple ``(bm25, df)``. Document ``i`` of the index is the title of row
        ``i`` of ``df``.

    Note:
        A cache hit returns the cached pair without looking at
        ``dataset_path``; delete the cache file after changing the dataset.
    """
    try:
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # cache_path at a file this function wrote itself.
        with open(cache_path, "rb") as f:
            bm25, df = pickle.load(f)
        update_checkpoint("Loaded BM25 model from cache.")
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # No cache yet, or a truncated/corrupt one (e.g. an interrupted
        # earlier write): build from scratch and overwrite it.
        update_checkpoint("Building BM25 model from scratch...")
        df = pd.read_csv(dataset_path, low_memory=False)
        # Fill missing titles BEFORE casting: astype(str) would turn NaN into
        # the literal text "nan", which fillna can no longer detect.
        df['Title'] = df['Title'].fillna("").astype(str)
        # One token list per row, with no filtering, so that corpus position i
        # always corresponds to df row i.
        tokenized_titles = [title.split(" ") for title in df['Title']]
        bm25 = BM25Okapi(tokenized_titles)
        # Cache the BM25 model and the DataFrame for future use
        with open(cache_path, "wb") as f:
            pickle.dump((bm25, df), f)
        update_checkpoint("BM25 model built and saved to cache.")
    return bm25, df

def retrieve_answer(query, bm25, df):
    """Return the 'Answer' of the row whose 'Title' best matches ``query``.

    Args:
        query: Free-text query; split on single spaces, the same way the
            titles were tokenized when the index was built.
        bm25: BM25 index exposing ``get_scores(tokens)``. Assumed to have been
            built from ``df['Title']`` in row order, as ``build_or_load_bm25``
            does, so that score position ``i`` refers to row ``i`` of ``df``.
        df: DataFrame with 'Title' and 'Answer' columns.

    Returns:
        The matched row's answer, or ``None`` when no title scores above zero,
        the matched row has no answer, or any error occurs (errors are logged,
        not raised).
    """
    try:
        update_checkpoint("Searching for matching title...")
        query_tokens = query.split(" ")
        # Score every title ourselves instead of calling get_top_n: get_top_n
        # hands back a document even when all scores are zero, which made the
        # "no match" branch unreachable for any non-empty corpus.
        scores = bm25.get_scores(query_tokens)
        best_idx = int(scores.argmax()) if len(scores) else -1

        if best_idx < 0 or scores[best_idx] <= 0:
            update_checkpoint("No matching title found.")
            return None

        # Look the row up by position rather than by title text, so duplicate
        # titles cannot return a different row's answer.
        matched_title = df['Title'].iloc[best_idx]
        answer = df['Answer'].iloc[best_idx]
        if pd.isna(answer):
            # A missing answer would otherwise be returned as a truthy NaN.
            update_checkpoint(f"Match found: {matched_title}, but it has no answer.")
            return None

        update_checkpoint(f"Match found: {matched_title}. Returning answer.")
        return answer
    except Exception as e:
        update_checkpoint(f"Error during retrieval: {str(e)}")
        return None


In [5]:
# Loaded (model, tokenizer) pairs keyed by checkpoint name, so repeated calls
# within one kernel session do not reload the weights from disk.
_GPT2_CACHE = {}


def _load_gpt2(model_name="gpt2"):
    """Return the (model, tokenizer) pair for ``model_name``, loading it on first use."""
    if model_name not in _GPT2_CACHE:
        model = GPT2LMHeadModel.from_pretrained(model_name)
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        _GPT2_CACHE[model_name] = (model, tokenizer)
    return _GPT2_CACHE[model_name]


def generate_answer_gpt2(query, max_length=150):
    """Generate a sampled GPT-2 continuation of ``query``.

    Args:
        query: Prompt text fed to the model.
        max_length: Passed to ``model.generate`` as ``max_length``; per the
            transformers documentation this limit counts the prompt tokens as
            well as the newly generated ones.

    Returns:
        The decoded output sequence (the prompt followed by the continuation),
        or ``None`` if loading or generation fails (the error is logged, not
        raised).
    """
    try:
        update_checkpoint("Generating new answer using GPT-2...")
        # Load GPT-2 model and tokenizer (cached after the first call)
        model, tokenizer = _load_gpt2()

        # Encode the query; calling the tokenizer also yields the attention mask
        inputs = tokenizer(query, return_tensors="pt")

        # Generate text based on the query. GPT-2 defines no pad token, so pass
        # EOS explicitly, together with the attention mask, instead of letting
        # generate() warn and guess.
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Decode the generated text
        generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
        update_checkpoint("New answer generated successfully.")
        return generated_answer
    except Exception as e:
        update_checkpoint(f"Error during answer generation: {str(e)}")
        return None


In [6]:
def text_to_speech(text, filename="output.mp3"):
    """Synthesize ``text`` as English speech, save it to ``filename`` and play it.

    Any exception raised while synthesizing, saving or playing is logged via
    ``update_checkpoint`` and suppressed. Always returns ``None``.
    """
    try:
        update_checkpoint("Converting answer to speech...")
        # Synthesize and write the audio file in one step
        gTTS(text=text, lang='en', slow=False).save(filename)
        update_checkpoint("Speech saved as '{}'.".format(filename))

        # Play back the file that was just written
        playsound(filename)
    except Exception as err:
        update_checkpoint("TTS error: " + str(err))


In [8]:
def main_pipeline(dataset_path="data/new_csv_file.csv"):
    """Prompt for a query, answer it from the dataset or GPT-2, and speak the answer.

    Args:
        dataset_path: CSV handed to ``build_or_load_bm25``. The default uses a
            forward slash so the same relative path works on Windows and POSIX
            systems alike.

    Returns:
        None. Progress and the answer are reported through ``update_checkpoint``.
    """
    # Step 1: Capture Query Input (strip so whitespace-only input counts as empty)
    query_text = input("Enter your query (title): ").strip()
    if not query_text:
        update_checkpoint("No query entered; nothing to do.")
        return

    # Step 2: Retrieve Relevant Answer from Dataset
    bm25, df = build_or_load_bm25(dataset_path)
    retrieved_answer = retrieve_answer(query_text, bm25, df)

    # Step 3: If no answer is found, generate a new answer
    if retrieved_answer:
        answer = retrieved_answer
        update_checkpoint(f"Retrieved answer: {answer}")
    else:
        answer = generate_answer_gpt2(query_text, max_length=300)  # Generate a longer answer
        update_checkpoint(f"Generated answer: {answer}")

    # Step 4: Convert the answer to speech (skipped when generation also failed)
    if answer:
        text_to_speech(answer)

# Run the full pipeline once. main_pipeline() prompts for the query with
# input(), so executing this cell blocks until a query is entered.
main_pipeline()


[08:13:46] Loaded BM25 model from cache.
[08:13:46] Searching for matching title...
[08:13:48] Match found: EU affirms 'credible poll' conditions'. Returning answer.
[08:13:48] Retrieved answer: Dhaka, Dec 29 (bdnews24.com)--Head of the European Union election observation mission (EOM) Alexander Graf Lambdorff Monday said "conditions for a credible poll" were in Bangladesh. After witnessing some polling stations and vote counting at Dhaka University, a satisfied Lambdorff said if the polling across the country was like that of the centres he visited, the 150 member EOM would be happy. "I have followed the process indeed from morning to evening and our team will continue to follow even after the counting when the results will be transferred to the returning officers," Lambdorff told reporters watching vote counting at Curzon Hall at Dhaka University. "I think that what we have seen on polling day so far has shown that the conditions for a credible poll in Bangladesh are in place," he sa