In [116]:
import pandas as pd
import numpy as np
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from typing_extensions import Union, List, Tuple, Optional, Dict

from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    pipeline,
    T5ForQuestionAnswering,
    BertForQuestionAnswering
)
import torch
import time
from dataclasses import dataclass

In [117]:
df = pd.read_csv('../data/train_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Theme,Paragraph,Question,Answer_possible,Answer_text,Answer_start
0,2,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,True,['2003'],[526]
1,6,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What album made her a worldwide known artist?,True,['Dangerously in Love'],[505]
2,7,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Who managed the Destiny's Child group?,True,['Mathew Knowles'],[360]
3,8,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyoncé rise to fame?,True,['late 1990s'],[276]
4,9,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What role did Beyoncé have in Destiny's Child?,True,['lead singer'],[290]


In [118]:
paragraphs = df['Paragraph'].values
print(len(paragraphs))
paragraphs[:5]

75055


array(['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
       'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 199

In [119]:
paragraphs = np.unique(paragraphs)
print(len(paragraphs))

15555


In [120]:
beyonce_paragraphs = [para for para in paragraphs if "Beyoncé Giselle" in para]
for para in beyonce_paragraphs:
    print(para)

Beyoncé Giselle Knowles was born in Houston, Texas, to Celestine Ann "Tina" Knowles (née Beyincé), a hairdresser and salon owner, and Mathew Knowles, a Xerox sales manager. Beyoncé's name is a tribute to her mother's maiden name. Beyoncé's younger sister Solange is also a singer and a former member of Destiny's Child. Mathew is African-American, while Tina is of Louisiana Creole descent (with African, Native American, French, Cajun, and distant Irish and Spanish ancestry). Through her mother, Beyoncé is a descendant of Acadian leader Joseph Broussard. She was raised in a Methodist household.
Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of t

In [121]:
with open('context.txt', 'w', encoding='utf-8') as file:
    for paragraph in paragraphs:
        file.write(paragraph + '\n')

# Use ChromaDB to store embeddings

In [122]:
# Initialize ChromaDB client with the new architecture
client = chromadb.PersistentClient(
    path="./chroma_storage"  # Update this path as needed
)

# Create a collection (if it doesn't exist)
collection_name = "paragraph_collection"
try:
    collection = client.get_collection(collection_name)
except:
    collection = client.create_collection(collection_name)

In [123]:
print(type(paragraphs))
print(len(paragraphs))

<class 'numpy.ndarray'>
15555


In [124]:
def prepare_and_insert_data(
    paragraphs: Union[np.ndarray, List[str]], 
    collection: chromadb.Collection,
    batch_size: int = 500
) -> None:
    """
    Prepare and insert data into ChromaDB with proper handling of NumPy arrays
    
    Args:
        paragraphs: Array or list of text paragraphs
        collection: ChromaDB collection instance
        batch_size: Number of items to process in each batch
    """
    # Convert NumPy array to list if necessary
    if isinstance(paragraphs, np.ndarray):
        paragraphs = paragraphs.tolist()
    
    # Initialize the model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Compute embeddings
    embeddings = model.encode(paragraphs).tolist()
    
    # Create document IDs and metadata
    doc_ids = [f"doc_{i}" for i in range(len(paragraphs))]
    metadatas = [{"index": i} for i in range(len(paragraphs))]
    
    # Verify lengths
    print(f"Verification counts:")
    print(f"Paragraphs: {len(paragraphs)}")
    print(f"Embeddings: {len(embeddings)}")
    print(f"IDs: {len(doc_ids)}")
    print(f"Metadata: {len(metadatas)}")
    
    # Add data to ChromaDB in batches
    for i in range(0, len(paragraphs), batch_size):
        batch_end = min(i + batch_size, len(paragraphs))
        collection.add(
            documents=paragraphs[i:batch_end],
            embeddings=embeddings[i:batch_end],
            metadatas=metadatas[i:batch_end],
            ids=doc_ids[i:batch_end]
        )
        if (i + batch_size) % 5000 == 0:
            print(f"Processed {i + batch_size} entries...")
    
    print(f"Successfully added {len(paragraphs)} paragraphs to ChromaDB!")

In [125]:
client = chromadb.PersistentClient(path="./chroma_storage")
collection = client.get_or_create_collection("paragraph_collection")
prepare_and_insert_data(paragraphs, collection)

Add of existing embedding ID: doc_0
Add of existing embedding ID: doc_1
Add of existing embedding ID: doc_2
Add of existing embedding ID: doc_3
Add of existing embedding ID: doc_4
Add of existing embedding ID: doc_5
Add of existing embedding ID: doc_6
Add of existing embedding ID: doc_7
Add of existing embedding ID: doc_8
Add of existing embedding ID: doc_9
Add of existing embedding ID: doc_10
Add of existing embedding ID: doc_11
Add of existing embedding ID: doc_12
Add of existing embedding ID: doc_13
Add of existing embedding ID: doc_14
Add of existing embedding ID: doc_15
Add of existing embedding ID: doc_16
Add of existing embedding ID: doc_17
Add of existing embedding ID: doc_18
Add of existing embedding ID: doc_19
Add of existing embedding ID: doc_20
Add of existing embedding ID: doc_21
Add of existing embedding ID: doc_22
Add of existing embedding ID: doc_23
Add of existing embedding ID: doc_24
Add of existing embedding ID: doc_25
Add of existing embedding ID: doc_26
Add of exis

Verification counts:
Paragraphs: 15555
Embeddings: 15555
IDs: 15555
Metadata: 15555


Add of existing embedding ID: doc_296
Add of existing embedding ID: doc_297
Add of existing embedding ID: doc_298
Add of existing embedding ID: doc_299
Add of existing embedding ID: doc_300
Add of existing embedding ID: doc_301
Add of existing embedding ID: doc_302
Add of existing embedding ID: doc_303
Add of existing embedding ID: doc_304
Add of existing embedding ID: doc_305
Add of existing embedding ID: doc_306
Add of existing embedding ID: doc_307
Add of existing embedding ID: doc_308
Add of existing embedding ID: doc_309
Add of existing embedding ID: doc_310
Add of existing embedding ID: doc_311
Add of existing embedding ID: doc_312
Add of existing embedding ID: doc_313
Add of existing embedding ID: doc_314
Add of existing embedding ID: doc_315
Add of existing embedding ID: doc_316
Add of existing embedding ID: doc_317
Add of existing embedding ID: doc_318
Add of existing embedding ID: doc_319
Add of existing embedding ID: doc_320
Add of existing embedding ID: doc_321
Add of exist

Processed 5000 entries...


Add of existing embedding ID: doc_5178
Add of existing embedding ID: doc_5179
Add of existing embedding ID: doc_5180
Add of existing embedding ID: doc_5181
Add of existing embedding ID: doc_5182
Add of existing embedding ID: doc_5183
Add of existing embedding ID: doc_5184
Add of existing embedding ID: doc_5185
Add of existing embedding ID: doc_5186
Add of existing embedding ID: doc_5187
Add of existing embedding ID: doc_5188
Add of existing embedding ID: doc_5189
Add of existing embedding ID: doc_5190
Add of existing embedding ID: doc_5191
Add of existing embedding ID: doc_5192
Add of existing embedding ID: doc_5193
Add of existing embedding ID: doc_5194
Add of existing embedding ID: doc_5195
Add of existing embedding ID: doc_5196
Add of existing embedding ID: doc_5197
Add of existing embedding ID: doc_5198
Add of existing embedding ID: doc_5199
Add of existing embedding ID: doc_5200
Add of existing embedding ID: doc_5201
Add of existing embedding ID: doc_5202
Add of existing embedding

Processed 10000 entries...


Add of existing embedding ID: doc_10098
Add of existing embedding ID: doc_10099
Add of existing embedding ID: doc_10100
Add of existing embedding ID: doc_10101
Add of existing embedding ID: doc_10102
Add of existing embedding ID: doc_10103
Add of existing embedding ID: doc_10104
Add of existing embedding ID: doc_10105
Add of existing embedding ID: doc_10106
Add of existing embedding ID: doc_10107
Add of existing embedding ID: doc_10108
Add of existing embedding ID: doc_10109
Add of existing embedding ID: doc_10110
Add of existing embedding ID: doc_10111
Add of existing embedding ID: doc_10112
Add of existing embedding ID: doc_10113
Add of existing embedding ID: doc_10114
Add of existing embedding ID: doc_10115
Add of existing embedding ID: doc_10116
Add of existing embedding ID: doc_10117
Add of existing embedding ID: doc_10118
Add of existing embedding ID: doc_10119
Add of existing embedding ID: doc_10120
Add of existing embedding ID: doc_10121
Add of existing embedding ID: doc_10122


Processed 15000 entries...


Add of existing embedding ID: doc_15293
Add of existing embedding ID: doc_15294
Add of existing embedding ID: doc_15295
Add of existing embedding ID: doc_15296
Add of existing embedding ID: doc_15297
Add of existing embedding ID: doc_15298
Add of existing embedding ID: doc_15299
Add of existing embedding ID: doc_15300
Add of existing embedding ID: doc_15301
Add of existing embedding ID: doc_15302
Add of existing embedding ID: doc_15303
Add of existing embedding ID: doc_15304
Add of existing embedding ID: doc_15305
Add of existing embedding ID: doc_15306
Add of existing embedding ID: doc_15307
Add of existing embedding ID: doc_15308
Add of existing embedding ID: doc_15309
Add of existing embedding ID: doc_15310
Add of existing embedding ID: doc_15311
Add of existing embedding ID: doc_15312
Add of existing embedding ID: doc_15313
Add of existing embedding ID: doc_15314
Add of existing embedding ID: doc_15315
Add of existing embedding ID: doc_15316
Add of existing embedding ID: doc_15317


Successfully added 15555 paragraphs to ChromaDB!


In [163]:
def search_query(
    query: str,
    collection: chromadb.Collection,
    model: SentenceTransformer,
    top_k: int = 3,
    threshold: float = 0.05,
    return_raw: bool = False
) -> Union[None, List[Tuple[str, float]]]:
    """
    Search for relevant documents in ChromaDB collection.
    
    Args:
        query: Search query string
        collection: ChromaDB collection to search in
        model: SentenceTransformer model instance
        top_k: Number of results to return
        threshold: Minimum similarity score (0-1) to include in results
        return_raw: If True, returns raw results instead of printing
        
    Returns:
        If return_raw is True, returns list of (document, score) tuples
        If return_raw is False, prints results and returns None
    """
    try:
        # Generate embedding for the query
        query_embedding = model.encode([query]).tolist()
        
        # Perform search in ChromaDB
        results = collection.query(
            query_embeddings=query_embedding,
            n_results=top_k,
            include=["documents", "distances", "metadatas"]
        )
        
        # Process results
        documents = results['documents'][0]
        distances = results['distances'][0]
        metadatas = results['metadata'][0] if 'metadata' in results else [{}] * len(documents)
        
        # Convert distance to similarity score (1 - distance)
        scores = [1 - dist for dist in distances]
        
        # Filter results by threshold
        filtered_results = [
            (doc, score, meta) 
            for doc, score, meta in zip(documents, scores, metadatas)
            if score >= threshold
        ]
        
        if return_raw:
            return filtered_results
        
        # Display results
        if not filtered_results:
            print("\nNo results found matching your query.")
            return None
            
        print(f"\nTop {len(filtered_results)} relevant paragraphs for query: '{query}'")
        print("-" * 80)
        
        for i, (doc, score, meta) in enumerate(filtered_results, 1):
            print(f"\n{i}. Similarity Score: {score:.4f}")
            if meta:
                print(f"Metadata: {meta}")
            print(f"Paragraph: {doc}")
            print("-" * 80)
            
    except Exception as e:
        print(f"Error performing search: {str(e)}")
        if return_raw:
            return []
    
    return None

In [164]:
# Initialize model and collection first
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
client = chromadb.PersistentClient(path="./chroma_storage")
collection = client.get_collection("paragraph_collection")

In [143]:
# Simple search
# r1 = search_query("How did relations between Greece and Turkey improve in 1999?", collection, model)

# # Search with custom parameters
# search_query(
#     "your search query",
#     collection,
#     model,
#     top_k=5,
#     threshold=0.5
# )

# Get raw results for further processing
results = search_query(
    "How many different types of iPod are currently available?",
    collection,
    embedding_model,
    return_raw=True
)

In [144]:
if results:
    # Process results
    for doc, score, meta in results:
        print(f"Score: {score:.4f}")
        print(f"Metadata: {meta}")
        print(f"Document: {doc}")

# Making the final pipeline

In [165]:
@dataclass
class QAResult:
    """Store QA model results with metadata"""
    answer: str
    confidence: float
    context: str
    model_name: str
    processing_time: float

class QASystem:
    def __init__(self, collection, embedding_model):
        self.collection = collection
        self.embedding_model = embedding_model
        self.models = {}
        self.initialize_models()
    
    def initialize_models(self):
        """Initialize QA model on CPU to avoid memory issues"""
        try:
            print("Loading RoBERTa model on CPU...")
            self.models['roberta'] = pipeline(
                'question-answering',
                model='deepset/roberta-base-squad2',
                device=-1  # Force CPU usage
            )
            print("Successfully loaded RoBERTa model")
        except Exception as e:
            print(f"Error loading model: {str(e)}")

    def process_search_results(
        self,
        results: List[Tuple[str, float, Dict]],
        min_score: float = 0.05
    ) -> str:
        """Process and combine search results into context."""
        # Sort results by descending score
        results = sorted(results, key=lambda x: x[1], reverse=True)

        # Filter by minimum score
        relevant_docs = [doc for doc, score, _ in results if score >= min_score]

        # Return concatenated documents or None if no relevant docs
        return " ".join(relevant_docs) if relevant_docs else ""


    def answer_question(
        self,
        question: str,
        context: str,
        model_name: str = 'roberta'
    ) -> Optional[QAResult]:
        """Generate answer using the model"""
        if not self.models:
            print("No models available.")
            return None
            
        start_time = time.time()
        
        try:
            result = self.models[model_name](
                question=question,
                context=context
            )
            
            processing_time = time.time() - start_time
            
            return QAResult(
                answer=result['answer'],
                confidence=result['score'],
                context=context,
                model_name=model_name,
                processing_time=processing_time
            )
        except Exception as e:
            print(f"Error generating answer: {str(e)}")
            return None
        
    def rephrase_answer(self, question: str, answer: str) -> str:
        """Rephrase the answer into a complete sentence."""
        if "year" in question.lower():
            return f"The year when {question[3:].strip().lower()} is {answer}."
        elif "how" in question.lower():
            return f"The way {question[4:].strip().lower()} happened is because {answer}."
        elif "why" in question.lower():
            return f"The reason {question[4:].strip().lower()} is that {answer}."
        else:
            return f"The answer to '{question}' is: {answer}."


def get_answer(
    question: str,
    search_results: List[Tuple[str, float, Dict]],
    collection,
    embedding_model,
    min_score: float = 0.05
) -> Optional[str]:
    """Complete pipeline to get answer from search results"""
    # Initialize QA system
    qa_system = QASystem(collection, embedding_model)
    
    # Process search results into context
    context = qa_system.process_search_results(search_results, min_score)
    if not context:
        print("No relevant context found with sufficient confidence score.")
        return None
    
    # Get answer
    result = qa_system.answer_question(question, context)
    if result.confidence < 0.05:
        print("Confidence score is below 0.5, consider verifying the answer.")
        return None
    
    if result:
        print("\nQuestion:", question)
        print("\nAnswer:", result.answer)
        print("Confidence:", f"{result.confidence:.4f}")
        print("Processing Time:", f"{result.processing_time:.2f}s")
        print("\nRelevant Context Used:")
        print("-" * 80)
        print(context)
        return result.answer
    return None

In [166]:
context = search_query(
    "in what year was the original iPod released?",
    collection,
    embedding_model,
    return_raw=True,
    top_k=3
)

# Get answer
answer = get_answer(
    "in what year was the original iPod released?",
    context,
    collection,
    embedding_model
)

Loading RoBERTa model on CPU...


Device set to use cpu


Successfully loaded RoBERTa model

Question: in what year was the original iPod released?

Answer: 2001
Confidence: 0.2147
Processing Time: 0.15s

Relevant Context Used:
--------------------------------------------------------------------------------
In early 2001, Apple began shipping computers with CD-RW drives and emphasized the Mac's ability to play DVDs by including DVD-ROM and DVD-RAM drives as standard. Steve Jobs admitted that Apple had been "late to the party" on writable CD technology, but felt that Macs could become a "digital hub" that linked and enabled an "emerging digital lifestyle". Apple would later introduce an update to its iTunes music player software that enabled it to burn CDs, along with a controversial "Rip, Mix, Burn" advertising campaign that some felt encouraged media piracy. This accompanied the release of the iPod, Apple's first successful handheld device. Apple continued to launch products, such as the unsuccessful Power Mac G4 Cube, the education-oriented

# Data Ingestion