In [23]:
# Part 1: Building the RAG Model

In [24]:
# Import necessary libraries
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from pinecone import Pinecone, ServerlessSpec
from cohere import Client
import pdfplumber
import streamlit as st
import cohere

In [29]:
# Prompt user to upload a PDF file
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

# Ensure there's an uploaded file before proceeding
if uploaded_file is not None:
    # Use pdfplumber to extract text from the uploaded PDF
    def extract_text_from_pdf(uploaded_file):
        with pdfplumber.open(uploaded_file) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text() + "\n"  # Extract text from each page
        return text

    # Extract and display the text from the PDF
    document_text = extract_text_from_pdf(uploaded_file)



In [30]:
# Generate Document Embeddings
# create embeddings from the text extracted from the PDF
from sentence_transformers import SentenceTransformer

# Load pre-trained transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate sentence-level embeddings for the text
sentences = document_text.split(".")  # Split text into sentences
sentence_embeddings = model.encode(sentences)

# Example usage: print one sentence and its corresponding embedding
print(sentences[0])
print(sentence_embeddings[0])

Chapter 8 Biology is the youngest of the formalised disciplines of natural
Human Health and Disease science
[ 1.00957649e-03  4.38434370e-02  7.38708116e-03  1.76462550e-02
 -1.16258757e-02  5.03440313e-02 -8.39617550e-02  1.39322951e-01
 -6.99806809e-02  8.96712393e-02 -1.97667554e-02  1.04239595e-03
 -6.46360591e-02  4.88311462e-02 -7.83217326e-02 -1.49432011e-02
 -1.17612518e-01  6.37972448e-03 -4.76321280e-02  1.72314160e-02
 -3.37933600e-02  1.48807392e-01  1.04132853e-01  1.52420951e-02
 -3.35123464e-02 -3.20260897e-02  3.42959613e-02 -8.30849335e-02
 -5.41196801e-02 -1.25355367e-03  2.09651329e-02  5.59218600e-02
  6.72592893e-02 -1.73090771e-02 -2.53563710e-02  4.94369827e-02
  9.40992013e-02 -4.94890511e-02  1.58186257e-02  5.40051013e-02
 -3.85849811e-02 -6.23704679e-02 -4.09939699e-02  1.22355148e-02
  3.30831818e-02  1.14591187e-02  6.31134259e-03 -2.63138991e-02
  2.24546436e-02 -9.72576626e-03 -3.76431532e-02 -4.18561697e-02
 -8.86348560e-02  5.94508126e-02  3.35745588e-0

In [31]:
# Set your Pinecone API key (ensure to replace with your actual API key)
api_key = '3df05ff6-0236-4da1-b8f5-f803a6d00eb1'

# Initialize the Pinecone instance
pc = Pinecone(api_key=api_key)

# Define the index name and embedding dimension
index_name = 'document-embeddings'
embedding_dim = sentence_embeddings.shape[1]  # Ensure sentence_embeddings is already defined

# Check if the index already exists, if not, create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric='euclidean',  # You can adjust the metric based on your needs
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'  # You can specify your region
        )
    )

# Connect to the index
index = pc.Index(index_name)

# Upload the embeddings to the index
for i, embedding in enumerate(sentence_embeddings):
    index.upsert([(f'sentence-{i}', embedding)])

print("Embeddings successfully uploaded to Pinecone!")

Embeddings successfully uploaded to Pinecone!


In [32]:
import numpy as np
from sentence_transformers import util

# Function to find the most similar question
def get_most_relevant_sentence(query, model, sentences, sentence_embeddings):
    # Generate embedding for the query
    query_embedding = model.encode(query)

    # Calculate cosine similarity with each sentence embedding
    similarities = util.cos_sim(query_embedding, sentence_embeddings)

    # Get the index of the most similar sentence
    best_idx = np.argmax(similarities)

    return sentences[best_idx]

In [33]:
# Example usage
user_query = "What are pathogens"
most_relevant_sentence = get_most_relevant_sentence(user_query, model, sentences, sentence_embeddings)
print(f"Most relevant sentence: {most_relevant_sentence}")

Most relevant sentence:  Such disease-
causing organisms are called pathogens


In [34]:
# Cohere API for Generating a Response
# To generate more comprehensive answers, use a generative model like Cohere.
# Initialize the Cohere client
co = cohere.Client('eiVpA3k8j4oQXtOqRsHGWaW3tLHJqcaoMlkdGfG8')

def generate_answer(context, query):
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"

    # Generate a response using Cohere
    response = co.generate(
        model='command-xlarge-nightly',
        prompt=prompt,
        max_tokens=100
    )
    return response.generations[0].text

# Example usage
context = most_relevant_sentence
answer = generate_answer(context, user_query)
print(f"Generated answer: {answer}")

Generated answer: Disease-causing organisms are called pathogens.


In [35]:
# Part 2: Interactive QA Bot Interface

In [36]:
# Interactive Interface Using Streamlit
# Build Frontend Using Streamlit
# Create an interface where users can upload PDFs and ask questions
# Import the libraries
import streamlit as st
from sentence_transformers import SentenceTransformer
import pdfplumber
import pinecone
import cohere

# Load sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize Pinecone
pinecone_client = pinecone.Pinecone(api_key='3df05ff6-0236-4da1-b8f5-f803a6d00eb1')  # Replace with your Pinecone API key
index = pinecone_client.Index('document-embeddings')

# Initialize Cohere
co = cohere.Client('eiVpA3k8j4oQXtOqRsHGWaW3tLHJqcaoMlkdGfG8')  # Replace with your Cohere API key

# Extract text from PDF
def extract_text_from_pdf(file):
    with pdfplumber.open(file) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Streamlit App
st.title("QA Bot: Ask Questions from Your PDF")
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    document_text = extract_text_from_pdf(uploaded_file)
    st.write("Document uploaded successfully!")

    # Process document for embeddings
    sentences = document_text.split(".")
    sentence_embeddings = model.encode(sentences)

    # Handle user queries
    user_query = st.text_input("Ask a question:")

    if user_query:
        # Find the most relevant sentence
        relevant_sentence = get_most_relevant_sentence(user_query, model, sentences, sentence_embeddings)
        st.write(f"Relevant sentence: {relevant_sentence}")

        # Generate a full answer using Cohere
        answer = generate_answer(relevant_sentence, user_query)
        st.write(f"Answer: {answer}")



In [37]:
# Add multiple query handling
user_query = st.text_input("Ask a question:")

if user_query:
    query_embedding = model.encode(user_query)
    relevant_docs = retrieve_relevant_docs(query_embedding)
    context = ' '.join([doc['metadata']['text'] for doc in relevant_docs['matches']])

    # Generate answer using Cohere
    answer = generate_answer(context, user_query)
    st.write(f"Answer: {answer}")

    # Show relevant document segments
    st.write("Relevant document segments:")
    for doc in relevant_docs['matches']:
        st.write(doc['metadata']['text'])

2024-10-19 12:19:19.781 Session state does not function when running a script without `streamlit run`
