In [47]:
!pip install --upgrade PyMuPDF google-generativeai gradio langchain faiss-cpu langchain-community langchain-google-genai sentence-transformers pinecone


Collecting google-generativeai
  Downloading google_generativeai-0.8.1-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.9 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.9-py3-none-any.whl.metadata (5.6 kB)


In [48]:
import fitz  # PyMuPDF
import gradio as gr
import google.generativeai as genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import tensorflow_hub as hub
import tensorflow as tf
import pinecone

In [51]:
from google.colab import userdata
gemini_key=userdata.get('gemini_key')
genai.configure(api_key=gemini_key)

pinecone_key=userdata.get('pinecone')



In [52]:
# 1. Extract Text from Multiple PDFs
def extract_text_from_pdfs(pdf_file):
    all_texts = []


    text = ""
    with fitz.open(pdf_file) as doc:
        for page in doc:
            text += page.get_text()

    all_texts.append((pdf_file, text))


    return all_texts

In [53]:
from google.colab import drive

# Mount your Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
text = extract_text_from_pdfs('/content/drive/My Drive/documents/hafiz pdf.pdf')
print(text)

[('/content/drive/My Drive/documents/hafiz pdf.pdf', '185\n© Springer International Publishing AG 2017 \nB.S. Chauhan et al. (eds.), Rice Production Worldwide, \nDOI\xa010.1007/978-3-319-47516-5_8\nChapter 8\nRice Production Systems\nA.N.\xa0Rao, S.P.\xa0Wani, M.S.\xa0Ramesha, and\xa0J.K.\xa0Ladha\n8.1\u2003 \x07Introduction\nRice is one of the major staple cereals with more than 3.5 billion people depending \non rice for more than 20\xa0% of their daily calorie intake (IRRI, Africa Rice and CIAT \n2010). It is estimated that the rice production must increase by 114 million tons by \n2035, but farmers must achieve it under significant threats from climate change \n(Suzanne et\xa0al. 2012) coupled with decreasing amount of available agricultural land, \nlabor, and water for agriculture and increased costs of all inputs. Increasing global \nfood production with minimal adverse impact on resources and the environment is \nthe greatest challenge for food security (Ladha et\xa0al. 2015). He

In [55]:
# def create_chunks(text, chunk_size=200):
#     chunks = []
#     for i in range(0, len(text), chunk_size):
#         chunks.append(text[i:i + chunk_size])
#     return chunks

def create_chunks(text):
    # Split text into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = splitter.split_text(text)
    return chunks



import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

def generate_embeddings(text_chunks):
    # Load the Universal Sentence Encoder model from TensorFlow Hub
    model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    print(type(model))

    # Convert text chunks to a TensorFlow constant tensor
    tf_sentence = tf.constant(text_chunks)

    # Generate embeddings
    embeddings = model(tf_sentence)
    return embeddings




In [56]:
# Main process
def process_pdfs_and_store_embeddings(pdf_file):
    # Extract text from PDFs
    all_texts = extract_text_from_pdfs(pdf_file)
    # print(all_texts)
    # print(len(all_texts))

    all_embeddings = []
    document_ids = []

    for pdf_file, text in all_texts:
        # Split the text into chunks
        text_chunks = create_chunks(text)
        print(text_chunks)
        print(len(text_chunks))

        # Generate embeddings for the chunks
        embeddings = generate_embeddings(text_chunks)
        all_embeddings.append(embeddings)
        document_ids.append(pdf_file)
    print('embeddings',all_embeddings)
    print('embeddings length',len(all_embeddings))
    print('IDS',document_ids)
    print('IDS lengt..',len(document_ids))
    print('text Chunks..', text_chunks)
    return all_embeddings,document_ids,text_chunks


In [57]:
embeddings,document_ids,text_chunks = process_pdfs_and_store_embeddings('/content/drive/My Drive/documents/Lab # 2s.pdf')

['Lab Manual \n        \n                     The University of Faisalabad \nET-364 Data and Computer Communication \n \nSchool of Engineering Technology \n \nLab Manual \n        \n                     The University of Faisalabad \nET-364 Data and Computer Communication \n \nSchool of Engineering Technology \nLab # 2 \nObjective: \n➢ To study different internetworking devices in computer networks \nDescription: \n1.0 REPEATER', '➢ To study different internetworking devices in computer networks \nDescription: \n1.0 REPEATER \n \nA Repeater is a purely electrical device that extends maximum distance a LAN cable can span by \nAmplifying signals passing through it. A Repeater connects two segments and broadcasts \npackets between them. Since signal loss is a factor in the maximum length of a segment, a \nRepeater is used to amplify the signal and extend the usable length. \n \nRepeaters \n1.1 BRIDGES', 'Repeater is used to amplify the signal and extend the usable length. \n \nRepeaters \

In [58]:
from pinecone import Pinecone
from pinecone import ServerlessSpec

import math

def store_embeddings_in_pinecone(embeddings, text_chunks,document_ids, api_key, index_name):
    # Initialize Pinecone
    pc = Pinecone(api_key=api_key, environment='us-east-1')

    # Create index if it does not exist
    if index_name not in pc.list_indexes().names():
        print(f"Creating index '{index_name}'...")
        pc.create_index(name=index_name, dimension=512, metric="cosine",
                        spec=ServerlessSpec(
                            cloud='aws',
                            region='us-east-1'
                        ))

    # Connect to the index
    index = pc.Index(index_name)

    # Prepare IDs, flatten embeddings, and gather metadata
    ids = [f"{doc_id}_{i}" for doc_id, chunks in zip(document_ids, embeddings) for i in range(len(chunks))]

    flattened_embeddings = [embedding.numpy() for sublist in embeddings for embedding in sublist]



    # Debug: Print full text chunks to check if they are correct
    print("Sample text chunks:")
    for chunk in text_chunks[:5]:  # Adjust the slice as needed
        print(repr(chunk))  # Use repr to show the full content

    # Prepare the data with metadata
    vectors_with_metadata = []
    for id_, embedding, text_chunk in zip(ids, flattened_embeddings, text_chunks):
        vectors_with_metadata.append({
            'id': id_,
            'values': embedding.tolist(),
            'metadata': {'text': text_chunk}
        })

    # Batch the data to avoid exceeding the 1000 vectors per request limit
    batch_size = 100
    total_vectors = len(vectors_with_metadata)
    for i in range(0, total_vectors, batch_size):
        batch = vectors_with_metadata[i:i+batch_size]

        # Upsert the batch into Pinecone
        index.upsert(vectors=batch)
        print(f"Upserted batch {i // batch_size + 1} of {math.ceil(total_vectors / batch_size)}")

    return index


In [59]:
index = store_embeddings_in_pinecone(embeddings, text_chunks,document_ids, pinecone_key, "chatbot" )

Sample text chunks:
'Lab Manual \n        \n                     The University of Faisalabad \nET-364 Data and Computer Communication \n \nSchool of Engineering Technology \n \nLab Manual \n        \n                     The University of Faisalabad \nET-364 Data and Computer Communication \n \nSchool of Engineering Technology \nLab # 2 \nObjective: \n➢ To study different internetworking devices in computer networks \nDescription: \n1.0 REPEATER'
'➢ To study different internetworking devices in computer networks \nDescription: \n1.0 REPEATER \n \nA Repeater is a purely electrical device that extends maximum distance a LAN cable can span by \nAmplifying signals passing through it. A Repeater connects two segments and broadcasts \npackets between them. Since signal loss is a factor in the maximum length of a segment, a \nRepeater is used to amplify the signal and extend the usable length. \n \nRepeaters \n1.1 BRIDGES'
'Repeater is used to amplify the signal and extend the usable length.

In [60]:
def generate_question_embeddings(user_input):
    tf_sentence = tf.constant([user_input])  # Batch input as needed
    model = model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    embeddings = model(tf_sentence)
    # Example: If you need the first vector only
    return embeddings[0].numpy()  # Convert to NumPy array if needed


In [61]:


import numpy as np

def database_response(user_input):
    # Generate the query vector
    query_vector = generate_question_embeddings(user_input)
    query_vector = query_vector.flatten().tolist()

    # Ensure the query vector is a flat list of floats
    print(',,,,,,,,,,,,,,,,,,,,',query_vector)

    # Check for NaN or infinite values in the query vector
    if any(np.isnan(query_vector)) or any(np.isinf(query_vector)):
        raise ValueError("Query vector contains NaN or infinite values.")

    # Ensure the query vector has the correct dimension
    if len(query_vector) != 512:  # Adjust to the correct dimension
        raise ValueError(f"Query vector has incorrect dimensions. Expected 512, got {len(query_vector)}.")

    # Retrieve the top matches from Pinecone
    top_k = 3
    query_response = index.query(vector=query_vector, top_k=top_k, include_metadata=True)

    # Check if the response contains matches
    if 'matches' not in query_response:
        raise ValueError("No matches found in Pinecone query response.")

    # Extract the relevant text chunks from the response
    relevant_text_chunks = [match['metadata']['text'] for match in query_response['matches']]

    # Print or use the relevant text chunks
    for i, text_chunk in enumerate(relevant_text_chunks, 1):
        print(f"Text Chunk {i}...: {text_chunk}")

    return relevant_text_chunks


In [62]:
import textwrap
def answer_question(pdf_path, question):
    # Extract text from PDF
    embeddings,document_ids,text_chunks = process_pdfs_and_store_embeddings(pdf_path)

    # Create the chunks
   # chunks = create_chunks(text)

    # Generate embeddings
    #embeddings = generate_embeddings(chunks)

    # Create index
    index = store_embeddings_in_pinecone(embeddings, text_chunks,document_ids, pinecone_key, "chatbot")

    # Find the most relevant text chunks
    result = database_response(question)

    # Get the content of the most relevant chunk

    model = genai.GenerativeModel(model_name="gemini-1.5-flash")
    # Query the Gemini-1.5-Flash model
    context_text = "\n".join([f"- {chunk}" for chunk in result])
    prompt = textwrap.dedent(f"""
        Use the following pieces of context to answer the question at the end. If the answer to the question is found in the context, provide it and explain the main terminology. If the question is not directly related to the provided context, generate an answer using relevant knowledge and state that the question is beyond the context of the given material.  Always say "thanks for asking!" at the end of the answer.

    Context :{context_text}
    Question : {question}

    """)

    response = model.generate_content(
        prompt
    )




    return response.text


In [63]:
def gradio_interface(pdf_file, question):
    answer = answer_question(pdf_file, question)
    return answer

In [64]:
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Ask a Question")],
    outputs=gr.Textbox(label="Answer"),
    title="StudyBuddy",
    description="Upload a PDF file and ask questions related to its content."
)

In [None]:
iface.launch(debug=True)

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://fa63809b10801fb93f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
