<a href="https://colab.research.google.com/github/Azad-mosarof/2D-Game-Using-Java-Script/blob/main/ayurveda_llm_chat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies; the kernel is restarted in the next cell.
# NOTE(review): pymupdf is the only package installed without a version pin —
# consider pinning it so re-runs resolve the same version.
!pip install pymupdf
# Async PostgreSQL driver and the Cloud SQL connector (with its asyncpg extra).
!pip install asyncio==3.4.3 asyncpg==0.27.0 cloud-sql-python-connector["asyncpg"]==1.2.3
!pip install numpy==1.22.4 pandas==1.5.3
# pgvector client helpers used to register the vector type with asyncpg.
!pip install pgvector==0.1.8
!pip install langchain==0.0.196 transformers==4.30.1
# Vertex AI SDK (embeddings and LLM access).
!pip install google-cloud-aiplatform==1.26.0

In [None]:
import IPython

# Ask the running kernel to shut down with restart=True (the recorded output
# reports 'restart': True) — presumably so the freshly installed packages are
# the ones imported by the following cells.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [None]:
# Colab form parameters describing the Google Cloud project and the Cloud SQL
# instance/database/user that the rest of the notebook reads as globals.
# project_id and database_password default to empty strings, so the checks
# below fail until both are filled in.
project_id = ""  # @param {type:"string"}
database_password = ""  # @param {type:"string"}
region = "us-west2"  # @param {type:"string"}
instance_name = "demo-pg-vector"  # @param {type:"string"}
database_name = "ayurveda-chat"  # @param {type:"string"}
database_user = "ayurveda-admin"  # @param {type:"string"}


# Quick input validations: every parameter must be non-empty (truthy).
assert project_id, "⚠️ Please provide a Google Cloud project ID"
assert region, "⚠️ Please provide a Google Cloud region"
assert instance_name, "⚠️ Please provide the name of your instance"
assert database_name, "⚠️ Please provide a database name"
assert database_user, "⚠️ Please provide a database user"
assert database_password, "⚠️ Please provide a database password"

In [None]:
#@markdown ###Authenticate your Google Cloud Account and enable APIs.
# Authenticate gcloud through the Colab auth helper.
from google.colab import auth
auth.authenticate_user()

# Point gcloud at the project chosen in the parameters cell.
!gcloud config set project {project_id}

# Grant Cloud SQL Client role to authenticated user.
# The command output is captured as a list of lines; the first line is used
# as the account below, so an empty result would raise an IndexError.
current_user = !gcloud auth list --filter=status:ACTIVE --format="value(account)"

!gcloud projects add-iam-policy-binding {project_id} \
  --member=user:{current_user[0]} \
  --role="roles/cloudsql.client"


# Enable the Cloud SQL Admin API and the aiplatform (Vertex AI) API.
!gcloud services enable sqladmin.googleapis.com
!gcloud services enable aiplatform.googleapis.com

In [None]:
#@markdown Create and setup a Cloud SQL PostgreSQL instance, if not done already.
# Look up the instance's database version (captured as a list of output
# lines). Anything whose first line does not start with "POSTGRES" is treated
# as "no usable instance" and triggers creation.
database_version = !gcloud sql instances describe {instance_name} --format="value(databaseVersion)"
if database_version[0].startswith("POSTGRES"):
  print("Found an existing Postgres Cloud SQL Instance!")
else:
  print("Creating new Cloud SQL instance...")
  # NOTE(review): the password is interpolated into the shell command
  # unquoted — presumably a password containing shell metacharacters would
  # break this command; confirm or quote it.
  !gcloud sql instances create {instance_name} --database-version=POSTGRES_15 \
    --region={region} --cpu=1 --memory=4GB --root-password={database_password}

# Create the database, if it does not exist.
# The listed names are joined and compared with the requested database name.
out = !gcloud sql databases list --instance={instance_name} --filter="NAME:{database_name}" --format="value(NAME)"
if ''.join(out) == database_name:
  print("Database %s already exists, skipping creation." % database_name)
else:
  !gcloud sql databases create {database_name} --instance={instance_name}

# Create the database user for accessing the database.
# NOTE(review): unlike the instance and database steps, this runs
# unconditionally — presumably it reports an error on re-runs when the user
# already exists; verify.
!gcloud sql users create {database_user} \
  --instance={instance_name} \
  --password={database_password}

Creating new Cloud SQL instance...
Created [https://sqladmin.googleapis.com/sql/v1beta4/projects/scms-face-recognition/instances/demo-pg-vector].
NAME            DATABASE_VERSION  LOCATION    TIER              PRIMARY_ADDRESS  PRIVATE_ADDRESS  STATUS
demo-pg-vector  POSTGRES_15       us-west2-a  db-custom-1-4096  35.236.64.23     -                RUNNABLE
Created database [retail].
instance: demo-pg-vector
name: retail
project: scms-face-recognition
Created user [retail-admin].


In [None]:
# @markdown Verify that you are able to connect to the database. Executing this block should print the current PostgreSQL server version.

import asyncio
import asyncpg
from google.cloud.sql.connector import Connector


async def main():
    """Connect to the Cloud SQL database and print the PostgreSQL server version.

    Reads the notebook-level ``project_id``, ``region``, ``instance_name``,
    ``database_user``, ``database_password`` and ``database_name`` settings.
    The connection is closed even when the query fails.
    """
    # The Connector must be bound to the event loop that is already running.
    loop = asyncio.get_running_loop()
    async with Connector(loop=loop) as connector:
        conn: asyncpg.Connection = await connector.connect_async(
            f"{project_id}:{region}:{instance_name}",  # Cloud SQL instance connection name
            "asyncpg",
            user=f"{database_user}",
            password=f"{database_password}",
            db=f"{database_name}",
        )
        try:
            # A trivial query is enough to prove that credentials and
            # networking work end to end.
            results = await conn.fetch("SELECT version()")
            print(results[0]["version"])
        finally:
            # Release the connection even if the query above raised.
            await conn.close()


# Run the connectivity check. `main` calls asyncio.get_running_loop(), so this
# top-level `await` relies on the notebook already running an event loop.
await main()  # type: ignore

PostgreSQL 15.4 on x86_64-pc-linux-gnu, compiled by Debian clang version 12.0.1, 64-bit


In [None]:
import fitz

def read_pdf(file_path):
    """Extract the whitespace-separated words of every page of a PDF.

    Args:
        file_path: Path of the PDF file, opened with ``fitz.open``.

    Returns:
        A flat list of word strings, in page order.
    """
    pdf_document = fitz.open(file_path)
    try:
        text_chunks = []
        for page_number in range(pdf_document.page_count):
            page_text = pdf_document[page_number].get_text()
            # str.split() with no argument splits on any whitespace run, so
            # line breaks inside the page text are discarded here.
            text_chunks.extend(page_text.split())
        return text_chunks
    finally:
        # Close the document even if text extraction fails part-way through.
        pdf_document.close()


def clean_text(text_chunks):
    """Remove replacement characters from each piece and drop empty pieces.

    Args:
        text_chunks: Iterable of strings.

    Returns:
        A list of the non-empty strings that remain after removing every
        '�' character and trimming surrounding whitespace.
    """
    scrubbed = (piece.replace('�', '').strip() for piece in text_chunks)
    return [piece for piece in scrubbed if piece]


def chunk_text_by_words(text, target_words):
    """Group words into space-joined chunks of ``target_words`` words each.

    Args:
        text: Iterable of word strings.
        target_words: Number of words after which a chunk is emitted.

    Returns:
        A list of strings; the final chunk may hold fewer words than
        ``target_words``.
    """
    pieces = []
    buffer = []

    for token in text:
        buffer.append(token)
        if len(buffer) < target_words:
            continue
        # The buffer is full: emit it and start a new one.
        pieces.append(" ".join(buffer))
        buffer = []

    # Emit whatever is left over as a shorter final chunk.
    if buffer:
        pieces.append(" ".join(buffer))

    return pieces

# Run the extraction pipeline on the PDF:
# words -> cleaned words -> space-joined chunks of `target_words` words.
pdf_file_path = 'data.pdf'
words = read_pdf(pdf_file_path)

# Remove replacement characters and drop tokens that end up empty.
cleaned_words = clean_text(words)

# Number of words per chunk.
target_words = 500
word_chunks_500 = chunk_text_by_words(cleaned_words, target_words)

# Debug aid: uncomment to print every chunk.
# for i, chunk in enumerate(word_chunks_500):
#     print(f"Chunk {i + 1}:\n{chunk}\n")

# Display how many chunks were produced.
len(word_chunks_500)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured for 500-character pieces, measured with len(), with no
# overlap and "\n" as its only separator.
# NOTE(review): the rows in word_chunks_500 are words joined with single
# spaces, so they presumably contain no "\n" for the splitter to cut on —
# confirm that leaving each row as one large chunk is intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
    separators=["\n"],
    length_function=len,
)

# One {"content": ...} record per document produced by the splitter.
chunks = [
    {"content": document.page_content}
    for row in word_chunks_500
    for document in text_splitter.create_documents([row])
]

In [None]:
# Display the number of chunk records built by the splitter cell.
len(chunks)

174

In [None]:
from langchain.embeddings import VertexAIEmbeddings
from google.cloud import aiplatform
import time

# Initialize the aiplatform SDK for the configured project and region, then
# create the LangChain Vertex AI embeddings client used to embed the chunks.
aiplatform.init(project=f"{project_id}", location=f"{region}")
embeddings_service = VertexAIEmbeddings()

# Helper function to retry failed API requests with exponential backoff
def retry_with_backoff(func, *args, retry_delay=5, backoff_factor=2, **kwargs):
    """Call ``func(*args, **kwargs)``, retrying with exponential backoff.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        retry_delay: Base delay in seconds.
        backoff_factor: Growth factor; the wait after the k-th failed attempt
            is ``retry_delay * backoff_factor**k``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns on its first successful attempt.

    Raises:
        Exception: The error raised by the final attempt, once all 10
            attempts have failed. Re-raising keeps callers from receiving a
            silent ``None`` in place of a result.
    """
    max_attempts = 10
    for attempt in range(1, max_attempts + 1):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            print(f"error: {e}")
            if attempt == max_attempts:
                # Out of attempts: surface the failure without a useless
                # final sleep.
                raise
            wait = retry_delay * (backoff_factor**attempt)
            print(f"Retry after waiting for {wait} seconds...")
            time.sleep(wait)

# Generate vector embeddings for each chunk, one small batch per API request.
batch_size = 5
for i in range(0, len(chunks), batch_size):
    batch = chunks[i : i + batch_size]
    # embed_documents expects a list of strings, so send each chunk's text
    # rather than the {"content": ...} dicts themselves.
    request = [chunk["content"] for chunk in batch]
    response = retry_with_backoff(embeddings_service.embed_documents, request)
    if response is None or len(response) != len(batch):
        # Stop instead of silently leaving chunks without an embedding.
        raise RuntimeError(
            f"Embedding request for chunks {i + 1}-{i + len(batch)} did not "
            "return one embedding per chunk"
        )
    # Store the retrieved vector embeddings for each chunk back.
    for chunk, embedding in zip(batch, response):
        chunk["embedding"] = embedding
    # Short progress line instead of dumping every embedding vector.
    print(f"Embedded chunks {i + 1}-{i + len(batch)} of {len(chunks)}")


In [None]:
import pandas as pd
# Collect the chunk records (content plus embedding) into a DataFrame; the
# storage cell below iterates over it row by row.
product_embeddings = pd.DataFrame(chunks)
# Preview the last row. A negative index works for any number of chunks,
# unlike a hard-coded row position.
product_embeddings.iloc[-1]

content      information for Ayurvedic practitioners mahari...
embedding    [-0.01698254607617855, -0.035865772515535355, ...
Name: 173, dtype: object

In [None]:
# Store the generated vector embeddings in a PostgreSQL table.
# This code may run for a few minutes.

import asyncio
import asyncpg
from google.cloud.sql.connector import Connector
import numpy as np
from pgvector.asyncpg import register_vector


async def main():
    """Recreate the ``content_embeddings`` table and load every embedding into it.

    Reads the notebook-level connection settings and the ``product_embeddings``
    DataFrame (columns ``content`` and ``embedding``). The connection is
    closed even when a statement fails.
    """
    loop = asyncio.get_running_loop()
    async with Connector(loop=loop) as connector:
        # Create connection to Cloud SQL database.
        conn: asyncpg.Connection = await connector.connect_async(
            f"{project_id}:{region}:{instance_name}",
            "asyncpg",
            user=f"{database_user}",
            password=f"{database_password}",
            db=f"{database_name}",
        )
        try:
            await conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
            await register_vector(conn)

            # Drop the same table that is created just below, so the cell can
            # be re-run without CREATE TABLE failing on an existing table.
            await conn.execute("DROP TABLE IF EXISTS content_embeddings")
            # Create the `content_embeddings` table to store vector embeddings.
            await conn.execute(
                """CREATE TABLE content_embeddings(
                                content TEXT,
                                embedding vector(768))"""
            )

            # Store all the generated embeddings back into the database.
            for _, row in product_embeddings.iterrows():
                await conn.execute(
                    "INSERT INTO content_embeddings (content, embedding) VALUES ($1, $2)",
                    row["content"],
                    # Flatten to a 1-D array before binding it to the vector column.
                    np.ravel(row["embedding"]),
                )
        finally:
            # Release the connection even if a statement above raised.
            await conn.close()

# Run the load. `main` calls asyncio.get_running_loop(), so this top-level
# `await` relies on the notebook already running an event loop.
await main()  # type: ignore

In [None]:
from pgvector.asyncpg import register_vector
import asyncio
import asyncpg
from google.cloud.sql.connector import Connector
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.llms import VertexAI
from langchain import PromptTemplate, LLMChain
from IPython.display import display, Markdown
from langchain.embeddings import VertexAIEmbeddings
from google.cloud import aiplatform

# @markdown Enter a health question to answer from the stored Ayurveda content:
question = "why ayurveda is better than modern medicine?"  # @param {type:"string"}
assert question, "⚠️ Please input a valid input question text"

aiplatform.init(project=f"{project_id}", location=f"{region}")

embeddings_service = VertexAIEmbeddings()
# embed_query takes a single string and returns one embedding vector; pass
# the question itself rather than a one-element list.
qe = embeddings_service.embed_query(question)

# LLM used by the summarize chain further down in this cell.
llm = VertexAI()

# Prompt used for the "map" step of the summarize chain: it is applied to
# each retrieved document separately through the {text} placeholder.
map_prompt_template = """
              You will be given a detailed description of a health issue.
              This description is enclosed in triple backticks (```).
              Using this description only, extract the name of the disease
              or health problem and how it is treated through ayurveda.

              ```{text}```
              SUMMARY:
              """

map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["text"])

# Prompt used for the "combine" step of the summarize chain: {text} receives
# the per-document summaries and {user_query} the user's question.
combine_prompt_template = """
                You will be given detailed information about health issues
                enclosed in triple backticks (```) and a question enclosed in
                double backticks(``).
                Select the information that is most relevant to answer the question.
                Using that selected information, answer the following
                question in as much detail as possible.
                You should only use the information in the description.
                Your answer should include the name of the disease or problem of the user and how to treat that problem
                through ayurveda. Your answer should be less than 300 words.
                Your answer should be in Markdown in a numbered list format.


                Description:
                ```{text}```


                Question:
                ``{user_query}``


                Answer:
                """

combine_prompt = PromptTemplate(
    template=combine_prompt_template, input_variables=["text", "user_query"]
)

async def main():
    """Return the stored content most similar to the embedded question.

    Reads the notebook-level connection settings and ``qe`` (the query
    embedding), then runs a cosine-similarity search over the
    ``content_embeddings`` table.

    Returns:
        A pandas DataFrame with a single ``content`` column, most similar
        rows first.

    Raises:
        Exception: If no row is more similar than the threshold.
    """
    loop = asyncio.get_running_loop()
    async with Connector(loop=loop) as connector:
        # Create connection to Cloud SQL database.
        conn: asyncpg.Connection = await connector.connect_async(
            f"{project_id}:{region}:{instance_name}",  # Cloud SQL instance connection name
            "asyncpg",
            user=f"{database_user}",
            password=f"{database_password}",
            db=f"{database_name}",
        )
        try:
            await register_vector(conn)
            similarity_threshold = 0.1
            num_matches = 50

            # `<=>` is pgvector's cosine-distance operator, so 1 - distance is
            # the cosine similarity. Keep rows above the threshold, best first.
            results = await conn.fetch(
                """
            WITH vector_matches AS (
              SELECT content, 1 - (embedding <=> $1) AS similarity
              FROM content_embeddings
              WHERE 1 - (embedding <=> $1) > $2
              ORDER BY similarity DESC
              LIMIT $3
            )
            SELECT content FROM vector_matches
            """,
                qe,
                similarity_threshold,
                num_matches,
            )

            if len(results) == 0:
                raise Exception("Did not find any results. Adjust the query parameters.")

            # Read the `content` field of each record explicitly so the
            # DataFrame always holds plain strings.
            matches = pd.DataFrame(
                {"content": [record["content"] for record in results]}
            )
        finally:
            # Release the connection on success, on the empty-result error
            # above, and on any query failure.
            await conn.close()

    return matches


matches = await main()  # type: ignore
matches = pd.DataFrame(matches)
# matches.head(5)

docs = [Document(page_content=t) for t in matches]
chain = load_summarize_chain(
    llm, chain_type="map_reduce", map_prompt=map_prompt, combine_prompt=combine_prompt
)
answer = chain.run(
    {
        "input_documents": docs,
        "user_query": question,
    }
)


display(Markdown(answer))

 1.  Ayurveda is a holistic system of medicine that treats the whole person, not just the symptoms of the disease. 
2.  It focuses on preventing disease and promoting health, rather than just treating symptoms. 
3.  Ayurvedic treatments are typically less expensive than modern medical treatments. 
4.  Ayurvedic medicines are typically made from natural ingredients, which are less likely to cause side effects than synthetic drugs. 
5.  Finally, Ayurveda has been practiced for thousands of years and has a long history of safety and efficacy.

In [None]:
# @markdown Clean-up and delete the Cloud SQL instance.
# Disable deletion protection first, then delete the instance; --quiet skips
# the interactive confirmation prompt.
!gcloud sql instances patch {instance_name} --no-deletion-protection
!gcloud sql instances delete {instance_name} --quiet

The following message will be used for the patch API method.
{"name": "demo-pg-vector", "project": "scms-face-recognition", "settings": {"deletionProtectionEnabled": false}}
Updated [https://sqladmin.googleapis.com/sql/v1beta4/projects/scms-face-recognition/instances/demo-pg-vector].
Deleted [https://sqladmin.googleapis.com/sql/v1beta4/projects/scms-face-recognition/instances/demo-pg-vector].
