# Test RAG app
This notebook tests the end-to-end flow of the CDC RAG application.

#### Step 1: Use the LLM in Vertex AI to get the target disease name

In [1]:
import vertexai
from vertexai.language_models import TextGenerationModel
from transformers import AutoTokenizer, AutoModel
import torch
from google.cloud import bigquery
import configparser

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns only the
# names it actually parsed, so check the result here instead of failing later
# with a KeyError on the first missing section.
loaded_files = config.read('configuration.properties')
if not loaded_files:
  raise FileNotFoundError(
    "configuration.properties could not be read from the current working directory"
  )
# Last expression: show which files were loaded.
loaded_files

['configuration.properties']

In [3]:
# Vertex AI settings, all read from the [VertexAI] section of the config file
vertex_cfg = config['VertexAI']
project = vertex_cfg['project']
endpoint_id = vertex_cfg['endpoint_id']
location = vertex_cfg['location']
api_endpoint = vertex_cfg['api_endpoint']
model_pretrained = vertex_cfg['model_pretrained']

In [4]:
# Initialise the Vertex AI SDK with the project and location read from the config file.
vertexai.init(project=project, location=location)

In [5]:
# Generation settings forwarded as keyword arguments to the model_llm.predict calls.
parameters = dict(
  max_output_tokens=1024,
  temperature=0.9,
  top_p=1,
)

In [6]:
# Load the base text-generation model named by the `model_pretrained` config value.
model_llm = TextGenerationModel.from_pretrained(model_pretrained)

In [7]:
# Rebind model_llm to the tuned model addressed by its full resource path.
# NOTE(review): the segment after `models/` is filled from `endpoint_id` —
# confirm the config value really is a model ID rather than an endpoint ID.
model_llm = model_llm.get_tuned_model(f"projects/{project}/locations/{location}/models/{endpoint_id}")

In [8]:
# Sample user request; the next cell asks the LLM which disease it targets.
question = (
  "I am looking to treatment for Stroke. "
  "highlighting key clinical trials, regulatory and reimbursement insights "
  "to consider and emerging competitor landscapes."
)

In [9]:
# Ask the model to reduce the free-text question to a single disease name.
disease_prompt = f""" {question} what disease is being targeted? Return only the disease name as a single word."""
response = model_llm.predict(disease_prompt, **parameters)
print(response.text)

 Stroke


In [10]:
# Strip surrounding whitespace: the model's reply is printed above with a
# leading space (" Stroke"), which would otherwise be embedded together with
# the disease name.
name = response.text.strip()

### Step 2: Embed the name and search for similar content in BigQuery

In [11]:
# Tokenizer config: checkpoint name read from the [Tokenizer] section; it is
# passed to both AutoTokenizer and AutoModel.
model_name = config['Tokenizer']['model_name']

In [12]:
# Load the tokenizer and the model from the same `model_name` checkpoint.
tokenizer_embed = AutoTokenizer.from_pretrained(model_name)
model_embed = AutoModel.from_pretrained(model_name)

In [13]:
def generateEmbedding(text):
  """Return a mean-pooled embedding for `text` as a plain Python list.

  The text is tokenized with the module-level `tokenizer_embed`, passed through
  `model_embed` with gradient tracking disabled, and the model's
  `last_hidden_state` is averaged over dimension 1 (presumably the token
  axis — confirm for the configured model).

  Args:
    text: Input handed directly to the tokenizer.

  Returns:
    The first row of the pooled tensor, converted with `tolist()`.
  """
  encoded = tokenizer_embed(text, padding=True, truncation=True, return_tensors="pt")
  # Inference only: no gradients are needed.
  with torch.no_grad():
    hidden_states = model_embed(**encoded).last_hidden_state
  pooled = hidden_states.mean(dim=1)
  # Only the first row is returned, even if the tokenizer produced several.
  return pooled.tolist()[0]

In [14]:
# Embed the disease name taken from the LLM response in Step 1.
embedding = generateEmbedding(name)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [15]:
# Initialize BigQuery client
# No project or credentials are passed explicitly — presumably they are
# resolved from the environment's default credentials; verify before deploying.
bq_client = bigquery.Client()

In [17]:
# Location of the embeddings table, read from the [BigQuery] config section
bq_cfg = config['BigQuery']
project_id = bq_cfg['project_id']
dataset_id = bq_cfg['dataset_id']
table_id = bq_cfg['table_id']

In [49]:
# Rank every row of the table by cosine similarity between its stored
# `embedding` array and the query embedding, and keep the top 3.
# The query vector is inlined as an array literal by the f-string; the table
# identifiers come from the config file.
# SAFE_DIVIDE yields NULL instead of aborting the whole query with
# "division by zero" when a stored embedding has zero magnitude. Rows whose
# similarity is NULL are dropped so downstream comparisons never see None.
query = f"""
WITH query AS (
  SELECT {embedding} AS query_vector
),
similarity AS (
  SELECT
    title,
    link,
    dataSetCode,
    dataSetType,
    description,
    updateDate,
    keywords,
    reportLocation,
    embedding,
    (SELECT SUM(qv * ev)
      FROM UNNEST(query_vector) AS qv WITH OFFSET AS q_idx
      JOIN UNNEST(embedding) AS ev WITH OFFSET AS e_idx
      ON q_idx = e_idx) AS dot_product,
    SQRT((SELECT SUM(POW(qv, 2))
          FROM UNNEST(query_vector) AS qv)) AS query_magnitude,
    SQRT((SELECT SUM(POW(ev, 2))
          FROM UNNEST(embedding) AS ev)) AS embedding_magnitude
  FROM
    `{project_id}.{dataset_id}.{table_id}`, query
),
final AS (
  SELECT
    title,
    link,
    dataSetCode,
    dataSetType,
    description,
    updateDate,
    keywords,
    reportLocation,
    SAFE_DIVIDE(dot_product, query_magnitude * embedding_magnitude) AS cosine_similarity
  FROM
    similarity
)
SELECT
  title,
  link,
  dataSetCode,
  dataSetType,
  description,
  updateDate,
  keywords,
  reportLocation,
  cosine_similarity
FROM
  final
WHERE
  cosine_similarity IS NOT NULL
ORDER BY
  cosine_similarity DESC
LIMIT 3;
"""

In [50]:
# Submit the similarity query to BigQuery.
query_job = bq_client.query(query)

In [51]:
# Fetch the job's result rows; `results` is iterated once in the next cell.
results = query_job.result()

In [52]:
# Minimum cosine similarity for a dataset to count as relevant.
SIMILARITY_THRESHOLD = 0.85

# Keep the rows that clear the threshold, keyed by dataset code, and print
# every returned row's score for inspection.
similar_data = {}
for row in results:
  score = row.cosine_similarity
  # A NULL similarity (e.g. from an empty stored embedding) arrives as None,
  # and `None >= float` raises TypeError, so treat such rows as not relevant.
  if score is not None and score >= SIMILARITY_THRESHOLD:
    similar_data[row.dataSetCode] = row
  print(f"{row.dataSetCode}, Cosine Similarity: {score}")

kgsi-35re, Cosine Similarity: 1.0
kpwh-eddm, Cosine Similarity: 1.0
dhsy-4sea, Cosine Similarity: 1.0


In [54]:
# Show the matches, or say so explicitly when no row cleared the threshold.
print(similar_data if similar_data else 'no relevant data')

{'kgsi-35re': Row(('Stroke Mortality Data Among US Adults (35+) by State/Territory and County', 'https://data.cdc.gov/Heart-Disease-Stroke-Prevention/Stroke-Mortality-Data-Among-US-Adults-35-by-State-/kgsi-35re', 'kgsi-35re', 'Dataset', '2014 to 2016, 3-year average. Rates are age-standardized. County rates are spatially smoothed. The data can be viewed by gender and race/ethnicity. Data source: National Vital Statistics System. Additional data, maps, and methodology can be viewed on the Interactive Atlas of Heart Disease and Stroke http://www.cdc.gov/dhdsp/maps/atlas', '24/08/2023', 'Stroke', 'reports/kgsi-35re.html', 1.0), {'title': 0, 'link': 1, 'dataSetCode': 2, 'dataSetType': 3, 'description': 4, 'updateDate': 5, 'keywords': 6, 'reportLocation': 7, 'cosine_similarity': 8}), 'kpwh-eddm': Row(('Stroke Mortality Data Among US Adults (35+) by State/Territory and County', 'https://data.cdc.gov/Heart-Disease-Stroke-Prevention/Stroke-Mortality-Data-Among-US-Adults-35-by-State-/kpwh-eddm'

### Step 3: Build the RAG context from the matched datasets and summarize it

In [82]:
# Collect what the final answer needs from the rows that passed the threshold.
# Iterating over .values() avoids binding the unused dict key to `id`, which
# would shadow the builtin for the rest of the kernel session.
matched_rows = list(similar_data.values())

# One "**title:** ... **description:** ... \n " entry per dataset, joined in a
# single pass; this is the context handed to the LLM for summarisation.
description = "".join(
  f"**title:** {row.title} **description:** {row.description} \n "
  for row in matched_rows
)
links = [row.link for row in matched_rows]
reports = [row.reportLocation for row in matched_rows]
title = [row.title for row in matched_rows]

In [75]:
# Inspect the concatenated titles and descriptions that are sent to the LLM.
description

'**title:** Stroke Mortality Data Among US Adults (35+) by State/Territory and County **description:** 2014 to 2016, 3-year average. Rates are age-standardized. County rates are spatially smoothed. The data can be viewed by gender and race/ethnicity. Data source: National Vital Statistics System. Additional data, maps, and methodology can be viewed on the Interactive Atlas of Heart Disease and Stroke http://www.cdc.gov/dhdsp/maps/atlas \n **title:** Stroke Mortality Data Among US Adults (35+) by State/Territory and County **description:** 2013 to 2015, 3-year average. Rates are age-standardized. County rates are spatially smoothed. The data can be viewed by gender and race/ethnicity. Data source: National Vital Statistics System. Additional data, maps, and methodology can be viewed on the Interactive Atlas of Heart Disease and Stroke http://www.cdc.gov/dhdsp/maps/atlas \n **title:** Stroke Mortality Data Among US Adults (35+) by State/Territory and County **description:** 2012 to 2014,

In [79]:
# Ask the model to summarise the retrieved dataset descriptions.
summary_prompt = f"""summaries the following text {description}"""
response = model_llm.predict(summary_prompt, **parameters)
print(response.text)

 **2014-2016:** A comparison between urban and rural areas reveals a distinct pattern of stroke mortality rates, with rural counties generally exhibiting higher rates of mortality in the US. While the difference may be attributed to health care disparities and limited resources in rural areas, its causes are not fully explained. These results suggest a need for targeted interventions and policies to address stroke-related health issues in rural areas.
**2013-2015:** The study identifies a disparity in stroke mortality rates between certain states, with some states exhibiting significantly higher rates of stroke-related mortality. This variation is likely due to factors such as healthcare disparities, population characteristics, and public health initiatives. The findings underscore the need for comprehensive stroke prevention and management strategies, addressing both population health and health system factors.
**2012-2014:** The analysis of stroke mortality rates reveals a gradual de

In [80]:
# Keep the summary text; it is printed together with the supporting links below.
result = response.text

In [83]:
# What the flow returns: the summary, then the source links, report locations
# and titles of the matched datasets (one per line).
print(result, links, reports, title, sep="\n")

 **2014-2016:** A comparison between urban and rural areas reveals a distinct pattern of stroke mortality rates, with rural counties generally exhibiting higher rates of mortality in the US. While the difference may be attributed to health care disparities and limited resources in rural areas, its causes are not fully explained. These results suggest a need for targeted interventions and policies to address stroke-related health issues in rural areas.
**2013-2015:** The study identifies a disparity in stroke mortality rates between certain states, with some states exhibiting significantly higher rates of stroke-related mortality. This variation is likely due to factors such as healthcare disparities, population characteristics, and public health initiatives. The findings underscore the need for comprehensive stroke prevention and management strategies, addressing both population health and health system factors.
**2012-2014:** The analysis of stroke mortality rates reveals a gradual de