# Upload Data to BigQuery in GCP

In [2]:
import configparser

In [44]:
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing configuration file
# fails here with a clear message instead of a later KeyError on a section.
loaded_files = config.read('configuration.properties')
if not loaded_files:
    raise FileNotFoundError(
        "configuration.properties was not found or could not be read; "
        "run the notebook from the directory that contains it"
    )
# Keep the list of parsed files as the cell's displayed result.
loaded_files

['configuration.properties']

### Update report location

In [5]:
import os
import pandas as pd

In [6]:
# The catalogue file is tab-separated despite its .csv extension;
# read_table uses the tab character as its default delimiter.
data = pd.read_table("../data/cdc_data.csv")

In [7]:
data

Unnamed: 0,title,link,dataSetCode,dataSetType,description,updateDate
0,DASH YRBSS - HS Cigar Use (by total grade/sex/...,https://data.cdc.gov/Youth-Risk-Behaviors/DASH...,bedg-mmpy,Filtered View,2015-2017. High School Dataset – Including Sex...,27/07/2023
1,Graph of Tobacco Use 3 Months Before and Last ...,https://data.cdc.gov/Maternal-Child-Health/Gra...,mbvg-apdj,Chart,2011. Centers for Disease Control and Prevent...,27/07/2023
2,"CDC Nutrition, Physical Activity, and Obesity ...",https://data.cdc.gov/Nutrition-Physical-Activi...,nxst-x9p4,Dataset,This dataset contains policy data for 50 US st...,25/08/2023
3,"Nutrition, Physical Activity, and Obesity - Be...",https://data.cdc.gov/Nutrition-Physical-Activi...,hn4x-zwk7,Dataset,"This dataset includes data on adult's diet, ph...",07/12/2023
4,Map of HS Cigar Use,https://data.cdc.gov/Youth-Risk-Behaviors/Map-...,a5cb-a2ww,Map,2015-2017. High School Dataset – Including Sex...,27/07/2023
...,...,...,...,...,...,...
235,"PLACES: ZCTA Data (GIS Friendly Format), 2021 ...",https://data.cdc.gov/500-Cities-Places/PLACES-...,9xb7-9z99,Dataset,This dataset contains model-based ZIP Code Tab...,25/08/2023
236,Stroke Mortality Data Among US Adults (35+) by...,https://data.cdc.gov/Heart-Disease-Stroke-Prev...,ua33-yiiu,Dataset,"2017 to 2019, 3-year average. Rates are age-st...",24/08/2023
237,"PLACES: Local Data for Better Health, ZCTA Dat...",https://data.cdc.gov/500-Cities-Places/PLACES-...,fbbf-hgkc,Dataset,This dataset contains model-based ZIP Code tab...,25/08/2023
238,"PLACES: Place Data (GIS Friendly Format), 2021...",https://data.cdc.gov/500-Cities-Places/PLACES-...,cj8b-94cj,Dataset,This dataset contains model-based place (incor...,25/08/2023


In [8]:
def check_file_exists(report_name):
    """Return the relative path of a report's HTML file if it exists.

    The file is looked up at ``../reports/<report_name>.html`` relative to the
    current working directory.

    Args:
        report_name: Base name of the report file, without the ``.html``
            extension (the notebook passes ``dataSetCode`` values).

    Returns:
        ``'reports/<report_name>.html'`` when the file exists, otherwise
        ``None``.
    """
    relative_location = f'reports/{report_name}.html'
    # Existence is tested one directory up; the returned value omits the
    # '../' prefix.
    found = os.path.exists(f'../{relative_location}')
    return relative_location if found else None

In [9]:
# For every dataset code, store the path of its HTML report, or None when
# no report file exists.
data['reportLocation'] = data['dataSetCode'].map(check_file_exists)

In [10]:
data

Unnamed: 0,title,link,dataSetCode,dataSetType,description,updateDate,reportLocation
0,DASH YRBSS - HS Cigar Use (by total grade/sex/...,https://data.cdc.gov/Youth-Risk-Behaviors/DASH...,bedg-mmpy,Filtered View,2015-2017. High School Dataset – Including Sex...,27/07/2023,
1,Graph of Tobacco Use 3 Months Before and Last ...,https://data.cdc.gov/Maternal-Child-Health/Gra...,mbvg-apdj,Chart,2011. Centers for Disease Control and Prevent...,27/07/2023,
2,"CDC Nutrition, Physical Activity, and Obesity ...",https://data.cdc.gov/Nutrition-Physical-Activi...,nxst-x9p4,Dataset,This dataset contains policy data for 50 US st...,25/08/2023,reports/nxst-x9p4.html
3,"Nutrition, Physical Activity, and Obesity - Be...",https://data.cdc.gov/Nutrition-Physical-Activi...,hn4x-zwk7,Dataset,"This dataset includes data on adult's diet, ph...",07/12/2023,reports/hn4x-zwk7.html
4,Map of HS Cigar Use,https://data.cdc.gov/Youth-Risk-Behaviors/Map-...,a5cb-a2ww,Map,2015-2017. High School Dataset – Including Sex...,27/07/2023,
...,...,...,...,...,...,...,...
235,"PLACES: ZCTA Data (GIS Friendly Format), 2021 ...",https://data.cdc.gov/500-Cities-Places/PLACES-...,9xb7-9z99,Dataset,This dataset contains model-based ZIP Code Tab...,25/08/2023,
236,Stroke Mortality Data Among US Adults (35+) by...,https://data.cdc.gov/Heart-Disease-Stroke-Prev...,ua33-yiiu,Dataset,"2017 to 2019, 3-year average. Rates are age-st...",24/08/2023,
237,"PLACES: Local Data for Better Health, ZCTA Dat...",https://data.cdc.gov/500-Cities-Places/PLACES-...,fbbf-hgkc,Dataset,This dataset contains model-based ZIP Code tab...,25/08/2023,
238,"PLACES: Place Data (GIS Friendly Format), 2021...",https://data.cdc.gov/500-Cities-Places/PLACES-...,cj8b-94cj,Dataset,This dataset contains model-based place (incor...,25/08/2023,


### Generate keywords from the title and embed them

In [11]:
from transformers import AutoTokenizer, AutoModel
import torch
import vertexai
from vertexai.language_models import TextGenerationModel

In [12]:
# Vertex AI settings, all read from the [VertexAI] section of the
# configuration file.
project, endpoint_id, location, api_endpoint, model_pretrained = (
    config['VertexAI'][key]
    for key in ('project', 'endpoint_id', 'location', 'api_endpoint', 'model_pretrained')
)

In [13]:
# Load the tokenizer and the encoder from the same BioBERT checkpoint so
# the two always match.
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer, model = (
    loader.from_pretrained(model_name) for loader in (AutoTokenizer, AutoModel)
)

In [14]:
# Initialise the Vertex AI SDK with the project and location read from the
# [VertexAI] section of the configuration file.
vertexai.init(project=project, location=location)

In [15]:
# Keyword arguments forwarded to every `model_llm.predict` call.
# NOTE(review): temperature 0.9 makes the generated keywords vary between
# runs; confirm that this is intended for a keyword-extraction prompt.
parameters = dict(
    max_output_tokens=1024,
    temperature=0.9,
    top_p=1,
)

In [16]:
# Load the text-generation model named by the `model_pretrained` config value.
model_llm = TextGenerationModel.from_pretrained(model_pretrained)

In [17]:
# Replace the model handle with the tuned model, addressed by its full
# resource path.
# NOTE(review): the `endpoint_id` config value is used as the model id in
# this path; confirm the configuration key holds a model id.
tuned_model_name = f"projects/{project}/locations/{location}/models/{endpoint_id}"
model_llm = model_llm.get_tuned_model(tuned_model_name)

In [27]:
def generateKeywords(text):
    """Ask the LLM which disease (or fallback keyword) a text is about.

    Args:
        text: Text interpolated into the prompt (the notebook passes dataset
            titles).

    Returns:
        The ``text`` attribute of the prediction response, unmodified.
    """
    prompt = f""" Text:{text} \n\n what disease is being targeted? Return only the disease name as a single word. if you cant find any disease in the text return one important keyword in the text"""
    return model_llm.predict(prompt, **parameters).text

In [24]:
# One LLM call per title; the raw model answer is stored as the keyword.
data["keywords"] = data["title"].map(generateKeywords)

In [29]:
def removeNoise(text, max_length=50):
    """Strip a generated keyword and discard answers that are too long.

    Args:
        text: Raw keyword text. Non-string values such as ``None`` or ``NaN``
            are treated as "no keyword".
        max_length: Longest stripped answer still accepted as a keyword.

    Returns:
        The whitespace-stripped text, or ``None`` when the input is not a
        string or the stripped text is longer than ``max_length``.
    """
    # This function itself yields None for rejected answers, so the column may
    # already contain None (or NaN) when the cell is re-run. Pass those through
    # instead of raising AttributeError on `.strip()`.
    if not isinstance(text, str):
        return None
    cleaned = text.strip()
    if len(cleaned) > max_length:
        return None
    return cleaned

In [30]:
# Trim each generated keyword and replace over-long answers with None.
data["keywords"] = data["keywords"].map(removeNoise)

In [37]:
def generateEmbedding(text):
    """Embed a text by averaging the encoder's last hidden state.

    Args:
        text: String to embed. ``None`` is replaced by the literal string
            ``"None"`` before tokenisation.

    Returns:
        A plain Python list: the first row of ``last_hidden_state`` averaged
        over ``dim=1``.
    """
    tokenizer_input = "None" if text is None else text
    encoded = tokenizer(tokenizer_input, padding=True, truncation=True, return_tensors="pt")
    # Inference only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Average over dim=1 (presumably the token axis), then convert the tensor
    # to nested lists and keep the single row.
    return hidden_states.mean(dim=1).tolist()[0]

In [38]:
# Embed the cleaned keyword (not the full title) for every row.
data["embedding"] = data["keywords"].map(generateEmbedding)

In [72]:
data

Unnamed: 0,title,link,dataSetCode,dataSetType,description,updateDate,reportLocation,keywords,embedding
0,DASH YRBSS - HS Cigar Use (by total grade/sex/...,https://data.cdc.gov/Youth-Risk-Behaviors/DASH...,bedg-mmpy,Filtered View,2015-2017. High School Dataset – Including Sex...,27/07/2023,,Cigar use,"[0.16420388221740723, -0.22693508863449097, -0..."
1,Graph of Tobacco Use 3 Months Before and Last ...,https://data.cdc.gov/Maternal-Child-Health/Gra...,mbvg-apdj,Chart,2011. Centers for Disease Control and Prevent...,27/07/2023,,Tobacco Use,"[0.14029575884342194, -0.13101698458194733, -0..."
2,"CDC Nutrition, Physical Activity, and Obesity ...",https://data.cdc.gov/Nutrition-Physical-Activi...,nxst-x9p4,Dataset,This dataset contains policy data for 50 US st...,25/08/2023,reports/nxst-x9p4.html,Obesity,"[0.1107550710439682, 0.08884633332490921, -0.0..."
3,"Nutrition, Physical Activity, and Obesity - Be...",https://data.cdc.gov/Nutrition-Physical-Activi...,hn4x-zwk7,Dataset,"This dataset includes data on adult's diet, ph...",07/12/2023,reports/hn4x-zwk7.html,Obesity,"[0.1107550710439682, 0.08884633332490921, -0.0..."
4,Map of HS Cigar Use,https://data.cdc.gov/Youth-Risk-Behaviors/Map-...,a5cb-a2ww,Map,2015-2017. High School Dataset – Including Sex...,27/07/2023,,Cigar use,"[0.16420388221740723, -0.22693508863449097, -0..."
...,...,...,...,...,...,...,...,...,...
235,"PLACES: ZCTA Data (GIS Friendly Format), 2021 ...",https://data.cdc.gov/500-Cities-Places/PLACES-...,9xb7-9z99,Dataset,This dataset contains model-based ZIP Code Tab...,25/08/2023,,GIS,"[0.2590693235397339, -0.32744064927101135, -0...."
236,Stroke Mortality Data Among US Adults (35+) by...,https://data.cdc.gov/Heart-Disease-Stroke-Prev...,ua33-yiiu,Dataset,"2017 to 2019, 3-year average. Rates are age-st...",24/08/2023,,Stroke,"[0.1883343905210495, 0.057887036353349686, -0...."
237,"PLACES: Local Data for Better Health, ZCTA Dat...",https://data.cdc.gov/500-Cities-Places/PLACES-...,fbbf-hgkc,Dataset,This dataset contains model-based ZIP Code tab...,25/08/2023,,Data,"[0.10181767493486404, -0.18418431282043457, -0..."
238,"PLACES: Place Data (GIS Friendly Format), 2021...",https://data.cdc.gov/500-Cities-Places/PLACES-...,cj8b-94cj,Dataset,This dataset contains model-based place (incor...,25/08/2023,,,"[0.3534121513366699, -0.19157488644123077, -0...."


### Upload data to BigQuery

In [73]:
from google.cloud import bigquery

In [74]:
# Destination dataset and table, read from the [BigQuery] section of the
# configuration file.
dataset_id, table_id = (config['BigQuery'][key] for key in ('dataset_id', 'table_id'))

In [75]:
# Convert the frame to a list with one {column: value} dict per row.
data_dict = data.to_dict("records")

In [76]:
# BigQuery client built with no explicit project or credentials.
# NOTE(review): presumably these are resolved from the environment; confirm
# they point at the same project as the Vertex AI configuration.
bq_client = bigquery.Client()

In [77]:
# Column layout for the destination table: seven STRING columns, the
# embedding as a repeated FLOAT64 column, then the keyword.
# NOTE(review): `schema` is not referenced by any later cell shown here;
# confirm whether the table is created elsewhere.
schema = [
    *(
        bigquery.SchemaField(column, "STRING")
        for column in (
            "title",
            "link",
            "dataSetCode",
            "dataSetType",
            "description",
            "updateDate",
            "reportLocation",
        )
    ),
    bigquery.SchemaField("embedding", "FLOAT64", mode="REPEATED"),
    bigquery.SchemaField("keywords", "STRING"),
]

In [78]:
# Client.dataset() is deprecated in google-cloud-bigquery, so build the
# reference explicitly. It uses the client's own project, which is what
# Client.dataset() did when no project was passed.
table_ref = bigquery.DatasetReference(bq_client.project, dataset_id).table(table_id)

In [79]:
# Stream the row dicts into the destination table.
errors = bq_client.insert_rows_json(table_ref, data_dict)
# insert_rows_json reports per-row failures through its return value rather
# than by raising. Surface them here so a "Run All" cannot finish with rows
# silently rejected.
if errors:
    raise RuntimeError(
        f"BigQuery rejected {len(errors)} row(s); first errors: {errors[:5]}"
    )

In [80]:
errors

[]