# **MAKE A COPY IF YOU WANT TO RUN THIS NOTEBOOK**
Please don't change this copy of the notebook unless you intend to update/improve the code.

# Cohort Examples
This notebook provides an example of how to add column descriptions to BigQuery tables, using descriptions read from CSV files.

## Setup

First, be sure to run the [**`Python Environment Setup notebook`**](https://app.terra.bio/#workspaces/fc-product-demo/Baseline%20Health%20Study/notebooks/launch/Python%20Environment%20Setup.ipynb) in this workspace.

In [1]:
import csv
import os

from google.cloud import bigquery

In [None]:
PROJECT = 'PROJECT-NAME-HERE'
DATASET_IDS = ['DATASET-1-HERE', 'DATASET-2-HERE']

# Step 1: Parse the spreadsheets and create dict of column_name -> description
#
# Each CSV is expected to have a header row, the column name in its first
# field and the description in its third field. Keys are bare column names
# (not qualified by table name), so when the same column name appears more
# than once the description read last wins.

# Assuming the csv files containing descriptions are in directory 'source_csvs'
source_directory = 'source_csvs'

# Optional: Mark some files to skip
files_to_skip = ['FILE1.csv', 'FILE2.csv']

descriptions = {}
# Sorted so that "last one wins" is deterministic across runs/machines.
for file_name in sorted(os.listdir(source_directory)):
  if file_name in files_to_skip:
    continue
  path = os.path.join(source_directory, file_name)
  if not os.path.isfile(path):
    # Sub-directories cannot be opened as CSV files; ignore them.
    continue
  # newline='' lets the csv module handle line breaks inside quoted fields.
  with open(path, 'r', newline='') as f:
    # csv.reader (rather than str.split) keeps commas and quotes that are
    # part of a quoted description intact.
    reader = csv.reader(f)
    # skip header
    next(reader, None)
    for row in reader:
      # Blank or short rows have no description field to read.
      if len(row) < 3:
        continue
      column_name, description = row[0], row[2]
      if not column_name or not description:
        continue
      descriptions[column_name] = description




In [None]:
# Step 2: Fetch the baseline tables, and change their column descriptions if we have it
#
# Uses PROJECT, DATASET_IDS and the `descriptions` dict (column name ->
# description). Columns without an entry in `descriptions` are left exactly
# as they are, so existing descriptions are never blanked out.

client = bigquery.Client(project=PROJECT)

for dataset_id in DATASET_IDS:
  # Client.dataset() is deprecated; build the reference directly instead.
  dataset_ref = bigquery.DatasetReference(client.project, dataset_id)

  for table in client.list_tables(dataset_ref):  # API request(s)
    table_ref = dataset_ref.table(table.table_id)
    table_obj = client.get_table(table_ref)
    new_schema = []
    for schema_field in table_obj.schema:
      new_description = descriptions.get(schema_field.name)
      if not new_description:
        # Nothing to apply: keep the field (and its current description).
        new_schema.append(schema_field)
        continue
      # Round-trip through the API representation so every other attribute
      # of the field (mode, nested fields, policy tags, precision, ...) is
      # preserved and we do not depend on SchemaField's positional
      # parameter order.
      field_repr = schema_field.to_api_repr()
      field_repr['description'] = new_description
      new_schema.append(bigquery.SchemaField.from_api_repr(field_repr))
    table_obj.schema = new_schema
    client.update_table(table_obj, ['schema'])
    print('Updated table {}'.format(table.table_id))