In [4]:
from google.cloud import storage
from google.cloud import bigquery
from google.cloud import aiplatform

In [5]:
# Capture the active gcloud project; the `!` shell capture yields a list of output lines.
project = !gcloud config get-value project
project

['vertex-ai-demo-mlops']

In [6]:
# The first line of the gcloud output is used as the project ID for the clients below.
PROJECT_ID = project[0]
PROJECT_ID

'vertex-ai-demo-mlops'

In [7]:
# Region used for Vertex AI, the Cloud Storage bucket, and the BigQuery dataset in this notebook.
REGION = 'us-central1'
REGION

'us-central1'

In [8]:
# Initialise the Vertex AI SDK with an explicit project as well as the region,
# so it targets the same project as the Storage and BigQuery clients instead
# of whatever default the environment happens to resolve.
aiplatform.init(project=PROJECT_ID, location=REGION)

### Google Cloud Storage

In [9]:
# Cloud Storage client bound explicitly to the notebook's project.
gcs = storage.Client(project=PROJECT_ID)
type(gcs)

google.cloud.storage.client.Client

In [15]:
# Name of the bucket that the source CSV paths below are built from.
BUCKET = "cloud-storage-demo-mlops"

In [16]:
# Look the bucket up by name; the Bucket is displayed when it exists.
gcs.lookup_bucket(BUCKET)

<Bucket: cloud-storage-demo-mlops>

In [17]:
# Ensure the bucket exists, creating it in REGION when it does not.
# `bucketDef` is a local handle for the bucket; a later cell reads its name.
bucketDef = gcs.bucket(BUCKET)
if gcs.lookup_bucket(BUCKET) is None:
    # create_bucket returns the created Bucket, so its name can be reported
    # directly without a second lookup round trip.
    bucket = gcs.create_bucket(bucketDef, project=PROJECT_ID, location=REGION)
    print(f'Created Bucket: {bucket.name}')
else:
    print(f'Bucket already exist: {bucketDef.name}')

Bucket already exist: cloud-storage-demo-mlops


In [18]:
# Register the CSV held in Cloud Storage as a Vertex AI tabular dataset.
# NOTE(review): this object path has no "raw_datasets/" prefix, unlike the
# path used for the BigQuery load further down — confirm which one is correct.
gcs_source = "gs://" + BUCKET + "/healthcare-dataset-stroke-data.csv"

# Display name passed to Vertex AI for the new dataset.
# NOTE(review): the name says "diabetes" but the file is stroke data — confirm it is intended.
dataset_display_name = 'diabetes_gcs'

create_dataset_task = aiplatform.TabularDataset.create(
    gcs_source=gcs_source,
    display_name=dataset_display_name,
)
# Wait on the returned resource before it is used in the following cells.
create_dataset_task.wait()

Creating TabularDataset
Create TabularDataset backing LRO: projects/374793628187/locations/us-central1/datasets/5084640945115234304/operations/1550362866052235264
TabularDataset created. Resource name: projects/374793628187/locations/us-central1/datasets/5084640945115234304
To use this TabularDataset in another session:
ds = aiplatform.TabularDataset('projects/374793628187/locations/us-central1/datasets/5084640945115234304')


In [19]:
# Re-load the dataset through its resource name and display it.
dataset_resource_name = create_dataset_task.resource_name
dataset = aiplatform.TabularDataset(dataset_resource_name)
dataset

<google.cloud.aiplatform.datasets.tabular_dataset.TabularDataset object at 0x7f8b99004fd0> 
resource name: projects/374793628187/locations/us-central1/datasets/5084640945115234304

### BigQuery

In [20]:
# BigQuery client bound explicitly to the notebook's project.
bq = bigquery.Client(project=PROJECT_ID)
type(bq)

google.cloud.bigquery.client.Client

In [21]:
# Print the ID of every dataset the BigQuery client can list (before creation).
datasets = [*bq.list_datasets()]
for bq_dataset in datasets:
    print(bq_dataset.dataset_id)

In [22]:
# Components of the fully qualified BigQuery table id: project.dataset.table.
BQ_PROJECT = PROJECT_ID
BQ_DATASET = 'healthcare'
BQ_TABLE = 'stroke'

In [23]:
# Create the BigQuery dataset in REGION; exists_ok=True is passed so an
# already-existing dataset does not make this cell fail on re-run.
dataset_spec = bigquery.Dataset(BQ_PROJECT + "." + BQ_DATASET)
dataset_spec.location = REGION
ds = bq.create_dataset(dataset_spec, exists_ok=True)

In [24]:
# Print the dataset IDs again to confirm the new dataset is listed.
datasets = [*bq.list_datasets()]
for bq_dataset in datasets:
    print(bq_dataset.dataset_id)

healthcare


#### Create the BigQuery table

In [25]:
from google.cloud.exceptions import NotFound

In [28]:
# Object path of the CSV inside the bucket; combined with the bucket name for the BigQuery load.
file = "raw_datasets/healthcare-dataset-stroke-data.csv"

In [29]:
# Load the CSV from Cloud Storage into BigQuery only when the table is missing.
table_id = f'{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}'
try:
    # Only the lookup is guarded, so a NotFound raised anywhere else is not
    # mistaken for "table missing".
    table = bq.get_table(table_id)
except NotFound:
    print('Creating Table ...')
    destination = bigquery.TableReference.from_string(table_id)
    job_config = bigquery.LoadJobConfig(
        # Enum rather than a raw string, matching the SourceFormat usage below.
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        source_format=bigquery.SourceFormat.CSV,
        autodetect=True,  # let BigQuery infer the schema from the file
    )
    # BUCKET is used directly so this cell does not depend on the bucket
    # handle created in the bucket-check cell.
    job = bq.load_table_from_uri(f"gs://{BUCKET}/{file}", destination, job_config=job_config)
    job.result()  # wait for the load job to finish
    print(f'Finished creating table: {table_id}')
else:
    print(f'The table already exists: {table_id}')

Creating Table ...
Finished creating table: vertex-ai-demo-mlops.healthcare.stroke


In [31]:
# Register the BigQuery table as a Vertex AI tabular dataset.
# The URI is built from the BQ_* constants so it always points at the table
# this notebook created, instead of a hard-coded project/dataset/table.
bq_source = f'bq://{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}'

# Display name passed to Vertex AI for the new dataset.
# NOTE(review): the name says "diabetes" but the source is the stroke table — confirm it is intended.
dataset_display_name = 'diabetes_bq'

# Create the dataset inside Vertex AI
create_dataset_task = aiplatform.TabularDataset.create(
    display_name=dataset_display_name,
    bq_source=bq_source
)
# Wait on the returned resource before it is used in the following cells.
create_dataset_task.wait()

Creating TabularDataset
Create TabularDataset backing LRO: projects/374793628187/locations/us-central1/datasets/8050261299738705920/operations/967709664261177344
TabularDataset created. Resource name: projects/374793628187/locations/us-central1/datasets/8050261299738705920
To use this TabularDataset in another session:
ds = aiplatform.TabularDataset('projects/374793628187/locations/us-central1/datasets/8050261299738705920')


In [32]:
# Re-load the BigQuery-backed dataset through its resource name and display it.
dataset_resource_name = create_dataset_task.resource_name
dataset = aiplatform.TabularDataset(dataset_resource_name)
dataset

<google.cloud.aiplatform.datasets.tabular_dataset.TabularDataset object at 0x7f8b992932d0> 
resource name: projects/374793628187/locations/us-central1/datasets/8050261299738705920

In [33]:
# Show the class of the object returned by the lookup above.
type(dataset)

google.cloud.aiplatform.datasets.tabular_dataset.TabularDataset

### Reading the data

In [35]:
# Build the SQL that selects every column of the table, then echo it for inspection.
query = (
    "\n"
    "SELECT *\n"
    f"FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`\n"
)
print(query)


SELECT *
FROM `vertex-ai-demo-mlops.healthcare.stroke`



In [36]:
# Run the query, pull the result into a DataFrame, and show its dimensions.
query_job = bq.query(query)
df = query_job.to_dataframe()
df.shape

(5110, 12)

In [38]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,64128,Male,10.0,0,0,False,children,Urban,63.08,20.5,smokes,0
1,4833,Female,12.0,0,0,False,children,Rural,207.45,25.4,smokes,0
2,42821,Female,13.0,0,0,False,Private,Rural,60.69,24.0,smokes,0
3,37395,Female,16.0,0,0,False,Private,Urban,63.63,20.0,smokes,0
4,63312,Male,16.0,0,0,False,Private,Urban,80.55,23.5,smokes,0
