In [1]:
# !pip install --upgrade google-cloud-bigquery
# !pip install --upgrade google-cloud-storage
# !pip install db-dtypes

In [2]:
import os
from google.cloud import storage
from google.cloud import bigquery

In [3]:
# Relative path to the service-account key file used for authentication
key = "./ml-pipeline-key.json"

In [4]:
# Option 1: expose the key file to the Google client libraries through
# the process environment
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": key})

In [5]:
# Build a storage client without passing credentials explicitly
client = storage.Client()

# Materialise the bucket listing, then print one bucket name per line
buckets = [*client.list_buckets()]
for bucket in buckets:
    print(bucket.name)

In [6]:
# Option 2: leave the environment untouched and hand the key file
# directly to the client factory
client = storage.Client.from_service_account_json(json_credentials_path=key)

---
# Let's perform some queries

In [8]:
# Create a BigQuery client from the service-account key file
client = bigquery.Client.from_service_account_json(key)

# SQL to run: the first 10 rows of the PATIENTS table
query = """
    SELECT *
    FROM `MIMIC.PATIENTS`
    LIMIT 10
"""

# Submit the query as a job
query_job = client.query(query)

# Block until the job finishes; `results` is iterated row by row below
results = query_job.result()

# Build a pandas DataFrame from the job (used by the following cell)
df = query_job.to_dataframe()

# Print every returned row
for row in results:
    print(row)

Row((234, 249, 'F', datetime.datetime(2075, 3, 13, 0, 0, tzinfo=datetime.timezone.utc), None, None, None, 0), {'ROW_ID': 0, 'SUBJECT_ID': 1, 'GENDER': 2, 'DOB': 3, 'DOD': 4, 'DOD_HOSP': 5, 'DOD_SSN': 6, 'EXPIRE_FLAG': 7})
Row((238, 253, 'F', datetime.datetime(2089, 11, 26, 0, 0, tzinfo=datetime.timezone.utc), None, None, None, 0), {'ROW_ID': 0, 'SUBJECT_ID': 1, 'GENDER': 2, 'DOB': 3, 'DOD': 4, 'DOD_HOSP': 5, 'DOD_SSN': 6, 'EXPIRE_FLAG': 7})
Row((242, 258, 'F', datetime.datetime(2124, 9, 19, 0, 0, tzinfo=datetime.timezone.utc), None, None, None, 0), {'ROW_ID': 0, 'SUBJECT_ID': 1, 'GENDER': 2, 'DOB': 3, 'DOD': 4, 'DOD_HOSP': 5, 'DOD_SSN': 6, 'EXPIRE_FLAG': 7})
Row((243, 260, 'F', datetime.datetime(2105, 3, 23, 0, 0, tzinfo=datetime.timezone.utc), None, None, None, 0), {'ROW_ID': 0, 'SUBJECT_ID': 1, 'GENDER': 2, 'DOB': 3, 'DOD': 4, 'DOD_HOSP': 5, 'DOD_SSN': 6, 'EXPIRE_FLAG': 7})
Row((247, 264, 'F', datetime.datetime(2162, 11, 30, 0, 0, tzinfo=datetime.timezone.utc), None, None, None, 0), 

In [11]:
# Preview the first five rows of the query result
df.head(5)

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,234,249,F,2075-03-13 00:00:00+00:00,NaT,NaT,NaT,0
1,238,253,F,2089-11-26 00:00:00+00:00,NaT,NaT,NaT,0
2,242,258,F,2124-09-19 00:00:00+00:00,NaT,NaT,NaT,0
3,243,260,F,2105-03-23 00:00:00+00:00,NaT,NaT,NaT,0
4,247,264,F,2162-11-30 00:00:00+00:00,NaT,NaT,NaT,0


---
## Another approach

In [33]:
# !pip install --upgrade bigframes
# !pip install dask dask-bigquery google-cloud-bigquery
# !pip install modin
# !pip install google-cloud-bigquery
# !pip install --upgrade pandas==2.2.*

In [12]:
import bigframes as bf
import bigframes.pandas as bpd

In [13]:
import pandas as pd
import dask.dataframe as dd

In [14]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, mean as count, when, udf, regexp_replace, coalesce, lit
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F

In [16]:
import matplotlib.pyplot as plt
import math
from datetime import datetime
import seaborn as sns

In [17]:
# Define the project's settings for BigQuery DataFrames
project = "ml-pipeline-455620"
# The session option is spelled `project`; any other attribute name is not
# the project setting, so the session would not be configured with it.
bf.options.bigquery.project = project
bf.options.bigquery.location = "EU"

In [18]:
# Let's load the PATIENTS table as a BigQuery DataFrame and preview it
patients_table = f"{project}.MIMIC.PATIENTS"
patients = bpd.read_gbq(patients_table)
patients.head(5)

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,33888,51786,F,2090-04-16 00:00:00+00:00,,,,0
1,23653,25061,M,2114-07-11 00:00:00+00:00,2192-12-02 00:00:00+00:00,2192-12-02 00:00:00+00:00,2192-12-02 00:00:00+00:00,1
2,44113,90788,M,2094-04-24 00:00:00+00:00,,,,0
3,3452,3652,F,2042-07-26 00:00:00+00:00,2123-10-14 00:00:00+00:00,2123-10-14 00:00:00+00:00,2123-10-14 00:00:00+00:00,1
4,10760,11381,M,2143-04-22 00:00:00+00:00,,,,0


In [19]:
# Show the (rows, columns) dimensions of the patients frame
patients.shape

(46520, 8)

In [20]:
# Close the current BigQuery DataFrames session now that we are done with it
bpd.close_session()