In [22]:
from google.cloud import bigquery
from sklearn.datasets import load_breast_cancer
import pandas as pd
from datetime import datetime
# Shared BigQuery client used by every table-creation cell below. It is
# constructed with no arguments, so project and credentials are whatever the
# library resolves from the environment (NOTE(review): confirm the intended
# project is the one hard-coded in the table ids).
client = bigquery.Client()

## Create raw_data table

In [20]:
# Load the scikit-learn breast-cancer dataset. return_X_y=True yields a
# (features, target) pair and as_frame=True returns them as pandas objects.
data_X, data_y = load_breast_cancer(return_X_y=True, as_frame=True)

In [21]:
# Print column names, non-null counts and dtypes of the feature frame.
data_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [23]:
# Stamp every row with the current load time in UTC.
# datetime.now() returns a naive *local* time, and pd.to_datetime(..., utc=True)
# only labels a naive value as UTC without converting it, so the stored
# timestamp was off by the machine's UTC offset. Request a UTC-aware
# timestamp directly instead.
data_X['insert_date'] = pd.Timestamp.now(tz="UTC")
data_X.head(2)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,insert_date
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,2025-06-28 14:49:04.786939+00:00
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,2025-06-28 14:49:04.786939+00:00


In [25]:
# BigQuery schema for the raw table: every column is FLOAT64 except
# insert_date, which is a TIMESTAMP.
schema = [
    bigquery.SchemaField(name, "TIMESTAMP" if name == 'insert_date' else "FLOAT64")
    for name in data_X.columns
]
schema

[SchemaField('mean radius', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('mean texture', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('mean perimeter', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('mean area', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('mean smoothness', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('mean compactness', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('mean concavity', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('mean concave points', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('mean symmetry', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('mean fractal dimension', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('radius error', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('texture error', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('perimeter error', 'FLOAT64', 'NULLABLE', None, None, (), Non

In [27]:
# Fully-qualified destination in the form <project>.<dataset>.<table>.
table_id = "elated-effect-464110-f2.breast_cancer_dataset.raw_data"

# Table definition, partitioned by day on the insert_date TIMESTAMP column.
table = bigquery.Table(table_id, schema=schema)
table.time_partitioning = bigquery.TimePartitioning(
    type_=bigquery.TimePartitioningType.DAY,
    field="insert_date",
)

# exists_ok=True makes the cell safe to re-run (e.g. Restart & Run All):
# without it, create_table raises Conflict once the table exists. When the
# table is already there, the existing table is returned unchanged, so the
# message below is also printed for a pre-existing table.
table = client.create_table(table, exists_ok=True)
print(
    "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
)

Created table elated-effect-464110-f2.breast_cancer_dataset.raw_data


## Create input_data table

In [28]:
# Columns kept for the input_data table: eight feature columns plus the
# insert_date stamp.
selected_columns = [
    'mean radius',
    'mean area',
    'mean concave points',
    'mean fractal dimension',
    'concavity error',
    'concave points error',
    'worst radius',
    'worst area',
    'insert_date',
]
input_data = data_X[selected_columns]

In [29]:
# BigQuery schema for input_data: insert_date is a TIMESTAMP and every other
# column falls back to FLOAT64.
special_types = {'insert_date': "TIMESTAMP"}
schema = [
    bigquery.SchemaField(name, special_types.get(name, "FLOAT64"))
    for name in input_data.columns
]
schema

[SchemaField('mean radius', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('mean area', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('mean concave points', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('mean fractal dimension', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('concavity error', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('concave points error', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('worst radius', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('worst area', 'FLOAT64', 'NULLABLE', None, None, (), None),
 SchemaField('insert_date', 'TIMESTAMP', 'NULLABLE', None, None, (), None)]

In [30]:
# Fully-qualified destination in the form <project>.<dataset>.<table>.
table_id = "elated-effect-464110-f2.breast_cancer_dataset.input_data"

# Table definition, partitioned by day on the insert_date TIMESTAMP column.
table = bigquery.Table(table_id, schema=schema)
table.time_partitioning = bigquery.TimePartitioning(
    type_=bigquery.TimePartitioningType.DAY,
    field="insert_date",
)

# exists_ok=True makes the cell safe to re-run: without it, create_table
# raises Conflict once the table exists. When the table is already there, the
# existing table is returned unchanged, so the message below is also printed
# for a pre-existing table.
table = client.create_table(table, exists_ok=True)
print(
    "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
)

Created table elated-effect-464110-f2.breast_cancer_dataset.input_data


## Create predictions table (final_table)

In [14]:
# Extend the current schema with a BOOL 'predictions' column for the final
# table. The guard keeps the cell idempotent: re-running it used to append a
# second 'predictions' field to the same list.
if all(field.name != 'predictions' for field in schema):
    schema.append(bigquery.SchemaField('predictions', "BOOL"))

In [15]:
# Fully-qualified destination in the form <project>.<dataset>.<table>.
table_id = "elated-effect-464110-f2.breast_cancer_dataset.final_table"

# NOTE(review): unlike raw_data and input_data, no time partitioning is
# configured for this table. Confirm that is intentional.
table = bigquery.Table(table_id, schema=schema)

# exists_ok=True makes the cell safe to re-run: without it, create_table
# raises Conflict once the table exists. When the table is already there, the
# existing table is returned unchanged, so the message below is also printed
# for a pre-existing table.
table = client.create_table(table, exists_ok=True)
print(
    "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
)

Created table elated-effect-464110-f2.breast_cancer_dataset.final_table


In [16]:
# Render the eight selected feature columns as one comma-separated string.
feature_names = [
    'mean radius',
    'mean area',
    'mean concave points',
    'mean fractal dimension',
    'concavity error',
    'concave points error',
    'worst radius',
    'worst area',
]
', '.join(feature_names)

'mean radius, mean area, mean concave points, mean fractal dimension, concavity error, concave points error, worst radius, worst area'