In [26]:
import os
import sys
import pandas as pd
import logging
from google.cloud import bigquery
from hashlib import md5
from typing import List
from google.cloud import bigquery
from google.oauth2 import service_account

key_path = "/root/.creds/kphil-sa.json"

credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

client = bigquery.Client(credentials=credentials, project=credentials.project_id,)


# **** SETUP ****

DATA_DIR = "../data/"
DEFAULT_SF_CRIME_FILE = os.path.join(DATA_DIR, "sf_crime_reports_2018_to_present.csv")
PROJECT_NAME = "team-week2"
DATASET_NAME = "sanfran"


# **** TABLE SCHEMAS ****

TABLE_METADATA = {
    'sf_crime': {
        'table_name': 'sf_crime',
        'schema': [
            # indexes are written if only named in the schema
            bigquery.SchemaField('product_id', 'int64', mode='REQUIRED'),
            bigquery.SchemaField('product_name', 'string', mode='REQUIRED'),
            bigquery.SchemaField('category', 'string', mode='NULLABLE'),
            bigquery.SchemaField('unit', 'string', mode='NULLABLE'),
            bigquery.SchemaField('unit_price', 'float', mode='REQUIRED'),
        ]
    }
}


# **** SETUP LOGGING ****
# setup logging and logger
logging.basicConfig(            # setting up the root logger
    format='[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s',
    level=logging.INFO,
    stream=sys.stdout
)
logger: logging.Logger = logging.getLogger('root')      # alias the root logger as `logger`
logger.setLevel(logging.DEBUG)                          # programmatically reassign the logging level


# **** BIGQUERY CLIENT ****
logger.debug(f"Creating bigquery client")
client = bigquery.Client()

logger.info(f"Setup Completed")

[DEBUG][2023-01-17 16:27:52,326][1878978208:0057] : Creating bigquery client
[INFO ][2023-01-17 16:27:52,358][1878978208:0060] : Setup Completed


In [27]:

# receipts data file name
filename = DEFAULT_SF_CRIME_FILE
logger.debug(f"attempting to process: {filename}")

# *** always perform checks first ***
assert os.path.exists(filename), f"Data file does not exists: '{filename}'"
# check if the file contains any data. Header row alone is 78 bytes. size > 78
assert os.path.getsize(filename) > 78, f"Data file size incorrect; does not seem to contain data: '{filename}'"

# load into dataframe
df = pd.read_csv(filename, header=0)
logger.info(f"loaded {len(df.index)} rows from: {filename}")

# *** always perform check at the end ***
# check schema: contains all expected columns?
# expected_columns = ['sale_id', 'receipt_num', 'receipt_date', 'name', 'address', 'product_name', 'category', 'amount', 'unit', 'unit_price']
# for col in expected_columns:
#     assert col in list(df.columns), f"Data file missing required column: {col}"

# assign & remember receipts dataframe
sf_crime_df = df
display(sf_crime_df.head())

[DEBUG][2023-01-17 16:29:21,801][1933141567:0003] : attempting to process: ../data/sf_crime_reports_2018_to_present.csv
[INFO ][2023-01-17 16:29:54,028][1933141567:0012] : loaded 680655 rows from: ../data/sf_crime_reports_2018_to_present.csv


Unnamed: 0,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Report Datetime,Row ID,Incident ID,Incident Number,CAD Number,...,Longitude,Point,Neighborhoods,ESNCAG - Boundary File,Central Market/Tenderloin Boundary Polygon - Updated,Civic Center Harm Reduction Project Boundary,HSOC Zones as of 2018-06-05,Invest In Neighborhoods (IIN) Areas,Current Supervisor Districts,Current Police Districts
0,2021/07/25 12:00:00 AM,2021/07/25,00:00,2021,Sunday,2021/07/25 01:41:00 PM,105718906372,1057189,216105573,,...,,,,,,,,,,
1,2022/06/28 11:58:00 PM,2022/06/28,23:58,2022,Tuesday,2022/06/28 11:58:00 PM,116554371012,1165543,220264913,,...,,,,,,,,,,
2,2022/03/11 10:30:00 AM,2022/03/11,10:30,2022,Friday,2022/03/11 08:03:00 PM,113048071000,1130480,226040232,,...,,,,,,,,,,
3,2021/05/15 05:47:00 PM,2021/05/15,17:47,2021,Saturday,2021/05/15 05:47:00 PM,103051807043,1030518,210183345,,...,,,,,,,,,,
4,2022/06/28 05:22:00 PM,2022/06/28,17:22,2022,Tuesday,2022/06/28 05:22:00 PM,116535107041,1165351,220361741,,...,,,,,,,,,,


In [None]:
# Create dataset

In [31]:
df = sf_crime_df
table_name = f"{PROJECT_NAME}.{DATASET_NAME}.{TABLE_METADATA['sf_crime']['table_name']}"

job_config = bigquery.LoadJobConfig(
    create_disposition = 'CREATE_IF_NEEDED', 
    write_disposition = 'WRITE_TRUNCATE',
    autodetect=True
    )
logger.info(f"loading table: '{table_name}'")
job = client.load_table_from_dataframe(df, destination=table_name, job_config=job_config)
job.result()        # wait for the job to finish
# get the resulting table
table = client.get_table(table_name)
logger.info(f"loaded {table.num_rows} rows into {table.full_table_id}")

[INFO ][2023-01-17 16:49:08,808][1673120583:0009] : loading table: 'team-week2.sanfran.sf_crime'


NotFound: 404 POST https://bigquery.googleapis.com/upload/bigquery/v2/projects/deb-01-372120/jobs?uploadType=resumable: Not found: Dataset team-week2:sanfran