In [1]:
import os
import sys
import pandas as pd
import logging
from google.cloud import bigquery
from hashlib import md5
from typing import List
from google.cloud import bigquery
from google.oauth2 import service_account

# Location of the service-account key file.
# The standard GOOGLE_APPLICATION_CREDENTIALS environment variable takes
# precedence when it is set, so the notebook is not tied to one machine's
# filesystem; otherwise the original path is used unchanged.
key_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", "/root/.creds/kphil-sa.json")

# Build explicit credentials from the key file, scoped to the cloud-platform API.
credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# BigQuery client used by the cells below; the project is taken from the key file.
client = bigquery.Client(credentials=credentials, project=credentials.project_id,)


# **** SETUP ****
# Paths and BigQuery identifiers shared by the cells below.

DATA_DIR = "../data/"
DEFAULT_BOSTON_CRIME_FILE = os.path.join(DATA_DIR, "boston_crime_data.csv")
DEFAULT_PORTLAND_CRIME_FILE = os.path.join(DATA_DIR, "portland_crime_data.csv")
# The San Francisco load cells read DEFAULT_SF_CRIME_FILE; without it a fresh
# kernel fails with NameError. The file name matches the one in the recorded
# load output.
DEFAULT_SF_CRIME_FILE = os.path.join(DATA_DIR, "sf_crime_reports_2018_to_present.csv")
PROJECT_NAME = "team-week2"
BOSTON_DATASET_NAME = "boston"
PORTLAND_DATASET_NAME = "portland"
SF_DATASET_NAME = "san_francisco"
# The dataset/table cells reference the generic DATASET_NAME; it points at the
# San Francisco dataset because that is the data those cells load.
DATASET_NAME = SF_DATASET_NAME


# **** TABLE SCHEMAS ****
# Per-table metadata keyed by a short identifier. Each entry carries the
# BigQuery table name and a list of explicit SchemaFields (the list may be
# empty when no explicit schema has been written yet).

TABLE_METADATA = {
    'boston_crime': {
        'table_name': 'boston_crime',
        'schema': [
            # indexes are written if only named in the schema
            # NOTE(review): field names have to match the normalised CSV header;
            # confirm the spelling of 'occured_on_date' and 'a_lat' against the file.
            # NOTE(review): REQUIRED modes reject rows with missing values; confirm
            # that district/street/coordinates are always populated.
            bigquery.SchemaField('incident_number', 'string', mode='REQUIRED'),
            bigquery.SchemaField('offense_code', 'int64', mode='REQUIRED'),
            bigquery.SchemaField('offense_code_group', 'string', mode='REQUIRED'),
            bigquery.SchemaField('offense_description', 'string', mode='NULLABLE'),
            bigquery.SchemaField('district', 'string', mode='REQUIRED'),
            bigquery.SchemaField('reporting_area', 'int64', mode='NULLABLE'),
            bigquery.SchemaField('shooting', 'string', mode='NULLABLE'),
            bigquery.SchemaField('occured_on_date', 'datetime', mode='REQUIRED'),
            # A bare year / month / hour is a number and a weekday is a label;
            # none of them is a full calendar date, so DATE cannot hold them.
            bigquery.SchemaField('year', 'int64', mode='REQUIRED'),
            bigquery.SchemaField('month', 'int64', mode='REQUIRED'),
            bigquery.SchemaField('day_of_week', 'string', mode='REQUIRED'),
            bigquery.SchemaField('hour', 'int64', mode='REQUIRED'),
            bigquery.SchemaField('ucr_part', 'string', mode='NULLABLE'),
            bigquery.SchemaField('street', 'string', mode='REQUIRED'),
            # Coordinates are fractional degrees, so an integer type would not fit.
            bigquery.SchemaField('a_lat', 'float64', mode='REQUIRED'),
            bigquery.SchemaField('long', 'float64', mode='REQUIRED'),
            # NOTE(review): presumably a textual "(lat, long)" pair - confirm against the CSV.
            bigquery.SchemaField('location', 'string', mode='REQUIRED')
        ]
    },
    'portland_crime': {
        'table_name': 'portland_crime',
        # TODO: no explicit schema has been written for the Portland data yet.
        'schema': []
    },
    'sf_crime': {
        # Looked up by the San Francisco load cell to build the destination table id.
        'table_name': 'sf_crime',
        # Left empty: the load cell does not pass an explicit schema.
        'schema': []
    }
}


# **** SETUP LOGGING ****
# Configure the root logger: records go to stdout with an INFO threshold and a
# "[LEVEL][timestamp][module:line] : message" layout.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s',
)
# `logger` is the handle the remaining cells log through; its own level is
# lowered to DEBUG right after it is obtained.
logger: logging.Logger = logging.getLogger('root')
logger.setLevel(logging.DEBUG)


# **** BIGQUERY CLIENT ****
# The `client` object itself is constructed near the top of this cell; only the
# progress messages are emitted here.
logger.debug("Creating bigquery client")

logger.info("Setup Completed")

[DEBUG][2023-01-23 03:31:22,241][3294770189:0082] : Creating bigquery client
[INFO ][2023-01-23 03:31:22,242][3294770189:0085] : Setup Completed


In [3]:

# Crime data file name (DEFAULT_SF_CRIME_FILE is expected to come from the setup cell)
filename = DEFAULT_SF_CRIME_FILE
logger.debug(f"attempting to process: {filename}")

# Check if file exists
assert os.path.exists(filename), f"Data file does not exists: '{filename}'"
# Reject files that are too small to contain anything beyond a header row.
# NOTE(review): the 78-byte threshold is hard-coded; confirm it matches this file's header size.
assert os.path.getsize(filename) > 78, f"Data file size incorrect; does not seem to contain data: '{filename}'"

# Load crimes file into dataframe
df = pd.read_csv(filename, header=0)
logger.info(f"loaded {len(df.index)} rows from: {filename}")

# Normalise the header: lower-case, then turn separators and parentheses into
# underscores. `regex` is passed explicitly because the default of
# `str.replace` differs between pandas versions: the alternation pattern must be
# treated as a regular expression, while '(' and ')' must be treated literally.
df.columns = (
    df.columns.str.lower()
    .str.replace('-|/| ', '_', regex=True)
    .str.replace('(', '_', regex=False)
    .str.replace(')', '_', regex=False)
)

# *** always perform check at the end ***
# check schema: contains the columns the rest of the notebook expects?
expected_columns = ['incident_datetime', 'incident_id', 'incident_number']
for col in expected_columns:
    assert col in list(df.columns), f"Data file missing required column: {col}"

# keep the San Francisco crime dataframe under its own name and preview it
sf_crime_df = df
sf_crime_df.head()

[DEBUG][2023-01-19 09:33:55,709][3064238265:0003] : attempting to process: ../data/sf_crime_reports_2018_to_present.csv
[INFO ][2023-01-19 09:34:00,283][3064238265:0012] : loaded 680655 rows from: ../data/sf_crime_reports_2018_to_present.csv


  
  from ipykernel import kernelapp as app
  


Unnamed: 0,incident_datetime,incident_date,incident_time,incident_year,incident_day_of_week,report_datetime,row_id,incident_id,incident_number,cad_number,...,longitude,point,neighborhoods,esncag___boundary_file,central_market_tenderloin_boundary_polygon___updated,civic_center_harm_reduction_project_boundary,hsoc_zones_as_of_2018_06_05,invest_in_neighborhoods__iin__areas,current_supervisor_districts,current_police_districts
0,2021/07/25 12:00:00 AM,2021/07/25,00:00,2021,Sunday,2021/07/25 01:41:00 PM,105718906372,1057189,216105573,,...,,,,,,,,,,
1,2022/06/28 11:58:00 PM,2022/06/28,23:58,2022,Tuesday,2022/06/28 11:58:00 PM,116554371012,1165543,220264913,,...,,,,,,,,,,
2,2022/03/11 10:30:00 AM,2022/03/11,10:30,2022,Friday,2022/03/11 08:03:00 PM,113048071000,1130480,226040232,,...,,,,,,,,,,
3,2021/05/15 05:47:00 PM,2021/05/15,17:47,2021,Saturday,2021/05/15 05:47:00 PM,103051807043,1030518,210183345,,...,,,,,,,,,,
4,2022/06/28 05:22:00 PM,2022/06/28,17:22,2022,Tuesday,2022/06/28 05:22:00 PM,116535107041,1165351,220361741,,...,,,,,,,,,,


In [4]:
# Make sure the destination dataset exists in the US location.
# `exists_ok=True` means an already-existing dataset is not treated as an error.
dataset_id = f"{PROJECT_NAME}.{DATASET_NAME}"
dataset_spec = bigquery.Dataset(dataset_id)
dataset_spec.location = "US"
dataset = client.create_dataset(dataset_spec, exists_ok=True)

# Report the id of the dataset object returned by the client.
logger.info(f"Created San Francisco dataset: {dataset.full_dataset_id}")

[INFO ][2023-01-19 09:34:02,010][3801838551:0007] : Created San Francisco dataset: team-week2:san_francisco


In [5]:
# Frame to upload and the fully-qualified destination (project.dataset.table).
df = sf_crime_df
table_name = f"{PROJECT_NAME}.{DATASET_NAME}.{TABLE_METADATA['sf_crime']['table_name']}"

# Load settings: create the table when it is missing, replace its contents
# when it exists, and let the service detect the schema.
job_config = bigquery.LoadJobConfig()
job_config.create_disposition = 'CREATE_IF_NEEDED'
job_config.write_disposition = 'WRITE_TRUNCATE'
job_config.autodetect = True

logger.info(f"loading table: '{table_name}'")
job = client.load_table_from_dataframe(df, destination=table_name, job_config=job_config)
# Block until the load job has completed.
job.result()

# Fetch the table back to report how many rows it now holds.
table = client.get_table(table_name)
logger.info(f"loaded {table.num_rows} rows into {table.full_table_id}")

[INFO ][2023-01-19 09:34:02,112][1673120583:0009] : loading table: 'team-week2.san_francisco.sf_crime'
[INFO ][2023-01-19 09:35:56,693][1673120583:0014] : loaded 680655 rows into team-week2:san_francisco.sf_crime


In [6]:
# Display the pandas dtype of each column of `df` (set to `sf_crime_df` in the load-table cell).
df.dtypes

incident_datetime                                        object
incident_date                                            object
incident_time                                            object
incident_year                                             int64
incident_day_of_week                                     object
report_datetime                                          object
row_id                                                    int64
incident_id                                               int64
incident_number                                           int64
cad_number                                              float64
report_type_code                                         object
report_type_description                                  object
filed_online                                             object
incident_code                                             int64
incident_category                                        object
incident_subcategory                    