#### Define table schemas 

In [111]:
import logging
import os
import sys
from typing import List

import pandas as pd
from google.api_core.exceptions import Conflict
from google.cloud import bigquery
from google.oauth2 import service_account

# Location of the service-account key file. GOOGLE_APPLICATION_CREDENTIALS
# takes precedence when it is set (and non-empty) so the notebook is not tied
# to one developer's home directory; the original path remains the fallback.
key_path = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or "/home/alex/.creds/salex-sa.json"
)
credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# Root folder of the local CSV extracts (a relative path).
DATA_DIR = "../data"
# Project used when building fully-qualified dataset and table IDs.
PROJECT_NAME = "team-week2"

# **** TABLE SCHEMAS ****
def _schema_fields(required=(), nullable=()):
    """Build the SchemaField list for one table.

    Both arguments are sequences of (column_name, field_type) pairs. The
    REQUIRED columns are emitted first, followed by the NULLABLE ones, each
    group in the order given.
    """
    grouped = (('REQUIRED', required), ('NULLABLE', nullable))
    return [
        bigquery.SchemaField(column_name, field_type, mode=mode)
        for mode, columns in grouped
        for column_name, field_type in columns
    ]


# Keyed by table name; every entry carries the dataset the table lives in,
# the table name itself and the list of SchemaField objects describing it.
TABLE_METADATA = {
    # 'Raw' data (columns are only renamed)
    'chi_crimes': {
        'dataset_name': 'chicago',
        'table_name': 'chi_crimes',
        'schema': _schema_fields(
            required=[
                ('crime_id', 'INTEGER'),
                ('case_number', 'STRING'),
            ],
            nullable=[
                ('date_time', 'STRING'),
                ('block', 'STRING'),
                ('IUCR_code', 'STRING'),
                ('primary_IUCR', 'STRING'),
                ('secondary_IUCR', 'STRING'),
                ('location_description', 'STRING'),
                ('arrest', 'BOOL'),
                ('domestic', 'BOOL'),
                ('beat', 'INTEGER'),
                ('district', 'INTEGER'),
                ('ward', 'INTEGER'),
                ('community_area', 'INTEGER'),
                ('FBI_code', 'STRING'),
                ('x_coordinate', 'FLOAT'),
                ('y_coordinate', 'FLOAT'),
                ('year', 'INTEGER'),
                ('updated_on', 'STRING'),
                ('lat', 'FLOAT'),
                ('lon', 'FLOAT'),
                ('location', 'STRING'),
            ],
        ),
    },
    'den_crimes': {
        'dataset_name': 'denver',
        'table_name': 'den_crimes',
        'schema': _schema_fields(
            required=[
                ('incident_id', 'INTEGER'),
                ('offense_id', 'INTEGER'),
                ('offense_code', 'INTEGER'),
                ('offense_code_ext', 'INTEGER'),
                ('offense_type_id', 'STRING'),
                ('offense_cat_id', 'STRING'),
                ('first_occurrence', 'DATETIME'),
                ('last_occurrence', 'DATETIME'),
                ('reported_date', 'DATETIME'),
                ('address', 'STRING'),
                ('geo_x', 'FLOAT'),
                ('geo_y', 'FLOAT'),
                ('geo_lon', 'FLOAT'),
                ('geo_lat', 'FLOAT'),
                ('district_id', 'INTEGER'),
                ('precinct_id', 'INTEGER'),
                ('neighborhood_id', 'STRING'),
                ('is_crime', 'BOOL'),
                ('is_traffic', 'BOOL'),
                ('victim_count', 'INTEGER'),
            ],
        ),
    },
    'off_codes': {
        'dataset_name': 'denver',
        'table_name': 'off_codes',
        'schema': _schema_fields(
            required=[
                ('object_id', 'INTEGER'),
                ('offense_code', 'INTEGER'),
            ],
            nullable=[
                ('offense_code_ext', 'INTEGER'),
                ('offense_type_id', 'STRING'),
                ('offense_type_name', 'STRING'),
                ('offense_cat_id', 'STRING'),
                ('offense_cat_name', 'STRING'),
                ('is_crime', 'BOOL'),
                ('is_traffic', 'BOOL'),
            ],
        ),
    },
}

# **** BIGQUERY CLIENT ****
# The client's project is taken from the service-account credentials, whereas
# the dataset/table IDs built further down use PROJECT_NAME.
# NOTE(review): confirm both refer to the same project.
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

# **** CREATE TABLES  ****
# For every entry in TABLE_METADATA: make sure its dataset exists, create the
# table when it is missing, then print the column names and types that
# BigQuery reports for it.
for table_name, metadata in TABLE_METADATA.items():
  # Create dataset (if needed); exists_ok=True tolerates an existing dataset.
  dataset_name = metadata['dataset_name']
  dataset_id = f"{PROJECT_NAME}.{dataset_name}"
  dataset = bigquery.Dataset(dataset_id)
  dataset.location = "US"
  dataset = client.create_dataset(dataset, exists_ok=True)
  # Create table
  full_table_id = f"{PROJECT_NAME}.{dataset_name}.{table_name}"
  table = bigquery.Table(full_table_id, schema=metadata['schema'])
  try:
    client.create_table(table)
  except Conflict:
    # Only the "table already exists" conflict is expected here. Any other
    # failure (permissions, invalid schema, network) must surface instead of
    # being reported as an existing table.
    print(f"Did not create table {table_name}. Already exists?")
  else:
    print(f"Created table: {table_name}")
  # Read the table back so the printed schema is the one stored in BigQuery.
  table_ref = client.get_table(table)
  for column in table_ref.schema:
    print(f"\t{column.name}\t{column.field_type}")
  print("\n")


Created table: chi_crimes
	crime_id	INTEGER
	case_number	STRING
	date_time	STRING
	block	STRING
	IUCR_code	STRING
	primary_IUCR	STRING
	secondary_IUCR	STRING
	location_description	STRING
	arrest	BOOLEAN
	domestic	BOOLEAN
	beat	INTEGER
	district	INTEGER
	ward	INTEGER
	community_area	INTEGER
	FBI_code	STRING
	x_coordinate	FLOAT
	y_coordinate	FLOAT
	year	INTEGER
	updated_on	STRING
	lat	FLOAT
	lon	FLOAT
	location	STRING


Did not create table den_crimes. Already exists?
	incident_id	INTEGER
	offense_id	INTEGER
	offense_code	INTEGER
	offense_code_ext	INTEGER
	offense_type_id	STRING
	offense_cat_id	STRING
	first_occurrence	DATETIME
	last_occurrence	DATETIME
	reported_date	DATETIME
	address	STRING
	geo_x	FLOAT
	geo_y	FLOAT
	geo_lon	FLOAT
	geo_lat	FLOAT
	district_id	INTEGER
	precinct_id	INTEGER
	neighborhood_id	STRING
	is_crime	BOOLEAN
	is_traffic	BOOLEAN
	victim_count	INTEGER


Did not create table off_codes. Already exists?
	object_id	INTEGER
	offense_code	INTEGER
	offense_code_ext	INTEGER
	

In [112]:
# Load the 2018 Chicago crime CSV for inspection. The path is built from the
# DATA_DIR constant instead of repeating the "../data" literal.
df = pd.read_csv(f"{DATA_DIR}/chicago/2018.csv")

In [102]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,11552667,JC100123,01/01/2019 12:00:00 AM,004XX N STATE ST,890,THEFT,FROM BUILDING,RESTAURANT,False,False,...,42.0,8,06,1176302.0,1903096.0,2019,01/10/2019 03:16:50 PM,41.889453,-87.627995,"(41.889453169, -87.627994833)"
1,11552674,JC100085,01/01/2019 12:00:00 AM,092XX S NORMAL AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,21.0,73,07,1174586.0,1843723.0,2019,01/10/2019 03:16:50 PM,41.726566,-87.636066,"(41.726566477, -87.636065622)"
2,11552709,JC100020,01/01/2019 12:00:00 AM,044XX S WASHTENAW AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,...,15.0,58,08B,1159112.0,1875020.0,2019,01/10/2019 03:16:50 PM,41.81278,-87.691894,"(41.812780011, -87.691893746)"
3,11552758,JC100058,01/01/2019 12:00:00 AM,063XX S MARSHFIELD AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,False,...,16.0,67,14,1166414.0,1862607.0,2019,01/10/2019 03:16:50 PM,41.778565,-87.665464,"(41.77856457, -87.665463557)"
4,11553168,JC100745,01/01/2019 12:00:00 AM,008XX N MICHIGAN AVE,890,THEFT,FROM BUILDING,RESTAURANT,False,False,...,2.0,8,06,1177330.0,1906499.0,2019,01/10/2019 03:16:50 PM,41.898768,-87.624116,"(41.898767916, -87.624116333)"


In [113]:
# Disabled datetime conversion of the 'date_time' column.
# NOTE(review): the dtype listing for this frame shows the raw header 'Date',
# not 'date_time', so this line would need the column rename to run first.
# df['date_time'] = pd.to_datetime(df['date_time'])
# Show the dtype pandas inferred for each column.
df.dtypes

ID                        int64
Case Number              object
Date                     object
Block                    object
IUCR                     object
Primary Type             object
Description              object
Location Description     object
Arrest                     bool
Domestic                   bool
Beat                      int64
District                  int64
Ward                    float64
Community Area            int64
FBI Code                 object
X Coordinate            float64
Y Coordinate            float64
Year                      int64
Updated On               object
Latitude                float64
Longitude               float64
Location                 object
dtype: object

In [115]:
# NOTE(review): this bulk load is commented out. It reads the 2018-2022
# Chicago CSVs, lower-cases and renames their columns, and appends each file
# to chicago.chi_crimes. Because it uses WRITE_APPEND, re-running it would add
# the same rows again, which is presumably why it is disabled; confirm before
# re-enabling.
# job_config = bigquery.LoadJobConfig(
        # create_disposition="CREATE_IF_NEEDED",
        # write_disposition="WRITE_APPEND",
        # schema=TABLE_METADATA['chi_crimes']['schema']
#     )
# 
# for year in range(18,23):
#   filepath = f"{DATA_DIR}/chicago/20{year}.csv"
#   print(filepath)
#   df = pd.read_csv(filepath)
#   df.columns = [col.lower().replace(" ","_") for col in df.columns]
#   df.rename({'id':'crime_id', 
        #   'date':'date_time',
        #   'iucr':'IUCR_code',
        #   'primary_type':'primary_IUCR',
        #   'description':'secondary_IUCR',
        #   'fbi_code':'FBI_code',
        #   'latitude':'lat',
        #   'longitude':'lon'
        #   }, axis=1, inplace=True)
#   job = client.load_table_from_dataframe(df, destination=f'{PROJECT_NAME}.chicago.chi_crimes', job_config=job_config)
#   job.result()

../data/chicago/2018.csv
../data/chicago/2019.csv
../data/chicago/2020.csv
../data/chicago/2021.csv
../data/chicago/2022.csv


#### Create and load the staging table for Chicago crime data

In [53]:
# NOTE(review): disabled draft of a staging-table load followed by a MERGE
# into chicago.chi_crimes. Two things need fixing before it is re-enabled:
#   * the MERGE statement refers to columns (license_plate, vin, make, model,
#     first_name, ...) that are not part of the chi_crimes schema defined in
#     TABLE_METADATA;
#   * the final client.query(query) line sits outside the function body, so
#     `query` would not be defined there.
# def loadStagingTable(file_name): 
#   dataset_name = "chicago"
#   staging_table_name = "tmp_chi_crime"
#   full_table_id = f"{PROJECT_NAME}.{dataset_name}.{staging_table_name}"

#   df = pd.read_csv(file_name)
#   df[['created_at']] = None
#   df[['modified_at']] = None

#   job_config = bigquery.LoadJobConfig(
#     schema=TABLE_METADATA['chi_crimes']['schema'], 
#     create_disposition="CREATE_IF_NEEDED",
#     write_disposition="WRITE_TRUNCATE",
#     destination_table_description="Staging table for loading chicago crime data",
#   )

#   job = client.load_table_from_dataframe(df, full_table_id, job_config=job_config)
#   job.result()
#   print(f"loaded {job.output_rows} rows into {job.destination}")

#   query = f"""
# MERGE INTO team-week2.chicago.chi_crimes as trg
# USING team-week2.chicago.tmp_chi_crime as src
# ON
#   trg.license_plate = src.license_plate
# WHEN MATCHED THEN 
#   UPDATE SET year = src.year,
#   make = src.make,
#   model = src.model,
#   first_name = src.first_name,
#   last_name = src.last_name,
#   birth_date = src.birth_date,
#   street_address = src.street_address,
#   city = src.city,
#   state = src.state,
#   zip = src.zip, 
#   modified_at = CURRENT_TIMESTAMP
# WHEN NOT MATCHED THEN 
# INSERT (
#     license_plate,
#     vin,
#     year,
#     make,
#     model,
#     first_name,
#     last_name,
#     birth_date,
#     street_address,
#     city,
#     state,
#     zip, 
#     created_at
# ) VALUES (
#     src.license_plate,
#     src.vin,
#     src.year,
#     src.make,
#     src.model,
#     src.first_name,
#     src.last_name,
#     src.birth_date,
#     src.street_address,
#     src.city,
#     src.state,
#     src.zip, 
#     CURRENT_TIMESTAMP
#     ); 
# """
# result = client.query(query)


399572

In [47]:
# Read the Denver crime CSV, decoding it as Windows-1252. The path is built
# from the DATA_DIR constant instead of repeating the "../data" literal.
with open(f"{DATA_DIR}/denver/crime.csv", "r", encoding="windows-1252") as file:
  den_crimes = pd.read_csv(file)