In [7]:
import os
import sys
import pandas as pd
import logging
from google.cloud import bigquery
from google.cloud import bigquery
from google.oauth2 import service_account

# Location of the service-account key file. The GOOGLE_APPLICATION_CREDENTIALS
# environment variable takes precedence so the notebook is not tied to a single
# developer's home directory; the original path is kept as the fallback, so
# behaviour is unchanged when the variable is not set.
key_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/philiprobertovich/.creds/team-week3.json",
)

# Build service-account credentials from the key file, requesting the
# cloud-platform scope.
credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# BigQuery client authenticated with those credentials and bound to the
# project id that the credentials object exposes.
client = bigquery.Client(credentials=credentials, project=credentials.project_id,)


# **** SETUP ****
# Notebook-wide constants: BigQuery resource names first, then input paths.

PROJECT_NAME = 'team-week3'
TORNADOES_DATASET_NAME = 'tornadoes'

# Directory holding the input files, and the tornadoes CSV inside it.
DATA_DIR = './data/'
DEFAULT_TORNADOES_FILE = os.path.join(DATA_DIR, 'tornadoes.csv')



# **** TABLE SCHEMAS ****
# Per-table metadata keyed by a short table id: the BigQuery table name and
# the (possibly partial) list of schema fields declared for it.
#
# NOTE(review): the only declared field is 'incident_number', which is not one
# of the tornado columns shown in this notebook -- confirm it is intended.

TABLE_METADATA = dict(
    tornadoes=dict(
        table_name='tornadoes',
        schema=[
            # a dataframe index is written only if it is named in the schema
            bigquery.SchemaField('incident_number', 'string', mode='REQUIRED'),
        ],
    ),
)


# **** SETUP LOGGING ****
# Configure the root logger: records go to stdout, formatted as
# [LEVEL][timestamp][module:line] : message.
logging.basicConfig(
    format='[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s',
    level=logging.INFO,
    stream=sys.stdout
)
# Alias the root logger as `logger`. Calling getLogger() without a name always
# returns the real root logger; getLogger('root') only does so on Python 3.9+
# and returns a separate child logger named "root" on older versions.
logger: logging.Logger = logging.getLogger()
# Lower the threshold below the INFO level passed to basicConfig so that
# debug records are emitted as well.
logger.setLevel(logging.DEBUG)

logger.debug("Creating bigquery client")

logger.info("Setup Completed")

[DEBUG][2023-02-14 12:29:55,845][2275407497:0051] : Creating bigquery client
[INFO ][2023-02-14 12:29:55,856][2275407497:0053] : Setup Completed


In [25]:
# tornadoes data file name
filename = DEFAULT_TORNADOES_FILE

# Guard 1: the data file has to be present on disk.
assert os.path.exists(filename), f"Data file does not exists: '{filename}'"
# Guard 2: the file has to be larger than 78 bytes, the size the author
# attributes to a header-only file, i.e. it must hold at least some data.
assert 78 < os.path.getsize(filename), f"Data file size incorrect; does not seem to contain data: '{filename}'"

# Parse the CSV into a dataframe: the first row is the header, and malformed
# lines are reported as warnings rather than aborting the load.
df = pd.read_csv(filename, header=0, on_bad_lines='warn')
logger.info(f"loaded {len(df)} rows from {filename}")


# check schema: contains all expected columns?
# 'sg' is listed as well because the column-dropping cell further down drops
# it; without the check a file lacking 'sg' would pass here and then fail
# later with a KeyError.
expected_columns = [
    'om',
    'yr',
    'mo',
    'dy',
    'date',
    'time',
    'tz',
    'st',
    'stf',
    'stn',
    'mag',
    'inj',
    'fat',
    'loss',
    'closs',
    'slat',
    'slon',
    'elat',
    'elon',
    'len',
    'wid',
    'ns',
    'sn',
    'sg',
    'f1',
    'f2',
    'f3',
    'f4',
    'fc'
]
# Collect every absent column in a single pass so that one failure names all
# of them instead of only the first one encountered.
missing_columns = [col for col in expected_columns if col not in df.columns]
assert not missing_columns, f"Data file missing required column: {', '.join(missing_columns)}"

# preview the last rows as a visual check of the load
display(df.tail(10))

# TODO: log data column data types

[INFO ][2023-02-14 15:59:21,331][304960853:0015] : loaded 68868 rows from ./data/tornadoes.csv


Unnamed: 0,om,yr,mo,dy,date,time,tz,st,stf,stn,...,len,wid,ns,sn,sg,f1,f2,f3,f4,fc
68858,620518,2021,9,1,2021-09-01,17:03:00,3,PA,42,0,...,1.14,200,2,1,2,17,0,0,0,0
68859,620519,2021,9,1,2021-09-01,17:32:00,3,NJ,34,0,...,2.61,100,1,1,1,21,0,0,0,0
68860,620520,2021,9,1,2021-09-01,23:30:00,3,MA,25,0,...,0.1,15,1,1,1,1,0,0,0,0
68861,620521,2021,9,2,2021-09-02,18:28:00,3,UT,49,0,...,2.05,350,1,1,1,11,0,0,0,0
68862,620522,2021,9,7,2021-09-07,17:45:00,3,MI,26,0,...,5.33,450,1,1,1,59,0,0,0,0
68863,620523,2021,9,8,2021-09-08,15:32:00,3,PA,42,0,...,0.73,30,1,1,1,109,0,0,0,0
68864,620524,2021,9,8,2021-09-08,15:35:00,3,PA,42,0,...,1.45,30,1,1,1,109,0,0,0,0
68865,620525,2021,9,8,2021-09-08,19:08:00,3,FL,12,0,...,1.45,50,1,1,1,129,0,0,0,0
68866,620526,2021,9,9,2021-09-09,02:10:00,3,CT,9,0,...,0.75,75,1,1,1,13,0,0,0,0
68867,620527,2021,9,9,2021-09-09,17:58:00,3,CA,6,0,...,1.79,9,1,1,1,37,0,0,0,0


In [26]:
# Drop columns
# Columns removed from the dataframe before the remaining ones are renamed.
drop_cols = [
  'stf',
  'stn',
  'ns',
  'sn',
  'sg',
  'f1',
  'f2',
  'f3',
  'f4',
  'fc',
]

# errors='ignore' drops only the labels that are still present, which makes
# this cell safe to re-run: once the columns are gone, a second execution is a
# no-op instead of raising KeyError.
df = df.drop(columns=drop_cols, errors='ignore')

logger.info('Dropped columns.')

df.head()

[INFO ][2023-02-14 15:59:24,134][2866547761:0017] : Dropped columns.


Unnamed: 0,om,yr,mo,dy,date,time,tz,st,mag,inj,fat,loss,closs,slat,slon,elat,elon,len,wid
0,192,1950,10,1,1950-10-01,21:00:00,3,OK,1,0,0,4.0,0.0,36.73,-102.52,36.88,-102.3,15.8,10
1,193,1950,10,9,1950-10-09,02:15:00,3,NC,3,3,0,5.0,0.0,34.17,-78.6,0.0,0.0,2.0,880
2,195,1950,11,20,1950-11-20,02:20:00,3,KY,2,0,0,5.0,0.0,37.37,-87.2,0.0,0.0,0.1,10
3,196,1950,11,20,1950-11-20,04:00:00,3,KY,1,0,0,5.0,0.0,38.2,-84.5,0.0,0.0,0.1,10
4,197,1950,11,20,1950-11-20,07:30:00,3,MS,1,3,0,4.0,0.0,32.42,-89.13,0.0,0.0,2.0,37


In [27]:
# Rename columns
# Mapping from the abbreviated source column names to descriptive ones;
# columns that are not listed keep their current name.
renamed = dict(
    om='tornado_number',
    yr='year',
    mo='month',
    dy='day',
    tz='timezone',
    st='state',
    mag='magnitude',
    inj='injuries',
    fat='fatalities',
    loss='property_loss',
    closs='crop_loss',
    slat='starting_lat',
    slon='starting_lon',
    elat='end_lat',
    elon='end_lon',
    len='length',
    wid='width',
)

# Apply the mapping along the column axis and rebind `df` to the result.
df = df.rename(renamed, axis='columns')

logger.info('Renamed columns.')

df.head()

[INFO ][2023-02-14 15:59:41,104][1905297358:0024] : Renamed columns.


Unnamed: 0,tornado_number,year,month,day,date,time,timezone,state,magnitude,injuries,fatalities,property_loss,crop_loss,starting_lat,starting_lon,end_lat,end_lon,length,width
0,192,1950,10,1,1950-10-01,21:00:00,3,OK,1,0,0,4.0,0.0,36.73,-102.52,36.88,-102.3,15.8,10
1,193,1950,10,9,1950-10-09,02:15:00,3,NC,3,3,0,5.0,0.0,34.17,-78.6,0.0,0.0,2.0,880
2,195,1950,11,20,1950-11-20,02:20:00,3,KY,2,0,0,5.0,0.0,37.37,-87.2,0.0,0.0,0.1,10
3,196,1950,11,20,1950-11-20,04:00:00,3,KY,1,0,0,5.0,0.0,38.2,-84.5,0.0,0.0,0.1,10
4,197,1950,11,20,1950-11-20,07:30:00,3,MS,1,3,0,4.0,0.0,32.42,-89.13,0.0,0.0,2.0,37
