In [7]:
import os
import sys
import pandas as pd
import logging
from google.cloud import bigquery
from google.cloud import bigquery
from google.oauth2 import service_account

# Location of the service-account key file. The standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable takes precedence so the
# notebook is not tied to one user's home directory; when it is unset or empty
# the original path is used, so existing setups keep working unchanged.
key_path = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or "/home/philiprobertovich/.creds/team-week3.json"
)

# Build service-account credentials from the key file, requesting the
# cloud-platform OAuth scope.
credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# BigQuery client that uses those credentials and their project id.
client = bigquery.Client(credentials=credentials, project=credentials.project_id)


# **** SETUP ****

# Presumably the BigQuery project and dataset identifiers (inferred from the
# names; nothing in this cell consumes them yet).
PROJECT_NAME = "team-week3"
TORNADOES_DATASET_NAME = "tornadoes"

# Local data directory and the default tornadoes CSV located inside it.
DATA_DIR = "./data/"
DEFAULT_TORNADOES_FILE = os.path.join(DATA_DIR, "tornadoes.csv")



# **** TABLE SCHEMAS ****

# Metadata per logical table: the table name plus its list of schema fields.
# NOTE(review): the only field declared is 'incident_number', which is not one
# of the columns the tornadoes CSV is validated against later in this
# notebook -- confirm the schema is complete before using it.
TABLE_METADATA = dict(
    tornadoes=dict(
        table_name='tornadoes',
        schema=[
            bigquery.SchemaField('incident_number', 'string', mode='REQUIRED'),
        ],
    ),
)


# **** SETUP LOGGING ****
# Configure the root logger: bracketed "[level][time][module:line]" prefix,
# INFO threshold, records written to stdout so they appear as cell output.
logging.basicConfig(
    format='[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s',
    level=logging.INFO,
    stream=sys.stdout,
)
# Alias the root logger as `logger`. getLogger() with no name always returns
# the root logger; getLogger('root') only does so on Python 3.9+, while older
# versions create a separate logger that merely happens to be named 'root'.
logger: logging.Logger = logging.getLogger()
# Programmatically lower the threshold from INFO to DEBUG.
logger.setLevel(logging.DEBUG)

# NOTE(review): the BigQuery client has already been constructed earlier in
# this cell, so this message is emitted after the fact.
logger.debug("Creating bigquery client")

logger.info("Setup Completed")

[DEBUG][2023-02-14 12:29:55,845][2275407497:0051] : Creating bigquery client
[INFO ][2023-02-14 12:29:55,856][2275407497:0053] : Setup Completed


In [8]:
# tornadoes data file name
filename = DEFAULT_TORNADOES_FILE

# Fail fast when the file is absent.
assert os.path.exists(filename), f"Data file does not exists: '{filename}'"
# Cheap size pre-check that rejects empty or truncated files. It cannot detect
# a header-only file: the column names this notebook requires add up to more
# than 100 bytes on their own, which already exceeds the 78-byte threshold.
# The row-count assertion after loading covers that case.
assert os.path.getsize(filename) > 78, f"Data file size incorrect; does not seem to contain data: '{filename}'"

# Load into a dataframe; the first line is the header. With
# on_bad_lines='warn', malformed rows are skipped with a warning instead of
# aborting the load.
df = pd.read_csv(
    filename,
    header=0,
    on_bad_lines='warn',
)
logger.info(f"loaded {len(df.index)} rows from {filename}")

# Authoritative emptiness check: a file holding only a header row parses
# successfully but yields zero rows.
assert len(df.index) > 0, f"Data file contains a header but no data rows: '{filename}'"


# check schema: contains all expected columns?
# NOTE(review): the logged dtypes of the loaded file also list an 'sg' column
# that is not required here -- confirm whether it was left out deliberately.
expected_columns = [
    'om',
    'yr',
    'mo',
    'dy',
    'date',
    'time',
    'tz',
    'st',
    'stf',
    'stn',
    'mag',
    'inj',
    'fat',
    'loss',
    'closs',
    'slat',
    'slon',
    'elat',
    'elon',
    'len',
    'wid',
    'ns',
    'sn',
    'f1',
    'f2',
    'f3',
    'f4',
    'fc'
]
# Build the column lookup once and gather every missing name, so a single
# failure reports all absent columns rather than only the first one found.
present_columns = set(df.columns)
missing_columns = [name for name in expected_columns if name not in present_columns]
assert not missing_columns, f"Data file missing required column: {', '.join(missing_columns)}"

# assign & remember tornadoes dataframe (same object as `df`, not a copy)
tornadoes_df = df
# Rich preview of the first ten rows; `display` is the notebook-provided helper.
display(tornadoes_df.head(10))

# log data column data types
logger.debug(df.dtypes)

[INFO ][2023-02-14 14:01:55,293][3571555385:0015] : loaded 68868 rows from ./data/tornadoes.csv


Unnamed: 0,om,yr,mo,dy,date,time,tz,st,stf,stn,...,len,wid,ns,sn,sg,f1,f2,f3,f4,fc
0,192,1950,10,1,1950-10-01,21:00:00,3,OK,40,23,...,15.8,10,1,1,1,25,0,0,0,0
1,193,1950,10,9,1950-10-09,02:15:00,3,NC,37,9,...,2.0,880,1,1,1,47,0,0,0,0
2,195,1950,11,20,1950-11-20,02:20:00,3,KY,21,1,...,0.1,10,1,1,1,177,0,0,0,0
3,196,1950,11,20,1950-11-20,04:00:00,3,KY,21,2,...,0.1,10,1,1,1,209,0,0,0,0
4,197,1950,11,20,1950-11-20,07:30:00,3,MS,28,14,...,2.0,37,1,1,1,101,0,0,0,0
5,194,1950,11,4,1950-11-04,17:00:00,3,PA,42,5,...,15.9,100,1,1,1,71,11,0,0,0
6,198,1950,12,2,1950-12-02,15:00:00,3,IL,17,7,...,18.8,50,1,1,1,119,117,0,0,0
7,199,1950,12,2,1950-12-02,16:00:00,3,IL,17,8,...,18.0,200,1,1,1,119,5,0,0,0
8,200,1950,12,2,1950-12-02,16:25:00,3,AR,5,12,...,7.8,10,1,1,1,65,0,0,0,0
9,201,1950,12,2,1950-12-02,17:30:00,3,IL,17,9,...,9.6,50,1,1,1,157,0,0,0,0


[DEBUG][2023-02-14 14:01:55,541][3571555385:0057] : om         int64
yr         int64
mo         int64
dy         int64
date      object
time      object
tz         int64
st        object
stf        int64
stn        int64
mag        int64
inj        int64
fat        int64
loss     float64
closs    float64
slat     float64
slon     float64
elat     float64
elon     float64
len      float64
wid        int64
ns         int64
sn         int64
sg         int64
f1         int64
f2         int64
f3         int64
f4         int64
fc         int64
dtype: object
