## Setup

In [1]:
import os
import sys
import logging
import pandas as pd
from google.cloud import bigquery
from hashlib import md5
from typing import List


# **** SETUP ****
# Identifiers of the Google Cloud project and of the BigQuery dataset;
# the two are combined into the fully-qualified dataset id further down.
PROJECT_NAME: str = "deb-01-372112"
DATASET_NAME: str = "tickets"


# **** TABLE SCHEMAS ****
# One entry per destination table, keyed by a short logical name.
# Each entry carries:
#   'table_name' : name of the BigQuery table
#   'schema'     : ordered list of bigquery.SchemaField column definitions

TABLE_METADATA = {
    'airlines': {
        'table_name': 'airlines',
        'schema': [
            # index columns are written only if they are named in the schema
            bigquery.SchemaField('airline_iata', 'string', mode='REQUIRED'),
            bigquery.SchemaField('airline_name', 'string', mode='REQUIRED'),
            bigquery.SchemaField('airline_icao', 'string', mode='REQUIRED'),
            bigquery.SchemaField('airline_callsign', 'string', mode='REQUIRED'),
            bigquery.SchemaField('airline_country', 'string', mode='REQUIRED'),
        ],
    },
    'airports': {
        'table_name': 'airports',
        'schema': [
            # index columns are written only if they are named in the schema
            bigquery.SchemaField('airport_iata', 'string', mode='REQUIRED'),
            bigquery.SchemaField('airport_city', 'string', mode='REQUIRED'),
            # the only NULLABLE column in these schemas
            bigquery.SchemaField('airport_country', 'string', mode='NULLABLE'),
            bigquery.SchemaField('airport_name', 'string', mode='REQUIRED'),
            bigquery.SchemaField('airport_icao', 'string', mode='REQUIRED'),
            bigquery.SchemaField('airport_latitude', 'float', mode='REQUIRED'),
            bigquery.SchemaField('airport_longitude', 'float', mode='REQUIRED'),
            bigquery.SchemaField('airport_altitude', 'int64', mode='REQUIRED'),
            bigquery.SchemaField('airport_tz_timezone', 'string', mode='REQUIRED'),
        ],
    },
    'passengers': {
        'table_name': 'passengers',
        'schema': [
            bigquery.SchemaField('passenger_sk', 'string', mode='REQUIRED'),
            bigquery.SchemaField('first_name', 'string', mode='REQUIRED'),
            bigquery.SchemaField('last_name', 'string', mode='REQUIRED'),
            bigquery.SchemaField('gender', 'string', mode='REQUIRED'),
            # 'date_time' is not a BigQuery column type; the date-and-time type is DATETIME.
            # NOTE(review): if birth_date carries no time-of-day component, 'date' may be
            # the better fit -- confirm against the data that gets loaded.
            bigquery.SchemaField('birth_date', 'datetime', mode='REQUIRED'),
            bigquery.SchemaField('email', 'string', mode='REQUIRED'),
            bigquery.SchemaField('street', 'string', mode='REQUIRED'),
            bigquery.SchemaField('city', 'string', mode='REQUIRED'),
            bigquery.SchemaField('state', 'string', mode='REQUIRED'),
            bigquery.SchemaField('zip', 'string', mode='REQUIRED'),
            bigquery.SchemaField('start_date', 'datetime', mode='REQUIRED'),
            bigquery.SchemaField('end_date', 'datetime', mode='REQUIRED'),
        ],
    },
}


# **** SETUP LOGGING ****
# Record layout: [LEVEL][timestamp][module:line] : message
_LOG_FORMAT = '[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s'

# Configure the root logger: records go to stdout, initial threshold is INFO.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=_LOG_FORMAT)

# `logger` is the handle used for logging from here on; it is looked up under
# the name 'root' and its threshold is then lowered to DEBUG programmatically.
logger: logging.Logger = logging.getLogger('root')
logger.setLevel(logging.DEBUG)


# **** BIGQUERY CLIENT ****
# The client is constructed without arguments, so the project and credentials
# it uses are whatever the client library resolves by default.
logger.debug("Creating bigquery client")
client = bigquery.Client()

logger.info("Setup Completed")

[DEBUG][2023-01-08 13:37:22,971][4118340907:0082] : Creating bigquery client
[INFO ][2023-01-08 13:37:22,985][4118340907:0085] : Setup Completed


## Create Tickets' BQ Dataset

In [2]:
# Fully-qualified dataset id in "<project>.<dataset>" form.
dataset_id = ".".join([PROJECT_NAME, DATASET_NAME])

# Describe the dataset locally, pin its location, then ask BigQuery to create it.
# exists_ok=True is passed so that a dataset that already exists is not treated as an error.
dataset = bigquery.Dataset(dataset_id)
dataset.location = "US"
dataset = client.create_dataset(dataset, exists_ok=True)

logger.info(f"Created Tickets dataset: {dataset.full_dataset_id}")

[INFO ][2023-01-08 13:43:54,248][2665781056:0006] : Created Tickets dataset: deb-01-372112:tickets


## Load original tickets file

In [6]:
# Location of the input data -- change to match your filesystem.
# NOTE(review): the names below say "receipts" although the file holds tickets;
# they are kept unchanged here.
DATA_DIR = "./data/"
DEFAULT_RECEIPTS_FILE = os.path.join(DATA_DIR, "tickets.json")

# file processed by this cell
filename = DEFAULT_RECEIPTS_FILE
logger.debug(f"attempting to process: {filename}")

# *** pre-checks: fail fast before parsing anything ***
# the file has to exist ...
assert os.path.exists(filename), f"Data file does not exists: '{filename}'"
# ... and has to be larger than 78 bytes.
# NOTE(review): the file is read as JSON Lines, which has no header row; the
# 78-byte threshold looks like a minimum-size heuristic -- confirm its origin.
assert os.path.getsize(filename) > 78, f"Data file size incorrect; does not seem to contain data: '{filename}'"

# parse the file as JSON Lines (lines=True: one JSON document per line)
df = pd.read_json(filename, lines=True)
logger.info(f"loaded {len(df.index)} rows from: {filename}")

# *** post-check: the loaded frame must expose every expected column ***
expected_columns = ['eticket_num', 'confirmation', 'ticket_date', 'price', 'seat', 'status', 'airline', 'origin', 'destination', 'passenger']
missing_columns = [name for name in expected_columns if name not in df.columns]
# report the first missing column, in the order listed above
assert not missing_columns, f"Data file missing required column: {missing_columns[0]}"

# keep the loaded frame under a dedicated name and preview the first 10 rows
receipts_df = df
display(receipts_df.head(n=10))

[DEBUG][2023-01-08 14:01:39,509][420054589:0007] : attempting to process: ./data/tickets.json
[INFO ][2023-01-08 14:01:39,878][420054589:0017] : loaded 4096 rows from: ./data/tickets.json


Unnamed: 0,eticket_num,confirmation,ticket_date,price,seat,status,airline,origin,destination,passenger
0,498-938211-0795,ZVFDC4,2022-03-23,723.42,31I,active,"{'name': 'China Eastern Airlines', 'iata': 'MU...",{'name': 'Montreal / Pierre Elliott Trudeau In...,{'name': 'Chicago Midway International Airport...,"{'first_name': 'Robert', 'last_name': 'Brown',..."
1,482-850738-6048,IL5GUI,2022-03-23,765.18,29B,active,"{'name': 'Hawaiian Airlines', 'iata': 'HA', 'i...","{'name': 'Longdongbao Airport', 'city': 'Guiya...","{'name': 'Ninoy Aquino International Airport',...","{'first_name': 'Laura', 'last_name': 'Kent', '..."
2,275-207321-8092,CYEFBC,2022-03-21,753.89,26I,active,"{'name': 'Wizz Air', 'iata': 'W6', 'icao': 'WZ...",{'name': 'Licenciado Gustavo Díaz Ordaz Intern...,"{'name': 'Ibiza Airport', 'city': 'Ibiza', 'co...","{'first_name': 'Lisa', 'last_name': 'Tucker', ..."
3,246-793315-3102,ZNGPC2,2022-03-22,793.89,15A,active,"{'name': 'AirAsia', 'iata': 'AK', 'icao': 'AXM...","{'name': 'El Tepual Airport', 'city': 'Puerto ...","{'name': 'Gdańsk Lech Wałęsa Airport', 'city':...","{'first_name': 'Matthew', 'last_name': 'Yates'..."
4,091-128904-1226,MGSBD9,2022-03-24,820.25,17F,active,"{'name': 'Xiamen Airlines', 'iata': 'MF', 'ica...",{'name': 'Baltimore/Washington International T...,"{'name': 'London Gatwick Airport', 'city': 'Lo...","{'first_name': 'Megan', 'last_name': 'Villanue..."
5,115-196069-8963,XFYQC0,2022-03-23,892.69,18C,active,"{'name': 'Air New Zealand', 'iata': 'NZ', 'ica...","{'name': 'Platov International Airport', 'city...",{'name': 'Cincinnati Northern Kentucky Interna...,"{'first_name': 'Sarah', 'last_name': 'Hall', '..."
6,396-673460-1326,N5UOOZ,2022-03-23,889.53,3C,active,"{'name': 'Jeju Air', 'iata': '7C', 'icao': 'JJ...",{'name': 'Winnipeg / James Armstrong Richardso...,"{'name': 'Naha Airport', 'city': 'Okinawa', 'c...","{'first_name': 'Seth', 'last_name': 'Thompson'..."
7,380-894599-8109,PAA19Y,2022-03-22,706.78,7D,active,"{'name': 'American Airlines', 'iata': 'AA', 'i...","{'name': 'Ontario International Airport', 'cit...","{'name': 'Yangon International Airport', 'city...","{'first_name': 'Jennifer', 'last_name': 'Garci..."
8,614-960971-2686,EF4BHJ,2022-03-23,486.4,24J,active,"{'name': 'Juneyao Airlines', 'iata': 'HO', 'ic...",{'name': 'Luis Munoz Marin International Airpo...,"{'name': 'Koltsovo Airport', 'city': 'Yekateri...","{'first_name': 'Becky', 'last_name': 'Clark', ..."
9,481-321233-0702,FVM9EE,2022-03-23,855.93,16A,active,"{'name': 'Royal Air Maroc', 'iata': 'AT', 'ica...","{'name': 'Edmonton International Airport', 'ci...","{'name': 'Taiyuan Wusu Airport', 'city': 'Taiy...","{'first_name': 'Ronald', 'last_name': 'Cook', ..."
