## Load Data

In [1]:
import os
import sys
import pandas as pd
import logging
from google.cloud import bigquery
from hashlib import md5
from typing import List
import json

# **** SETUP ****

# Change these to match your filesystem and GCP project.
DATA_DIR = "./data/air_travel/"
DEFAULT_RECEIPTS_FILE = os.path.join(DATA_DIR, "tickets.json")
PROJECT_NAME = "deb-dev-dw"
DATASET_NAME = "air_travel"

# The tickets file is read as newline-delimited JSON: one JSON document per line.
# Open the configured path (instead of a second hard-coded copy of it) so that
# editing DATA_DIR above is enough to relocate the input. The encoding is pinned
# so the read does not depend on the platform default, and blank lines are
# skipped because json.loads() raises on an empty string.
with open(DEFAULT_RECEIPTS_FILE, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# json_normalize flattens nested objects into dotted column names
# (e.g. "airline.name", "passenger.email") and already returns a DataFrame,
# so no further DataFrame constructor is needed.
df = pd.json_normalize(data)

display(df.head(n=10))

Unnamed: 0,eticket_num,confirmation,ticket_date,price,seat,status,airline.name,airline.iata,airline.icao,airline.callsign,...,passenger.last_name,passenger.gender,passenger.birth_date,passenger.email,passenger.street,passenger.city,passenger.state,passenger.zip,origin,destination
0,498-938211-0795,ZVFDC4,2022-03-23,723.42,31I,active,China Eastern Airlines,MU,CES,CHINA EASTERN,...,Brown,M,1969-02-17,robert.brown.69@hotmail.com,5007 Thomas Way,Lake Hollystad,DC,20027,,
1,482-850738-6048,IL5GUI,2022-03-23,765.18,29B,active,Hawaiian Airlines,HA,HAL,HAWAIIAN,...,Kent,F,1998-08-05,laura.kent.98@hotmail.com,13991 Davis Village,North Catherineborough,PA,16516,,
2,275-207321-8092,CYEFBC,2022-03-21,753.89,26I,active,Wizz Air,W6,WZZ,WIZZ AIR,...,Tucker,F,1965-01-22,lisa.tucker.65@hotmail.com,04135 Marvin Via,North Kristabury,MA,1093,,
3,246-793315-3102,ZNGPC2,2022-03-22,793.89,15A,active,AirAsia,AK,AXM,ASIAN EXPRESS,...,Yates,NB,1975-03-31,matthew.yates.75@yahoo.com,76045 Samantha Road Suite 111,Lake Jeffrey,DE,19898,,
4,091-128904-1226,MGSBD9,2022-03-24,820.25,17F,active,Xiamen Airlines,MF,CXA,XIAMEN AIR,...,Villanueva,NB,1945-08-14,megan.villanueva.45@hotmail.com,848 Melissa Springs Suite 947,Kellerstad,TX,76177,,
5,115-196069-8963,XFYQC0,2022-03-23,892.69,18C,active,Air New Zealand,NZ,ANZ,NEW ZEALAND,...,Hall,NB,1944-08-31,sarah.hall.44@gmail.com,75420 Michael Mountains Suite 485,New Victoria,HI,96727,,
6,396-673460-1326,N5UOOZ,2022-03-23,889.53,3C,active,Jeju Air,7C,JJA,JEJU AIR,...,Thompson,M,1968-05-02,seth.thompson.68@yahoo.com,22455 Higgins Junction Apt. 042,New Keith,OR,97405,,
7,380-894599-8109,PAA19Y,2022-03-22,706.78,7D,active,American Airlines,AA,AAL,AMERICAN,...,Garcia,F,1950-02-12,jennifer.garcia.50@gmail.com,6607 Sharp Common,Chadstad,VA,22121,,
8,614-960971-2686,EF4BHJ,2022-03-23,486.4,24J,active,Juneyao Airlines,HO,DKH,JUNEYAO AIRLINES,...,Clark,F,1991-11-09,becky.clark.91@gmail.com,691 Jones Cliffs,Michaelburgh,TX,76003,,
9,481-321233-0702,FVM9EE,2022-03-23,855.93,16A,active,Royal Air Maroc,AT,RAM,ROYALAIR MAROC,...,Cook,M,1976-07-29,ronald.cook.76@hotmail.com,93328 Davis Island,Rodriguezside,MD,21408,,


## Define table schema for dimension tables

In [None]:
# Schemas for the dimension tables.
#
# Each dimension is described below as compact (column, BigQuery type, mode) rows,
# then expanded into DIMS_TABLE_METADATA, which maps a table name to
# {'table_name': <name>, 'schema': [bigquery.SchemaField, ...]}.
#
# NOTE(review): 'airport_tz_timezone' is typed FLOAT, 'passenger_zip' is typed INTEGER
# and the date columns are typed STRING -- confirm these match the source data
# (e.g. a tz name would need STRING; ZIP codes with leading zeros do not survive INTEGER).
_DIMS_COLUMN_SPECS = {
    # for airlines
    'airlines': [
        ('airline_iata', 'STRING', 'REQUIRED'),
        ('airline_name', 'STRING', 'REQUIRED'),
        ('airline_icao', 'STRING', 'NULLABLE'),
        ('airline_callsign', 'STRING', 'NULLABLE'),
        ('airline_country', 'STRING', 'NULLABLE'),
    ],
    # for airports
    'airports': [
        ('airport_iata', 'STRING', 'REQUIRED'),
        ('airport_name', 'STRING', 'REQUIRED'),
        ('airport_city', 'STRING', 'NULLABLE'),
        ('airport_icao', 'STRING', 'NULLABLE'),
        ('airport_latitude', 'FLOAT', 'NULLABLE'),
        ('airport_longitude', 'FLOAT', 'NULLABLE'),
        ('airport_altitude', 'FLOAT', 'NULLABLE'),
        ('airport_tz_timezone', 'FLOAT', 'NULLABLE'),
    ],
    # for passengers
    'passengers': [
        ('passenger_id', 'STRING', 'REQUIRED'),
        ('passenger_email', 'STRING', 'REQUIRED'),
        ('passenger_first_name', 'STRING', 'NULLABLE'),
        ('passenger_last_name', 'STRING', 'NULLABLE'),
        ('passenger_birth_date', 'STRING', 'NULLABLE'),
        ('passenger_street', 'STRING', 'NULLABLE'),
        ('passenger_city', 'STRING', 'NULLABLE'),
        ('passenger_state', 'STRING', 'NULLABLE'),
        ('passenger_zip', 'INTEGER', 'NULLABLE'),
        ('effective_start_date', 'STRING', 'NULLABLE'),
        ('effective_end_date', 'STRING', 'NULLABLE'),
    ],
}

# Expand the specs; dict insertion order keeps the tables and columns in the order listed above.
DIMS_TABLE_METADATA = {
    table: {
        'table_name': table,
        'schema': [
            bigquery.SchemaField(column, field_type, mode=mode)
            for column, field_type, mode in columns
        ],
    }
    for table, columns in _DIMS_COLUMN_SPECS.items()
}