In [1]:
import os
import pandas as pd
import os
from pathlib import Path
import glob
import re
from io import BytesIO
import bz2
from datetime import datetime, time
from sql_queries import planes_table_insert,  carriers_table_insert, airports_table_insert, on_time_table_insert
from aws_utils import *
from create_tables import *

In [2]:
def timeFloatToDatetime(timeFloat):
    if pd.notna(timeFloat):        
        hour = int(timeFloat // 100) % 24
        minute = int(timeFloat % 100)
        
        return time(hour = hour, minute = minute)
    return timeFloat

In [3]:
def process_csv_file(file_name, cur, insert_query):
    df = pd.read_csv(file_name)
    cur.executemany(insert_query, df.values.tolist())

In [4]:
def process_bz2_file(file_name, cur, insert_query):
    with bz2.open(file_name, "rb") as f:
        data = f.read()
        df = pd.read_csv(BytesIO(data), encoding='latin_1')
        cur.executemany(insert_query, df.values.tolist())
        
#         df.DepTime = df.DepTime.apply(timeFloatToDatetime)
#         df.ArrTime = df.ArrTime.apply(timeFloatToDatetime)
#         df.CRSDepTime = df.CRSDepTime.apply(timeFloatToDatetime)
#         df.CRSArrTime = df.CRSArrTime.apply(timeFloatToDatetime)

In [5]:
def process_data(cur, conn):
    """
    Description: This function is responsible for listing the files in a directory,
    and then executing the ingest process for each file according to the function
    that performs the transformation to save it to the database.

    Arguments:
        cur: the cursor object.
        conn: connection to the database.

    Returns:
        None
    """
    data_directory = str(Path.home()) + "/Desktop/University/data-science-and-business-analytics/programming-for-data-science/dataverse_files/"
    
    # get all files matching extension from directory
    csv_files = []
    for root, dirs, files in os.walk(data_directory):
        files = glob.glob(os.path.join(root,'*.csv'))
        for f in files :
            csv_files.append(os.path.abspath(f))
            
    # get all files matching extension from directory
    bz2_files = []
    for root, dirs, files in os.walk(data_directory):
        files = glob.glob(os.path.join(root,'*.bz2'))
        for f in files :
            bz2_files.append(os.path.abspath(f))

    # iterate over csv files and process
    for i, datafile in enumerate(csv_files, 1):
        if datafile.endswith('plane-data.csv'):
            process_csv_file(datafile, cur, planes_table_insert)
            conn.commit()
            print('Planes file upload successful')
        elif datafile.endswith('carriers.csv'):
            process_csv_file(datafile, cur, carriers_table_insert)
            conn.commit()
            print('Carriers file upload successful')
        elif datafile.endswith('airports.csv'):
            process_csv_file(datafile, cur, airports_table_insert)
            conn.commit()
            print('Airports file upload successful')
            
    
    # iterate over bz2 files and process
    year_re = r'200(0|1|2|3|4|5){1}.csv.bz2$'
    for i, datafile in enumerate(bz2_files, 1):
        match = re.search(year_re, datafile)
        
        if match:
            process_bz2_file(datafile, cur, on_time_table_insert)
            conn.commit()
            print("Flight file upload successful")

In [6]:
# initialize a new db
cur, conn = initialise_db()
drop_tables(cur, conn)
create_tables(cur, conn)

process_data(cur, conn)

conn.close()

upload_dbfile('airline2.db')

Planes file upload successful
Airports file upload successful
Carriers file upload successful
Flight_file_upload_successful
Flight_file_upload_successful


  df = pd.read_csv(BytesIO(data), encoding='latin_1')


Flight_file_upload_successful
Flight_file_upload_successful
Flight_file_upload_successful
Flight_file_upload_successful
