In [1]:
import os
import sqlite3
import pandas as pd
import numpy as np
import os
from pathlib import Path
import glob
import re
from io import BytesIO
import bz2
from datetime import datetime, time
from sql_queries import create_table_queries, drop_table_queries, planes_table_insert,  carriers_table_insert, airports_table_insert

In [2]:
def initialise_db(db_filename="flights_db.db"):
    """
    Create a brand-new SQLite database file and open a connection to it.

    Any existing file at `db_filename` is deleted first, so every call starts
    from an empty database.

    Arguments:
        db_filename: path of the database file to (re)create. Defaults to
            "flights_db.db" in the current working directory.

    Returns:
        (cur, conn): a cursor on the new database and the connection it
        belongs to. The caller is responsible for closing the connection.
    """
    # remove the db file if it exists; a missing file just means there is
    # nothing to reset
    try:
        os.remove(db_filename)
    except FileNotFoundError:
        pass

    # create database and open connection
    conn = sqlite3.connect(db_filename)
    cur = conn.cursor()

    return cur, conn

In [3]:
def drop_tables(cur, conn):
    """
    Remove the tables by running every statement in `drop_table_queries`.

    Each statement is executed on `cur` and committed on `conn` straight
    away, before the next one runs.
    """
    def run_and_commit(statement):
        cur.execute(statement)
        conn.commit()

    for drop_statement in drop_table_queries:
        run_and_commit(drop_statement)

In [4]:
def create_tables(cur, conn):
    """
    Build the schema by running every statement in `create_table_queries`.

    Statements run in list order; the connection is committed after each one.
    """
    ddl_statements = list(create_table_queries)
    for ddl in ddl_statements:
        cur.execute(ddl)
        conn.commit()

In [5]:
def process_csv_file(file_name, cur, insert_query):
    """
    Load a CSV file and insert every row of it into the database.

    Arguments:
        file_name: path of the CSV file to read (the first line is the header).
        cur: the cursor object used to run the inserts.
        insert_query: parameterised INSERT statement with one placeholder per
            CSV column, in column order.

    Returns:
        None. Nothing is committed here; the caller owns the transaction.
    """
    df = pd.read_csv(file_name)

    # itertuples preserves each column's dtype, whereas iterrows builds one
    # Series per row and therefore coerces the whole row to a common dtype
    # (e.g. integers next to floats are stored as floats). Plain tuples
    # (name=None) without the index line up with the query placeholders.
    # executemany also replaces one execute() call per row.
    cur.executemany(insert_query, df.itertuples(index=False, name=None))

In [6]:
# def process_carriers_file(file_name, cur):
#     df = pd.read_csv(file_name)
    
#     for i, row in df.iterrows():
#         cur.execute(carriers_table_insert, list(row))

In [7]:
# def process_airports_file(file_name, cur):
#     df = pd.read_csv(file_name)
    
#     for i, row in df.iterrows():
#         cur.execute(airports_table_insert, list(row))

In [8]:
def process_data(cur, conn, data_directory=None):
    """
    Description: This function is responsible for listing the files in a directory
    tree and loading each recognised CSV file into the database with the insert
    statement that belongs to it. Files that match none of the known names are
    counted but otherwise skipped.

    Arguments:
        cur: the cursor object.
        conn: connection to the database.
        data_directory: root directory to scan recursively. Defaults to the
            `dataverse_files` folder under the current user's home directory.

    Returns:
        None
    """
    if data_directory is None:
        data_directory = str(Path.home()) + "/Desktop/University/data-science-and-business-analytics/programming-for-data-science/dataverse_files/"

    # file-name suffix -> insert statement used for that file; checked in
    # this order, first match wins
    insert_queries = {
        'plane-data.csv': planes_table_insert,
        'carriers.csv': carriers_table_insert,
        'airports.csv': airports_table_insert,
    }

    # collect every regular file below the directory; os.walk already lists
    # the file names per folder, so sub-directories are not counted as files
    all_files = []
    for root, _dirs, files in os.walk(data_directory):
        for f in files:
            all_files.append(os.path.abspath(os.path.join(root, f)))

    # get total number of files found
    num_files = len(all_files)
    print('{} files found in {}'.format(num_files, data_directory))

    # iterate over files and process
    for i, datafile in enumerate(all_files, 1):
        for suffix, insert_query in insert_queries.items():
            if datafile.endswith(suffix):
                process_csv_file(datafile, cur, insert_query)
                # commit after every loaded file (airports included) so the
                # rows survive the connection being closed by the caller
                conn.commit()
                break
        print('{}/{} files processed.'.format(i, num_files))

In [9]:
# Build a fresh database, create the schema and load the CSV reference data.
#
# To reuse an existing database instead of rebuilding it, replace the
# initialise_db() call with:
#     conn = sqlite3.connect("flights_db.db")
#     cur = conn.cursor()
cur, conn = initialise_db()
try:
    create_tables(cur, conn)
    process_data(cur, conn)
    # close() does not commit implicitly, so flush any pending inserts first
    conn.commit()
finally:
    # release the database file even if loading failed part-way
    conn.close()

In [13]:
# Sanity check: read the airports table back from the database file.
conn = sqlite3.connect("flights_db.db")
try:
    cur = conn.cursor()
    cur.execute('SELECT * FROM airports;')
    rows = cur.fetchall()
finally:
    # close the connection even if the query fails (e.g. table missing)
    conn.close()

# last expression of the cell: number of airport rows that were loaded
len(rows)

3376
