### Packages importation

In [37]:
import sqlite3
import pandas as pd
import os
import random
import numpy as np

### DataBase And Tables Creation

In [3]:
# database file names
db_name = "project_database.db"
sql_script = "script.sql"

def create_database():
    # create tables using the sql script
    print("creating database...")
    
    # delete the old database if it exists to start fresh
    # useful when testing so we don't get duplicate error messages
    if os.path.exists(db_name):
        os.remove(db_name)
    
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    
    with open(sql_script, 'r') as f:
        sql_commands = f.read()
    
    cursor.executescript(sql_commands)
    conn.commit()
    conn.close()
    print("tables created")


### Populating Tables

In [77]:
def populate_flights_and_others():
    conn = sqlite3.connect(db_name)
    
    # load airlines
    print("loading airlines...")
    try:
        df_airlines = pd.read_csv("data/airlines.csv")
        # only keeping columns relevant to our schema to save memory
        df_airlines = df_airlines[['IATA_CODE', 'AIRLINE']]
        df_airlines.columns = ['airline_code', 'airline_name']
        df_airlines.to_sql('AIRLINES', conn, if_exists='append', index=False)
    except Exception as e:
        print(f"error loading airlines: {e}")


    
    # load airports
    print("loading airports...")
    try:
        df_airports = pd.read_csv("data/airports.csv")
        df_airports = df_airports[['IATA_CODE','AIRPORT', 'CITY', 'STATE', 'LATITUDE', 'LONGITUDE']]
        df_airports.columns = ['iata_code','airport_name', 'city', 'state', 'latitude', 'longitude']
        df_airports.to_sql('AIRPORTS', conn, if_exists='append', index=False)
    except Exception as e:
        print(f"error loading airports: {e}")



    # load flights
    # we only read the first 100k rows to avoid memory issues
    # the full dataset is huge and this is enough for the project
    n = 5819079  # total rows
    k = int(n*0.1)  # sample 10%
    skip_rows = sorted(set(range(1, n)) - set(np.random.choice(range(1, n), size=k, replace=False)))

    print("loading flights (chunk of 10%)...")
    try:
        df_flights = pd.read_csv("data/flights.csv", low_memory=False, skiprows=skip_rows)
        
        # function to fix time format
        # the csv has weird formats like '2400' which is not valid in sql
        def format_time(x):
            if pd.isnull(x): return None
            s = str(int(x)).zfill(4)
            if s == '2400': return '23:59:00'
            return f"{s[:2]}:{s[2:]}:00"

        # preparing date and time columns
        # merging year/month/day columns into a single datetime object
        df_flights['flight_date'] = pd.to_datetime(df_flights[['YEAR', 'MONTH', 'DAY']]).dt.date
        df_flights['dep_time'] = df_flights['DEPARTURE_TIME'].apply(format_time)


        df_final = pd.DataFrame({
            'flight_date': df_flights['flight_date'],
            'flight_number': df_flights['FLIGHT_NUMBER'],
            'dep_time': df_flights['dep_time'],
            'dep_delay': df_flights['DEPARTURE_DELAY'].fillna(0),
            'cancelled': df_flights['CANCELLED'],
            'airline_code': df_flights['AIRLINE'],
            'origin_airport': df_flights['ORIGIN_AIRPORT'],
            'dest_airport': df_flights['DESTINATION_AIRPORT']
        })

        # check referential integrity
        # crucial step: we remove flights where the airport is not in our airports table
        # otherwise sqlite will throw a foreign key error
        existing_airports = set(pd.read_sql("SELECT iata_code FROM AIRPORTS", conn)['iata_code'])
        df_final = df_final[df_final['origin_airport'].isin(existing_airports)]
        df_final = df_final[df_final['dest_airport'].isin(existing_airports)]

        df_final.to_sql('FLIGHTS', conn, if_exists='append', index=False)
        print(f"{len(df_final)} flights loaded")
        
    except Exception as e:
        print(f"error loading flights: {e}")

    conn.commit()
    conn.close()

In [5]:
def process_weather_data():
    print("processing and loading weather data...")
    conn = sqlite3.connect(db_name)
    
    try:
        df_wind = pd.read_csv("data/wind_speed.csv")
        df_temp = pd.read_csv("data/temperature.csv")
        
        # dictionary to map city names to iata codes
        # weather data uses city names but our db is built on airport codes
        # we chose the main airports for these major cities
        city_to_iata = {
            'New York': 'JFK', 'Los Angeles': 'LAX', 'Chicago': 'ORD',
            'Atlanta': 'ATL', 'Dallas': 'DFW', 'Denver': 'DEN',
            'San Francisco': 'SFO', 'Seattle': 'SEA', 'Miami': 'MIA',
            'Boston': 'BOS', 'Phoenix': 'PHX', 'Detroit': 'DTW',
            'Houston': 'IAH', 'Minneapolis': 'MSP', 'Philadelphia': 'PHL'
        }

        # formatting the dataframes
        # transforming the data from wide format (columns per city) to long format (rows)
        df_wind_melted = pd.melt(df_wind, id_vars=['datetime'], var_name='City', value_name='wind_speed')
        df_temp_melted = pd.melt(df_temp, id_vars=['datetime'], var_name='City', value_name='temperature')
        
        # merging wind and temperature
        df_weather = pd.merge(df_wind_melted, df_temp_melted, on=['datetime', 'City'])
        
        # filter cities we are interested in
        # we discard cities that are not in our dictionary
        df_weather = df_weather[df_weather['City'].isin(city_to_iata.keys())].copy()
        df_weather = df_weather.dropna()
        
        df_weather['airport_code'] = df_weather['City'].map(city_to_iata)
        
        # converting kelvin to celsius
        # easier to read and analyze later
        df_weather['temperature'] = df_weather['temperature'] - 273.15 

        # removing duplicates just in case (same city same time)
        df_weather = df_weather.drop_duplicates(subset=['datetime', 'City'])

        # filtering outliers (sensor errors)
        # keeping realistic temperatures between -60 and 60 celsius
        df_weather = df_weather[
            (df_weather['temperature'] > -60) & 
            (df_weather['temperature'] < 60)
        ]

        # wind speed cannot be negative
        df_weather = df_weather[df_weather['wind_speed'] >= 0]

        df_final_weather = pd.DataFrame({
            'reading_time': pd.to_datetime(df_weather['datetime']),
            'wind_speed': df_weather['wind_speed'],
            'temperature': df_weather['temperature'],
            'airport_code': df_weather['airport_code']
        })
        
        # ensure airports exist in database
        # double checking to prevent foreign key errors with the weather table
        existing_airports = set(pd.read_sql("SELECT iata_code FROM AIRPORTS", conn)['iata_code'])
        df_final_weather = df_final_weather[df_final_weather['airport_code'].isin(existing_airports)]

        df_final_weather.to_sql('WEATHER', conn, if_exists='append', index=False)
        print(f"{len(df_final_weather)} weather records loaded")

    except Exception as e:
        print(f"error processing weather: {e}")

    conn.commit()
    conn.close()


In [None]:
def main1():
    db = sqlite3.connect('project_database.db')
    cursor = db.cursor()
    query = ("SELECT dep_delay FROM flights;"
             'SELECT dep_time FROM flights;')
    print(pd.read_sql_query(query, db))

In [79]:
create_database()
populate_flights_and_others()
process_weather_data()
print("done. database is ready")


creating database...
tables created
loading airlines...
loading airports...
loading flights (chunk of 10%)...
533481 flights loaded
processing and loading weather data...
error processing weather: [Errno 2] No such file or directory: 'data/wind_speed.csv'
done. database is ready


In [82]:
query = ('SELECT flight_date FROM flights;')
db = sqlite3.connect('project_database.db')
pd.read_sql_query(query,db)

Unnamed: 0,flight_date
0,2015-01-01
1,2015-01-01
2,2015-01-01
3,2015-01-01
4,2015-01-01
...,...
533476,2015-12-31
533477,2015-12-31
533478,2015-12-31
533479,2015-12-31


In [83]:
query = ("SELECT * FROM airports;")
pd.read_sql_query(query,db)

Unnamed: 0,iata_code,airport_name,city,state,latitude,longitude
0,ABE,Lehigh Valley International Airport,Allentown,PA,40.65236,-75.44040
1,ABI,Abilene Regional Airport,Abilene,TX,32.41132,-99.68190
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,31.53552,-84.19447
...,...,...,...,...,...,...
317,WRG,Wrangell Airport,Wrangell,AK,56.48433,-132.36982
318,WYS,Westerly State Airport,West Yellowstone,MT,44.68840,-111.11764
319,XNA,Northwest Arkansas Regional Airport,Fayetteville/Springdale/Rogers,AR,36.28187,-94.30681
320,YAK,Yakutat Airport,Yakutat,AK,59.50336,-139.66023


### What is the total Number of Flights in 2015

In [64]:
query = ("SELECT COUNT(flight_id) as 'number of flights'  FROM flights;")
pd.read_sql_query(query,db)

Unnamed: 0,number of flights
0,533357


### What was the busiest day in terms of flights in the year 2015

In [115]:
query = ("""
         SELECT flight_date, MAX(number_of_flights) as number_of_flights 
         FROM (
            SELECT flight_date, COUNT(flight_id) as number_of_flights  FROM flights 
            WHERE strftime('%Y', flight_date) = '2015'
            GROUP BY flight_date 
            );""")
pd.read_sql_query(query,db)

Unnamed: 0,flight_date,number_of_flights
0,2015-03-09,1807


### What was the busiest Airport in terms of flights in the year 2015

In [102]:
query = """
    SELECT f.origin_airport,
           a.airport_name,
           COUNT(f.flight_id) AS number_of_flights
    FROM flights f
    JOIN airports a ON f.origin_airport = a.iata_code
    WHERE strftime('%Y', f.flight_date) = '2015'
    GROUP BY f.origin_airport
    ORDER BY number_of_flights DESC
    lIMIT 1;
"""
pd.read_sql_query(query, db)


Unnamed: 0,origin_airport,airport_name,number_of_flights
0,ATL,Hartsfield-Jackson Atlanta International Airport,34800


In [105]:
query = """
SELECT state,
       origin_airport,
       airport_name,
       flight_date,
       number_of_flights
FROM (
    SELECT a.state,
           f.origin_airport,
           a.airport_name,
           f.flight_date,
           COUNT(f.flight_id) AS number_of_flights
    FROM flights f
    JOIN airports a ON f.origin_airport = a.iata_code
    WHERE strftime('%Y', f.flight_date) = '2015'
    GROUP BY a.state, f.origin_airport, a.airport_name, f.flight_date
) AS t
WHERE number_of_flights = (
    SELECT MAX(sub_count)
    FROM (
        SELECT COUNT(f2.flight_id) AS sub_count
        FROM flights f2
        JOIN airports a2 ON f2.origin_airport = a2.iata_code
        WHERE strftime('%Y', f2.flight_date) = '2015'
          AND a2.state = t.state
        GROUP BY f2.origin_airport, f2.flight_date
    )
)
ORDER BY state;

"""
pd.read_sql_query(query, db)

Unnamed: 0,state,origin_airport,airport_name,flight_date,number_of_flights
0,AK,ANC,Ted Stevens Anchorage International Airport,2015-07-17,12
1,AL,BHM,Birmingham-Shuttlesworth International Airport,2015-05-29,10
2,AR,LIT,Bill and Hillary Clinton National Airport (Ada...,2015-03-04,9
3,AR,LIT,Bill and Hillary Clinton National Airport (Ada...,2015-04-30,9
4,AS,PPG,Pago Pago International Airport (Tafuna Airport),2015-01-12,1
...,...,...,...,...,...
126,WV,CRW,Yeager Airport,2015-06-21,3
127,WY,JAC,Jackson Hole Airport,2015-02-01,5
128,WY,JAC,Jackson Hole Airport,2015-06-09,5
129,WY,JAC,Jackson Hole Airport,2015-07-18,5


### Average delay time per Airline in each Airport

In [112]:
query = """
    SELECT airline_name, origin_airport,  AVG(dep_delay) AS average_departure_delay
    FROM flights f
    JOIN airports ap ON f.origin_airport = ap.iata_code
    JOIN airlines ar ON f.airline_code = ar.airline_code
    WHERE strftime('%Y', f.flight_date) = '2015'
    GROUP BY ar.airline_name, f.origin_airport, f.origin_airport
    ORDER BY airline_name , origin_airport, average_departure_delay DESC
;
"""
pd.read_sql_query(query, db)


Unnamed: 0,airline_name,origin_airport,average_departure_delay
0,Alaska Airlines Inc.,ABQ,-9.676471
1,Alaska Airlines Inc.,ADK,6.909091
2,Alaska Airlines Inc.,ADQ,0.972973
3,Alaska Airlines Inc.,AKN,0.500000
4,Alaska Airlines Inc.,ANC,0.577852
...,...,...,...
1271,Virgin America,PDX,-1.431818
1272,Virgin America,PSP,28.100000
1273,Virgin America,SAN,10.676136
1274,Virgin America,SEA,10.676856


### What are the 3 common Airports known for Canceled Flights

In [119]:
query = """
    SELECT  origin_airport, airport_name,  COUNT(cancelled) AS average_departure_delay
    FROM airports ap
    JOIN flights f ON ap.iata_code = f.origin_airport
    WHERE cancelled = 1
    GROUP BY origin_airport, airport_name
    ORDER BY average_departure_delay DESC
    LIMIT 3
    ;
"""
pd.read_sql_query(query, db)


Unnamed: 0,origin_airport,airport_name,average_departure_delay
0,ORD,Chicago O'Hare International Airport,826
1,DFW,Dallas/Fort Worth International Airport,616
2,LGA,LaGuardia Airport (Marine Air Terminal),442
