Imports

In [None]:
!pip install -r requirements.txt

In [None]:
import pandas as pd

Load datasets

In [None]:
def read_csv(filename):
    """Load a semicolon-separated CSV file and index it by its first column.

    The file is parsed with the first row as the header, ``;`` as the field
    separator, ``,`` as the decimal mark, ``"`` as the quote character, and
    with whitespace directly after a separator skipped.

    Parameters
    ----------
    filename
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table with its first column moved into the index.
    """
    parse_options = {
        'header': 0,                 # first row holds the column names
        'delimiter': ';',            # fields are separated by semicolons
        'decimal': ',',              # numbers use a decimal comma
        'quotechar': '"',            # strings may be wrapped in double quotes
        'skipinitialspace': True,    # ignore spaces right after a separator
    }
    table = pd.read_csv(filename, **parse_options)
    first_column = table.columns[0]
    return table.set_index(first_column)

# Read every raw table with the shared read_csv settings; each resulting
# frame is indexed by the first column of its file.
airlines, airports, planes, flights, weather = map(
    read_csv,
    ('airlines.csv', 'airports.csv', 'planes.csv', 'flights.csv', 'weather.csv'),
)

In [None]:
def check_for_nulls(df):
    """Report the columns of ``df`` that contain at least one null value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    dict
        Maps each column that has nulls to the number of rows in which that
        column is NOT null (row count minus null count). Columns without
        nulls are omitted; keys follow the column order of ``df``.
    """
    total_rows = len(df)
    non_null_counts = {}
    for column in df.columns:
        # int() keeps the stored values plain Python ints.
        null_rows = int(df[column].isnull().sum())
        if null_rows:
            non_null_counts[column] = total_rows - null_rows
    return non_null_counts

In [None]:
# Before cleaning: for each raw table, print the columns that contain nulls
# (as returned by check_for_nulls).
for _label, _frame in (
    ("Nullable columns in airlines:", airlines),
    ("Nullable columns in airports:", airports),
    ("Nullable columns in planes:", planes),
    ("Nullable columns in flights:", flights),
    ("Nullable columns in weather:", weather),
):
    print(_label, check_for_nulls(_frame))

Clean datasets

In [None]:
def clear_airports(df):
    """Drop airports without timezone data and fill in missing ``tzone`` names.

    Steps:

    1. Rows whose ``tz`` or ``dst`` is null are dropped.
    2. A row with a null ``tzone`` receives the ``tzone`` of the first row
       (in frame order) that has the same ``tz`` and a non-null ``tzone``.
    3. Rows whose ``tzone`` is still null after that (no other row shares
       their ``tz``) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Airports table with at least the columns ``tz``, ``dst`` and ``tzone``.

    Returns
    -------
    pandas.DataFrame
        A new frame in the original row order; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # The author checked on the source data that tz is null exactly when dst
    # is null, so this removes a single group of rows.
    airports_with_tz = df.dropna(subset=['tz', 'dst'])

    # Lookup table: numeric offset (tz) -> first known zone name (tzone).
    known_zones = airports_with_tz.dropna(subset=['tzone'])
    tzone_by_tz = known_zones.drop_duplicates(subset='tz').set_index('tz')['tzone']

    # One vectorized lookup replaces the per-row search through the frame.
    inferred = airports_with_tz['tz'].map(tzone_by_tz)
    has_tzone = airports_with_tz['tzone'].notnull()
    filled = airports_with_tz['tzone'].where(has_tzone, inferred)

    # Rows whose offset has no known zone name cannot be repaired; drop them.
    return airports_with_tz.assign(tzone=filled).dropna(subset=['tzone'])

In [None]:
def clear_planes(df):
    """Return a new frame holding only the rows of ``df`` without any null.

    A row is removed as soon as a single one of its columns is null.
    """
    complete_rows = df.dropna(axis=0, how='any')
    return complete_rows

In [None]:
def clear_flights(df):
    """Clean the flights table and convert its clock columns to minutes.

    Steps:

    1. Rows with a null ``arr_time`` or ``tailnum`` are dropped.
    2. ``dep_time``, ``arr_time``, ``sched_dep_time`` and ``sched_arr_time``
       are converted from HHMM numbers (e.g. ``1730``) to minutes since
       midnight (``17 * 60 + 30``).
    3. For every row whose ``air_time`` is null, ``arr_delay`` is set to
       ``arr_time - sched_arr_time`` and ``air_time`` to
       ``arr_time - dep_time``, both computed from THAT row's converted
       values.

    Null pattern the author checked on the source data: dep_time and
    dep_delay are null in the same rows; those rows are a subset of the rows
    with null arr_time, which are a subset of the rows with null arr_delay,
    which are a subset of the rows with null air_time. Hence dep_time is
    expected to be non-null once rows with null arr_time are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Flights table with the columns named above; clock columns are
        assumed to hold whole-number HHMM values.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # .copy() guarantees the assignments below never write into the caller's data.
    flights = df.dropna(subset=['arr_time', 'tailnum']).copy()

    # HHMM -> minutes since midnight, one vectorized pass per column.
    for column in ('dep_time', 'arr_time', 'sched_dep_time', 'sched_arr_time'):
        hhmm = flights[column]
        flights[column] = hhmm % 100 + (hhmm // 100) * 60

    # Back-fill rows lacking air_time from their own times. The previous code
    # used the first row of the frame for every such row.
    # NOTE(review): a flight arriving after midnight yields a negative
    # difference here; confirm whether a 24h wrap-around is wanted.
    missing = flights['air_time'].isnull()
    if missing.any():
        # Plain arrays avoid index alignment when assigning through the mask.
        arrival = flights.loc[missing, 'arr_time'].to_numpy()
        scheduled_arrival = flights.loc[missing, 'sched_arr_time'].to_numpy()
        departure = flights.loc[missing, 'dep_time'].to_numpy()
        flights.loc[missing, 'arr_delay'] = arrival - scheduled_arrival
        flights.loc[missing, 'air_time'] = arrival - departure
    return flights

In [None]:
def clear_weather(df):
    """Return the weather table as-is; rows with nulls are currently kept.

    Null pattern the author checked on the source data: rows with a null
    precip are a subset of the rows with a null temp; temp, dewp and humid
    are null in the same rows; and those rows are a subset of the rows with
    a null pressure.
    """
    return df

In [None]:
# airlines is passed through without a cleaning step (same object, not a copy).
clean_airlines = airlines

# Every other table is cleaned on a copy so the raw frame stays available
# for the before/after comparison.
clean_airports, clean_planes, clean_flights, clean_weather = (
    _cleaner(_raw.copy())
    for _cleaner, _raw in (
        (clear_airports, airports),
        (clear_planes, planes),
        (clear_flights, flights),
        (clear_weather, weather),
    )
)

In [None]:
# After cleaning: print, per cleaned table, the columns that still contain
# nulls (as returned by check_for_nulls).
for _label, _frame in (
    ("Nullable columns in airlines:", clean_airlines),
    ("Nullable columns in airports:", clean_airports),
    ("Nullable columns in planes:", clean_planes),
    ("Nullable columns in flights:", clean_flights),
    ("Nullable columns in weather:", clean_weather),
):
    print(_label, check_for_nulls(_frame))

In [None]:
# Compare (rows, columns) of each raw table with its cleaned counterpart.
for _label, _raw, _cleaned in (
    ("Airports old vs new:", airports, clean_airports),
    ("Planes old vs new:", planes, clean_planes),
    ("Flights old vs new:", flights, clean_flights),
    ("Weather old vs new:", weather, clean_weather),
):
    print(_label, _raw.shape, _cleaned.shape)