Imports

In [None]:
from pandas.core.interchange.dataframe_protocol import DataFrame
!pip install -r requirements.txt

In [None]:
import pandas as pd

Load datasets

In [None]:
def read_csv(filename):
    """Load a semicolon-separated CSV file and index it by its first column.

    The file is parsed with the first row as the header, ``;`` as the field
    separator, ``,`` as the decimal mark, ``"`` as the quote character, and
    whitespace directly after a separator ignored.

    Args:
        filename: Path (or any other source accepted by ``pandas.read_csv``)
            of the file to load.

    Returns:
        A DataFrame whose index is the file's first column.
    """
    parse_options = {
        'header': 0,  # first row holds the column names
        'delimiter': ';',  # fields are separated by semicolons
        'decimal': ',',  # numbers use a comma as the decimal mark
        'quotechar': '"',  # strings may be wrapped in double quotes
        'skipinitialspace': True,  # ignore blanks right after a delimiter
    }
    frame = pd.read_csv(filename, **parse_options)
    first_column = frame.columns[0]
    return frame.set_index(first_column)

# Load the five source tables; each one is indexed by its first column.
airlines, airports, planes, flights, weather = map(
    read_csv,
    ('airlines.csv', 'airports.csv', 'planes.csv', 'flights.csv', 'weather.csv'),
)

In [None]:
def check_for_nulls(df):
    """Count null values per column, reporting only columns that have any.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A dict mapping column name to its number of null rows. Columns
        without nulls are left out, so an empty dict means no nulls at all.
    """
    # Filter the frame once per column and keep the row count of the result.
    null_counts = ((col, len(df[df[col].isnull()])) for col in df.columns)
    return {col: count for col, count in null_counts if count != 0}

In [None]:
# Report, for each raw table, which columns contain nulls and how many.
for _label, _frame in (
    ("Nullable columns in airlines:", airlines),
    ("Nullable columns in airports:", airports),
    ("Nullable columns in planes:", planes),
    ("Nullable columns in flights:", flights),
    ("Nullable columns in weather:", weather),
):
    print(_label, check_for_nulls(_frame))

Clean Datasets

In [None]:
def clear_airports(df):
    """Drop airports without timezone data and infer missing ``tzone`` names.

    Rows lacking ``tz`` or ``dst`` are removed. A remaining row whose
    ``tzone`` is null takes the ``tzone`` of the first row (in frame order)
    that has the same ``tz`` and a known ``tzone``; when no such row exists
    the row is removed.

    The tz -> tzone lookup is built once and applied column-wise, so the
    result does not depend on index labels being unique, and the frame passed
    in is not modified.

    Args:
        df: Airports table with at least ``tz``, ``dst`` and ``tzone`` columns.

    Returns:
        A new DataFrame with no nulls in ``tz``, ``dst`` or ``tzone``.
    """
    # Original author's note: in the source data tz is null exactly when dst is null.
    df = df.dropna(subset=['tz', 'dst'])

    # First known tzone for every tz offset, in frame order.
    known = df.dropna(subset=['tzone']).drop_duplicates(subset='tz')
    tz_to_tzone = known.set_index('tz')['tzone']

    # Offsets with no known tzone map to NaN and are dropped below.
    inferred = df['tz'].map(tz_to_tzone)
    df = df.assign(tzone=df['tzone'].fillna(inferred))
    return df.dropna(subset=['tzone'])

In [None]:
def clear_planes(df):
    """Return the planes table restricted to rows with no null in any column.

    Args:
        df: The planes DataFrame.

    Returns:
        A new DataFrame holding only the fully populated rows.
    """
    complete_rows = df.dropna(axis='index', how='any')
    return complete_rows

In [None]:
def clear_flights(df):
    """Drop unusable flights and rebuild ``arr_delay`` / ``air_time`` gaps.

    Rows without ``arr_time`` or ``tailnum`` are removed. For every remaining
    row whose ``air_time`` is null, both values are recomputed from that same
    row's times:

    * ``arr_delay = arr_time - sched_arr_time``
    * ``air_time  = arr_time - dep_time``

    Args:
        df: Flights table with ``dep_time``, ``sched_arr_time``, ``arr_time``,
            ``arr_delay``, ``air_time`` and ``tailnum`` columns.

    Returns:
        A new DataFrame; the frame passed in is left untouched.
    """
    # Original author's note on null nesting in the source data:
    # dep_time null = dep_delay null, a subset of arr_time null, a subset of
    # arr_delay null, a subset of air_time null.
    # Copy so the masked assignments below write to a frame this function owns.
    df = df.dropna(subset=['arr_time', 'tailnum']).copy()

    missing = df['air_time'].isnull()
    arr_time = df.loc[missing, 'arr_time']

    # Use each row's own times (previously the first row of the frame was
    # used for every gap). Plain arrays are assigned so the write is
    # positional and needs no index alignment.
    # NOTE(review): if the time columns are clock-encoded (e.g. HHMM) rather
    # than minute counts, these differences are not durations - confirm.
    df.loc[missing, 'arr_delay'] = (
        arr_time - df.loc[missing, 'sched_arr_time']
    ).to_numpy()
    df.loc[missing, 'air_time'] = (
        arr_time - df.loc[missing, 'dep_time']
    ).to_numpy()
    return df

In [None]:
def clear_weather(df):
    """Fill every gap in the weather table without modifying the input.

    ``temp``, ``dewp`` and ``humid`` gaps are filled with the column median
    and ``wind_speed`` gaps with the column mean. Whatever is still null
    afterwards, in any column, is forward-filled and then backward-filled.

    Args:
        df: Weather table with at least ``temp``, ``dewp``, ``humid`` and
            ``wind_speed`` columns.

    Returns:
        A new DataFrame; the frame passed in keeps its nulls.
    """
    # Original author's note on null nesting in the source data:
    # precip null is a subset of temp null = dewp null = humid null, which is
    # a subset of pressure null.
    fill_values = {
        'temp': df['temp'].median(),
        'dewp': df['dewp'].median(),
        'humid': df['humid'].median(),
        'wind_speed': df['wind_speed'].mean(),
    }
    # A dict-valued fillna returns a new frame, so the caller's columns are
    # never overwritten in place.
    return df.fillna(fill_values).ffill().bfill()

In [None]:
# airlines has no cleaning step and is not copied, so clean_airlines refers
# to the same object as airlines.
clean_airlines = airlines
# Every other table is cleaned from a copy, leaving the raw frames available
# for the before/after comparison.
clean_airports, clean_planes, clean_flights, clean_weather = (
    clear_airports(airports.copy()),
    clear_planes(planes.copy()),
    clear_flights(flights.copy()),
    clear_weather(weather.copy()),
)

In [None]:
# Re-run the null report on the cleaned tables.
for _label, _frame in (
    ("Nullable columns in airlines:", clean_airlines),
    ("Nullable columns in airports:", clean_airports),
    ("Nullable columns in planes:", clean_planes),
    ("Nullable columns in flights:", clean_flights),
    ("Nullable columns in weather:", clean_weather),
):
    print(_label, check_for_nulls(_frame))

In [None]:
# Show each table's (rows, columns) shape before and after cleaning.
for _label, _raw, _cleaned in (
    ("Airports old vs new:", airports, clean_airports),
    ("Planes old vs new:", planes, clean_planes),
    ("Flights old vs new:", flights, clean_flights),
    ("Weather old vs new:", weather, clean_weather),
):
    print(_label, _raw.shape, _cleaned.shape)

In [None]:
import numpy as np

# Cleaned tables, in the order their summaries are displayed.
all_tables = [
    clean_airlines,
    clean_airports,
    clean_planes,
    clean_flights,
    clean_weather,
]


# Column names and dtypes of the frame returned by column_summary; the dtypes
# are only applied when there is nothing to summarise.
_SUMMARY_DTYPES = {
    'col_name': 'str',
    'col_dtype': 'str',
    'num_distinct_values': 'int',
    'min_value': 'float',
    'max_value': 'float',
    'median_no_na': 'float',
    'average_no_na': 'float',
    'average_non_zero': 'float',
    'null_present': 'bool',
    'nulls_num': 'int',
    'non_nulls_num': 'int',
    'distinct_values': 'object',
}


def _summarize_column(column, series):
    """Return the summary record (a dict) for one column of a table."""
    value_counts = series.value_counts()  # nulls excluded, most frequent first
    values = sorted(series.dropna().tolist())
    count = len(values)

    # pandas' dtype helpers also accept extension dtypes, which np.issubdtype
    # rejects; booleans are deliberately not treated as numbers.
    is_numeric = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )

    median = average = average_non_zero = None
    if values:
        middle = count // 2
        if is_numeric and count % 2 == 0:
            # Even-sized numeric data: average the two central values.
            median = (values[middle - 1] + values[middle]) / 2
        else:
            # Odd-sized data, or values that cannot be averaged (e.g. text).
            median = values[middle]

        if is_numeric:
            average = sum(values) / count
            # Negative values count as non-zero, and the mean is taken over
            # the non-zero values only.
            non_zero = [v for v in values if v != 0]
            if non_zero:
                average_non_zero = sum(non_zero) / len(non_zero)

    top_10 = value_counts.head(10)
    num_nulls = int(series.isnull().sum())

    return {
        'col_name': column,
        'col_dtype': series.dtype,
        'num_distinct_values': len(value_counts),
        'min_value': values[0] if values else None,
        'max_value': values[-1] if values else None,
        'median_no_na': median,
        'average_no_na': average,
        'average_non_zero': average_non_zero,
        'null_present': num_nulls > 0,
        'nulls_num': num_nulls,
        'non_nulls_num': int(series.notnull().sum()),
        # Only the 10 most frequent values are kept, as {value: count}.
        'distinct_values': dict(zip(top_10.index.tolist(), top_10.tolist())),
    }


def column_summary(df):
    """Describe every column of ``df`` in a one-row-per-column DataFrame.

    The index of ``df`` is discarded first, so only real columns are
    described. Each row holds: column name, dtype, number of distinct
    non-null values, min, max, median, mean, mean of the non-zero values,
    whether nulls are present, null and non-null counts, and the 10 most
    frequent values with their counts.

    Mean and non-zero mean are computed for numeric (non-boolean) columns
    only and are ``None`` otherwise. The median of a non-numeric column is
    the middle element of its sorted non-null values.

    Args:
        df: The table to describe.

    Returns:
        A DataFrame with one row per column of ``df``, or an empty frame with
        the same columns when ``df`` has no columns.
    """
    df = df.reset_index(drop=True)

    # Collect all records first and build the frame once, rather than
    # concatenating a new one-row frame for every column.
    records = [_summarize_column(column, df[column]) for column in df.columns]
    if not records:
        return pd.DataFrame({
            name: pd.Series(dtype=dtype)
            for name, dtype in _SUMMARY_DTYPES.items()
        })
    return pd.DataFrame(records, columns=list(_SUMMARY_DTYPES))


# Render the per-column summary of every cleaned table, one after another.
for table in all_tables:
    summary_df = column_summary(table)
    display(summary_df)