In [1]:
import glob
import os
import tarfile
from email.message import Message

import cudf
import numpy as np
import requests


In [2]:
# Restrict this process to a single GPU by exposing only device index "1" to CUDA.
# NOTE(review): cudf is imported in the cell above; if that import already
# initialises CUDA, this assignment may come too late to take effect — confirm,
# and consider moving it ahead of the cudf import.
# NOTE(review): index "1" presumably assumes a machine with at least two GPUs — verify.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
# Everything before the last "/" of the working directory, i.e. its parent folder.
parent_dir = os.getcwd().rpartition("/")[0]

In [4]:
# Archive file name -> download URL for each available airline data bundle.
url_dict = dict(
    [
        (
            "airline-data.2005-2015.tar.gz",
            "https://diybigdata.net/?sdm_process_download=1&download_id=392",
        ),
        (
            "airline-data.2003-2018.tar.gz",
            "https://diybigdata.net/?sdm_process_download=1&download_id=821",
        ),
        (
            "airline-data.1998-2018.tar.gz",
            "https://diybigdata.net/?sdm_process_download=1&download_id=832",
        ),
    ]
)


In [5]:

def _filename_from_headers(headers, default):
    """Pick a safe local file name from a response's Content-Disposition header.

    Args:
        headers: Mapping of response headers (only "Content-Disposition" is read).
        default: Name to use when the header is missing or yields nothing usable.

    Returns:
        A bare file name with no directory components.
    """
    disposition = headers.get("Content-Disposition")
    if not disposition:
        return default

    # Let the stdlib parse the header: it copes with quoting, additional
    # parameters after the file name, and RFC 2231 encoded values.
    message = Message()
    message["Content-Disposition"] = disposition
    candidate = message.get_filename()
    if not candidate:
        return default

    # The name comes from the server, so never let it carry directory parts
    # that would place the file outside the data directory.
    candidate = os.path.basename(candidate.replace("\\", "/"))
    if candidate in ("", ".", ".."):
        return default
    return candidate


def download_data(key):
    """Download the archive registered under ``key`` into ``{parent_dir}/data``.

    The file is stored under the name announced in the Content-Disposition
    header when one is present, otherwise under ``key``. Data is streamed to a
    temporary ``.part`` file and renamed only once the transfer has finished,
    so an interrupted download never leaves a truncated archive under the
    final name.

    Args:
        key: A key of ``url_dict`` identifying the archive to fetch.

    Returns:
        None. A status line is printed on success and on a non-200 response.

    Raises:
        KeyError: If ``key`` is not in ``url_dict``.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the target file cannot be written.
    """
    url = url_dict[key]
    data_dir = f"{parent_dir}/data"
    # open() below would fail on a fresh checkout without the data directory.
    os.makedirs(data_dir, exist_ok=True)

    # stream=True keeps the archive out of memory; the timeout (connect, read)
    # prevents the cell from hanging forever on a stalled server, and the
    # context manager releases the connection in every case.
    with requests.get(url, stream=True, timeout=(10, 120)) as response:
        if response.status_code != 200:
            print(f"Download failed with status code: {response.status_code}")
            return

        filename = _filename_from_headers(response.headers, default=key)
        target_path = f"{data_dir}/{filename}"
        partial_path = f"{target_path}.part"

        try:
            with open(partial_path, "wb") as f:
                # 1 MiB chunks: far fewer Python-level iterations than 1 KiB.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
            os.replace(partial_path, target_path)
        finally:
            # Only still present when the transfer or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    print(f"File downloaded successfully: {filename}")

In [6]:
def extract_data(fname):
    if fname.endswith("tar.gz"):
        tar = tarfile.open(fname, "r:gz")
        tar.extractall(filter="data", path=f"{parent_dir}/data/")
        tar.close()
    elif fname.endswith("tar"):
        tar = tarfile.open(fname, "r:")
        tar.extractall(filter="data", path=f"{parent_dir}/data/")
        tar.close()

In [9]:
def extract_small_data(year=2003):
    """Extract one year of on-time performance CSVs from the 1998-2018 archive.

    Only members whose base name starts with
    ``On_Time_On_Time_Performance_<year>`` are extracted, into
    ``{parent_dir}/data/``. Each extracted member is printed.

    Args:
        year: Year whose files are extracted. Defaults to 2003.

    Returns:
        None. If the archive is missing, an error line is printed instead of
        raising.
    """
    archive_filename = f"{parent_dir}/data/airline-data.1998-2018.tar.gz"
    # Single definition of the prefix, shared by the filter and the message.
    prefix = f"On_Time_On_Time_Performance_{year}"

    try:
        with tarfile.open(archive_filename, "r:gz") as tar:
            # Match on the base name so the archive's folder layout is irrelevant.
            selected = [
                name
                for name in tar.getnames()
                if name.split("/")[-1].startswith(prefix)
            ]

            if not selected:
                print(f"No files starting with '{prefix}' found")
                return

            print("Extracting files:")
            for member in selected:
                # filter="data" rejects members that would escape the target dir.
                tar.extract(member, filter="data", path=f"{parent_dir}/data/")
                print(f"- {member}")
    except FileNotFoundError:
        print(f"Error: Archive '{archive_filename}' not found")


In [10]:
def prepare_dataset(use_full_dataset=False):
    """Build the flight-delay training frame from the airline CSV files.

    Downloads the 1998-2018 archive if it is not already on disk, extracts
    either every file or only the 2003 files, loads the CSVs with cudf, and
    returns a float frame whose first column is the binary label
    ``ArrDelayBinary`` (1.0 when ``ARR_DELAY`` > 10) followed by the feature
    columns. Rows containing nulls in any returned column are dropped.

    Args:
        use_full_dataset: When True load every
            ``On_Time_On_Time_Performance*.csv`` file; otherwise only the
            ``On_Time_On_Time_Performance_2003_*.csv`` files.

    Returns:
        cudf.DataFrame with columns ``["ArrDelayBinary"] + input_cols``.

    Raises:
        ValueError: If no CSV file matches after extraction.
        KeyError: If an expected column is absent from the CSV files.
    """
    archive_name = "airline-data.1998-2018.tar.gz"
    archive_path = f"{parent_dir}/data/{archive_name}"

    # Download only when the archive is missing; extraction runs either way.
    if not os.path.isfile(archive_path):
        download_data(key=archive_name)

    if use_full_dataset:
        extract_data(fname=archive_path)
        csv_pattern = f"{parent_dir}/data/airline-data/On_Time_On_Time_Performance*.csv"
    else:
        extract_small_data()
        csv_pattern = f"{parent_dir}/data/airline-data/On_Time_On_Time_Performance_2003_*.csv"

    # Sorted so the row order does not depend on directory listing order.
    csv_paths = sorted(glob.glob(csv_pattern))
    if not csv_paths:
        # Fail with a clear message instead of an opaque concat error.
        raise ValueError(f"No CSV files found matching '{csv_pattern}'")

    # collect all the dataframes
    dataset = cudf.concat([cudf.read_csv(path) for path in csv_paths])

    # normalise all column names to upper case
    capital_names = [name.upper() for name in dataset.columns.to_list()]
    dataset.columns = capital_names
    print(capital_names)

    # define the features
    input_cols = ['YEAR', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'CRS_DEP_TIME', 'CRS_ARR_TIME',
                  'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM', 'ACTUAL_ELAPSED_TIME', 'ORIGIN',
                  'DEST', 'DISTANCE', 'DIVERTED']
    label_col = "ArrDelayBinary"

    # Keep only the columns that are actually used before encoding/casting, so
    # GPU memory and time are not spent on columns that are discarded anyway.
    # Encoding and casting are per-column, so the result is unchanged.
    dataset = dataset[input_cols + ["ARR_DELAY"]]

    # encode categoricals as numeric
    for col in dataset.select_dtypes(["object"]).columns:
        dataset[col] = dataset[col].astype("category").cat.codes.astype(np.int32)

    # cast all columns to float32 (needed for random forest)
    for col in dataset.columns:
        dataset[col] = dataset[col].astype(np.float32)

    # define the label
    dataset[label_col] = 1.0 * (dataset["ARR_DELAY"] > 10)

    # put target/label column first [ classic XGBoost standard ], keep only the
    # columns of interest, and drop rows with missing values
    output_cols = [label_col] + input_cols
    return dataset[output_cols].dropna(axis=0)

In [11]:
# Build the 2003-only training frame (the small-dataset default).
df = prepare_dataset(use_full_dataset=False)

['./airline-data', './airline-data/On_Time_On_Time_Performance_2005_10.csv', './airline-data/On_Time_On_Time_Performance_2005_12.csv', './airline-data/On_Time_On_Time_Performance_1998_5.csv', './airline-data/On_Time_On_Time_Performance_2005_2.csv', './airline-data/On_Time_On_Time_Performance_1998_8.csv', './airline-data/On_Time_On_Time_Performance_2005_5.csv', './airline-data/On_Time_On_Time_Performance_2005_8.csv', './airline-data/On_Time_On_Time_Performance_1999_10.csv', './airline-data/On_Time_On_Time_Performance_2006_1.csv', './airline-data/On_Time_On_Time_Performance_2018_2.csv', './airline-data/On_Time_On_Time_Performance_1999_2.csv', './airline-data/On_Time_On_Time_Performance_2006_11.csv', './airline-data/On_Time_On_Time_Performance_2006_12.csv', './airline-data/On_Time_On_Time_Performance_2006_4.csv', './airline-data/On_Time_On_Time_Performance_2000_10.csv', './airline-data/On_Time_On_Time_Performance_2006_5.csv', './airline-data/LUT-DOT_airline_IDs.csv', './airline-data/On_Ti

In [12]:
# Display summary statistics for every column of the prepared frame.
df.describe()



Unnamed: 0,ArrDelayBinary,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,CRS_DEP_TIME,CRS_ARR_TIME,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ACTUAL_ELAPSED_TIME,ORIGIN,DEST,DISTANCE,DIVERTED
count,6375689.0,6375689.0,6375689.0,6375689.0,6375689.0,6375689.0,6375689.0,6375689.0,6375689.0,6375689.0,6375689.0,6375689.0,6375689.0,6375689.0
mean,0.205339,2003.0,6.532758,15.73585,3.934335,577.4579,806.1868,9.294674,2814.078,122.3457,141.5992,141.4844,713.6091,0.0
std,0.403949,0.0,3.445176,8.787332,1.988135,273.5536,279.8882,5.681457,2124.432,69.25841,76.26457,76.15202,565.0007,0.0
min,0.0,2003.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,-690.0,0.0,0.0,12.0,0.0
25%,0.0,2003.0,4.0,8.0,2.0,338.0,578.0,5.0,870.0,71.0,76.0,76.0,304.0,0.0
50%,0.0,2003.0,7.0,16.0,4.0,572.0,818.0,10.0,2430.0,103.0,149.0,149.0,550.0,0.0
75%,0.0,2003.0,10.0,23.0,6.0,807.0,1038.0,15.0,4511.0,153.0,206.0,205.0,948.0,0.0
max,1.0,2003.0,12.0,31.0,7.0,1190.0,1329.0,17.0,6920.0,1233.0,281.0,281.0,4962.0,0.0


In [None]:
# Number of rows in the prepared frame, computed rather than hard-coded so it
# stays correct when the dataset selection changes.
len(df)