<a href="https://colab.research.google.com/github/D-Barradas/RAPIDS_HPO/blob/main/notebooks/download_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To use a GPU in Google Colab, you need to change the runtime type. Here's how:

1. Go to the "Runtime" menu at the top of the page.
2. Select "Change runtime type".
3. In the "Hardware accelerator" dropdown, choose "GPU".
4. Click "Save".

In [1]:
import os
import glob
import tarfile
import requests
import numpy as np
import cudf


## Data Preparation

We download the Airline dataset and save it to the local directory specified by `data_dir` and `orc_name`. In this step, we also convert the input data into appropriate dtypes. For this, we will use the `prepare_dataset` function.

Note: To ensure that this example runs quickly on a modest machine, we default to using a small subset of the airline dataset. To use the full dataset, pass the argument `use_full_dataset=True` to the `prepare_dataset` function.

In [2]:
# Restrict CUDA-based libraries to the first GPU (device index 0).
# NOTE(review): this runs after `import cudf`; the variable may need to be set
# before the first CUDA library is imported to take effect - confirm.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [3]:
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
# --- Sampling configuration -------------------------------------------------
num_rows = 2_500_000  # number of rows to be used in this notebook
rng = np.random.RandomState(42)  # seeded generator passed to the sampling step

# --- Storage locations ------------------------------------------------------
# Everything is kept on the mounted Google Drive.
# Local alternative: parent_dir = "/".join(os.getcwd().split("/")[:-1])
parent_dir = "/content/drive/MyDrive/"
data_dir = os.path.join(parent_dir, "data", "airline-data")

# ORC outputs: the sampled subset (row count embedded in the file name) and
# the complete cleaned 2003 table.
orc_name = os.path.join(data_dir, f"airline-data{num_rows}.orc")
orc_name_full = os.path.join(data_dir, "airline-data-full-2003.orc")

# Make sure the target directory exists before anything is written to it
os.makedirs(data_dir, exist_ok=True)
print(f"Data directory created at: {data_dir}")

Data directory created at: /content/drive/MyDrive/data/airline-data


In [5]:
# Archive file name -> download link on diybigdata.net.
# All links share the same endpoint; only the numeric download id differs.
url_dict = {
    archive: f"https://diybigdata.net/?sdm_process_download=1&download_id={download_id}"
    for archive, download_id in (
        ("airline-data.2005-2015.tar.gz", 392),
        ("airline-data.2003-2018.tar.gz", 821),
        ("airline-data.1998-2018.tar.gz", 832),
    )
}


In [6]:

def download_data(key):
    """Download the archive registered under ``key`` in ``url_dict``.

    The file is always stored as ``<parent_dir>/data/<key>``. That is the path
    ``prepare_dataset`` checks with ``os.path.isfile`` and hands to the
    extraction helpers, so the server-supplied ``Content-Disposition`` name is
    deliberately not used (it could differ from ``key`` or contain path
    separators).

    The payload is streamed into ``<key>.part`` and renamed only once the
    transfer has finished, so an interrupted download never leaves a truncated
    archive under the final name.

    Parameters
    ----------
    key : str
        Archive file name; must be a key of ``url_dict``.

    Returns
    -------
    None
        A non-200 response is reported with ``print`` and nothing is written.

    Raises
    ------
    KeyError
        If ``key`` is not in ``url_dict``.
    requests.exceptions.RequestException
        On connection problems or when the server stops responding.
    """
    url = url_dict[key]

    target_dir = os.path.join(parent_dir, "data")
    target_path = os.path.join(target_dir, key)
    partial_path = target_path + ".part"
    os.makedirs(target_dir, exist_ok=True)

    # The context manager releases the connection on every exit path.
    # timeout=(connect, read): fail instead of hanging forever on a dead server.
    with requests.get(url, stream=True, timeout=(10, 300)) as response:
        if response.status_code != 200:
            print(f"Download failed with status code: {response.status_code}")
            return

        try:
            with open(partial_path, "wb") as f:
                # 1 MiB chunks: the archives are large, 1 KiB writes are needlessly slow
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
            # Publish the file under its final name only after a complete transfer
            os.replace(partial_path, target_path)
        finally:
            # Still present only if the transfer or the rename failed
            if os.path.exists(partial_path):
                os.remove(partial_path)

    print(f"File downloaded successfully: {key}")

In [7]:
def extract_data(fname):
    if fname.endswith("tar.gz"):
        tar = tarfile.open(fname, "r:gz")
        tar.extractall(filter="data", path=f"{parent_dir}/data/")
        tar.close()
    elif fname.endswith("tar"):
        tar = tarfile.open(fname, "r:")
        tar.extractall(filter="data", path=f"{parent_dir}/data/")
        tar.close()

In [8]:
def extract_small_data(key):

    # Define archive filename
    archive_filename = f"{parent_dir}/data/{key}"

    # Open archive in read mode
    try:
        with tarfile.open(archive_filename, "r:gz") as tar:
            # Extract all filenames
            all_filenames = tar.getnames()
            # print (all_filenames)

            # Filter files starting with "On_Time_On_Time_Performance_2003"
            filtered_files = [f for f in all_filenames if f.split("/")[-1].startswith("On_Time_On_Time_Performance_2003")]

            if filtered_files:
                print("Extracting files:")
                for member in filtered_files:
                # Extract member (file) to output directory (if provided)
                    tar.extract(member, filter="data" ,path=f"{parent_dir}/data/")
                    print(f"- {member}")  # Print extracted filename
            else:
                print("No files starting with 'On_Time_On_Time_Performance_2003' found")
    except FileNotFoundError:
        print(f"Error: Archive '{archive_filename}' not found")


In [9]:
def prepare_dataset(use_full_dataset=False):
    """Download (if needed), extract and clean the airline on-time data.

    Parameters
    ----------
    use_full_dataset : bool, default False
        ``False``: only the 2003 monthly CSV files are extracted and loaded.
        ``True``: the whole archive is extracted and every
        ``On_Time_On_Time_Performance*.csv`` file is loaded.

    Returns
    -------
    cudf.DataFrame
        ``ArrDelayBinary`` (1.0 where ``ARR_DELAY > 10``, otherwise 0.0) as the
        first column, followed by the feature columns as float32. Rows with
        missing values are removed and the index is reset to 0..n-1.

    Raises
    ------
    FileNotFoundError
        If no matching CSV file exists after the download/extract step.
    """
    # key = "airline-data.2005-2015.tar.gz"
    key = "airline-data.1998-2018.tar.gz"
    archive_path = f"{parent_dir}/data/{key}"
    csv_dir = f"{parent_dir}/data/airline-data"

    # Download only when the archive is missing; extraction runs either way
    if not os.path.isfile(archive_path):
        download_data(key=key)

    if use_full_dataset:
        extract_data(fname=archive_path)
        csv_pattern = f"{csv_dir}/On_Time_On_Time_Performance*.csv"
    else:
        extract_small_data(key=key)
        csv_pattern = f"{csv_dir}/On_Time_On_Time_Performance_2003_*.csv"

    # glob order depends on the filesystem; sorting fixes the row order so a
    # seeded sample of the result is reproducible
    csv_files = sorted(glob.glob(csv_pattern))
    if not csv_files:
        raise FileNotFoundError(
            f"No CSV files matching '{csv_pattern}' - download or extraction failed"
        )

    # Collect all the dataframes into one
    dataset = cudf.concat(
        [cudf.read_csv(path) for path in csv_files], ignore_index=True
    )

    # Normalise all column names to upper case
    dataset.columns = [name.upper() for name in dataset.columns.to_list()]

    # Features, using the column names found in the 2003 files
    input_cols = ['YEAR', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'CRS_DEP_TIME', 'CRS_ARR_TIME',
                  'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM', 'ACTUAL_ELAPSED_TIME', 'ORIGIN',
                  'DEST', 'DISTANCE', 'DIVERTED']

    # NOTE: Beware of the column names in the full data set; for example these
    # are the names used by the 2005 files:
    # input_cols = ['YEAR', 'MONTH', 'DAYOFMONTH', 'DAYOFWEEK', 'CRSDEPTIME', 'CRSARRTIME',
    #               'CARRIER', 'FLIGHTNUM', 'ACTUALELAPSEDTIME' ,'ORIGIN',
    #               'DEST', 'DISTANCE','DIVERTED']
    # NOTE(review): with use_full_dataset=True the 2003 names above are applied
    # to every year - confirm they line up before relying on the full data set.

    # Keep only the columns that are actually used (features + label source)
    # before transforming, instead of encoding and casting every raw column
    dataset = dataset[input_cols + ["ARR_DELAY"]]

    # Encode categoricals as numeric codes
    for col in dataset.select_dtypes(["object"]).columns:
        dataset[col] = dataset[col].astype("category").cat.codes.astype(np.int32)

    # Cast all columns to float32
    for col in dataset.columns:
        dataset[col] = dataset[col].astype(np.float32)  # needed for random forest

    # Define the label
    dataset["ArrDelayBinary"] = 1.0 * (dataset["ARR_DELAY"] > 10)

    # Put target/label column first [ classic XGBoost standard ]
    output_cols = ["ArrDelayBinary"] + input_cols

    # Select the columns of interest, drop rows with missing values and give
    # the result a clean 0..n-1 index
    dataset = dataset[output_cols].dropna(axis=0).reset_index(drop=True)

    return dataset

In [10]:
%%time
df = prepare_dataset()

Extracting files:
- ./airline-data/On_Time_On_Time_Performance_2003_10.csv
- ./airline-data/On_Time_On_Time_Performance_2003_7.csv
- ./airline-data/On_Time_On_Time_Performance_2003_2.csv
- ./airline-data/On_Time_On_Time_Performance_2003_8.csv
- ./airline-data/On_Time_On_Time_Performance_2003_12.csv
- ./airline-data/On_Time_On_Time_Performance_2003_4.csv
- ./airline-data/On_Time_On_Time_Performance_2003_9.csv
- ./airline-data/On_Time_On_Time_Performance_2003_5.csv
- ./airline-data/On_Time_On_Time_Performance_2003_3.csv
- ./airline-data/On_Time_On_Time_Performance_2003_6.csv
- ./airline-data/On_Time_On_Time_Performance_2003_11.csv
- ./airline-data/On_Time_On_Time_Performance_2003_1.csv
CPU times: user 6min 41s, sys: 15.7 s, total: 6min 57s
Wall time: 7min 43s


In [11]:
# Draw num_rows rows at random from the cleaned frame, using the notebook-level rng as the seed source
df_rows = df.sample(n=num_rows,random_state=rng)

In [12]:
import pyarrow.orc as orc

In [13]:
# Write the complete cleaned table to ORC.
# preserve_index=False keeps the frame's row index out of the Arrow table, so
# the file holds only the label and feature columns (a non-range index would
# otherwise be stored as an extra `__index_level_0__` column).
table = df.to_arrow(preserve_index=False)
orc.write_table(table, orc_name_full)


In [14]:
# Write the sampled subset to ORC.
# The sample keeps the row labels of the frame it was drawn from;
# preserve_index=False stops them from being stored as an extra
# `__index_level_0__` column next to the label and feature columns.
table = df_rows.to_arrow(preserve_index=False)
orc.write_table(table, orc_name)