In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# All necessary imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from zipfile import ZipFile
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split




# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/flight-delays-fall-2018/sample_submission.csv.zip
/kaggle/input/flight-delays-fall-2018/flight_delays_train.csv.zip
/kaggle/input/flight-delays-fall-2018/flight_delays_test.csv.zip


# Load training data

In [2]:
# Directory holding the zipped competition CSVs.
data_path = "/kaggle/input/flight-delays-fall-2018"

# Read the three archives in a fixed order: training set, test set, sample submission.
train_df, test_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"{data_path}/flight_delays_train.csv.zip",
        f"{data_path}/flight_delays_test.csv.zip",
        f"{data_path}/sample_submission.csv.zip",
    )
)


# Preprocess data
## Some info on the data
    1. Label (binary Y/N): 'dep_delayed_15min'
    2. No NaNs in data
    3. All columns except 2 ('DepTime' and 'Distance') are categorical
    4. All categoricals are prepended with the string "c-"

In [3]:
def clean_categoricals(df, prefix="c-"):
    """
    Remove a literal leading prefix from the string values of the object-dtype columns.

    Only an exact leading occurrence of ``prefix`` is removed, and only once.
    ``str.lstrip('c-')`` must not be used for this: it strips every leading
    character that is either ``'c'`` or ``'-'``, so a value such as ``'c-cc'``
    would be reduced to an empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    prefix : str, default "c-"
        Literal prefix to drop from the start of each string value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the prefix removed.
        Non-object columns, non-string values and strings that do not start
        with ``prefix`` are left unchanged.
    """
    categorical_cols = df.select_dtypes(include="object").columns

    def strip_prefix(value):
        # Exact-prefix match; anything else (including non-strings) passes through.
        if isinstance(value, str) and value.startswith(prefix):
            return value[len(prefix):]
        return value

    for col in categorical_cols:
        df[col] = df[col].map(strip_prefix)
    return df
    
def preprocess(df, encoders=None):
    """
    Clean the categorical columns of ``df`` and label-encode every object-dtype column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preprocess. It is passed to ``clean_categoricals`` first and
        its object columns are then overwritten with integer codes.
    encoders : dict or None, default None
        Optional mapping of column name -> fitted ``LabelEncoder``.

        * ``None`` (default): a fresh encoder is fitted for each object column
          on the data in ``df``. Codes produced by two separate calls (for
          example one on training data and one on test data) are then only
          consistent if both frames contain exactly the same set of values in
          that column.
        * a dict: columns already present in the dict are transformed with the
          stored encoder; columns not present are fitted and their encoder is
          stored in the dict. Passing the same dict to the training call and
          then to the test call gives both frames the same value -> code
          mapping. ``LabelEncoder.transform`` raises ``ValueError`` if it
          meets a value it was not fitted on.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``clean_categoricals`` with its object columns
        replaced by integer codes.
    """
    df = clean_categoricals(df)
    object_cols = df.select_dtypes(include=['object']).columns

    # Label encoding, one encoder per column so a fitted mapping can be reused.
    for col in object_cols:
        if encoders is not None and col in encoders:
            df[col] = encoders[col].transform(df[col])
        else:
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col])
            if encoders is not None:
                encoders[col] = encoder
    return df


# Train-Test split

In [4]:
# Preprocess a copy so the raw training frame keeps its label column and this
# cell can be re-run: preprocess() writes into the frame it receives, and the
# pop() below would otherwise remove 'dep_delayed_15min' from train_df itself.
df = preprocess(train_df.copy())

# Separate the target from the features.
y = df.pop('dep_delayed_15min')
print(len(df.columns))  # number of feature columns left after removing the label

# Hold out 25% of the rows for validation; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.25, random_state=42)


8


# XGBoost training

In [5]:
# Initialize the XGBoost classifier.
# The label is binary, so the matching built-in metric is 'logloss';
# 'mlogloss' is the multiclass variant.
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train the model
xgb_clf.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = xgb_clf.predict(X_test)

# Accuracy on the held-out split (fraction of rows whose predicted class matches the label)
holdout_accuracy = (y_pred == y_test.to_numpy()).mean()
print(f"Hold-out accuracy: {holdout_accuracy:.4f}")


In [6]:
# Encode the competition test set and predict a class for every row.
# NOTE(review): preprocess() as called here fits its label encoders on the frame
# it is given, so the integer codes may not line up with the ones the model was
# trained on — confirm before trusting these predictions.
test_df = preprocess(test_df)
preds = xgb_clf.predict(test_df)

# Build the submission table: the test frame's row index as id, plus the predicted class.
submission = pd.DataFrame({'id': test_df.index, 'dep_delayed_15min': preds})
submission_df = submission  # second name bound to the same frame

# Save the submission to a CSV file
i = 1
submission.to_csv(f'submission{i}.csv', index=False)

# Display the first few rows of the submission
# print(submission.head())
