In [1]:
import pandas as pd
import numpy as np
from glob import glob

pd.set_option("display.max_columns", 500)

In [2]:
!ls -GFlash ../input/flight-delay-dataset-20182022/ | grep parquet

216M -rw-r--r-- 1 nobody 216M Mar 19 14:09 Combined_Flights_2018.parquet
295M -rw-r--r-- 1 nobody 295M Mar 19 14:09 Combined_Flights_2019.parquet
175M -rw-r--r-- 1 nobody 175M Mar 19 14:08 Combined_Flights_2020.parquet
232M -rw-r--r-- 1 nobody 232M Mar 19 14:09 Combined_Flights_2021.parquet
143M -rw-r--r-- 1 nobody 143M Mar 19 14:08 Combined_Flights_2022.parquet


In [3]:
!ls -GFlash ../input/flight-delay-dataset-20182022/ | grep csv

 40K -rw-r--r-- 1 nobody  39K Mar 19 14:07 Airlines.csv
1.9G -rw-r--r-- 1 nobody 1.9G Mar 19 14:11 Combined_Flights_2018.csv
2.7G -rw-r--r-- 1 nobody 2.7G Mar 19 14:11 Combined_Flights_2019.csv
1.7G -rw-r--r-- 1 nobody 1.7G Mar 19 14:11 Combined_Flights_2020.csv
2.1G -rw-r--r-- 1 nobody 2.1G Mar 19 14:11 Combined_Flights_2021.csv
1.4G -rw-r--r-- 1 nobody 1.4G Mar 19 14:11 Combined_Flights_2022.csv


# Read in and Format Data

In [8]:
def get_time_category(deptime):
    # Преобразуем время в строку и добавляем ведущий ноль, если необходимо
    time_str = f"{int(deptime):04}"
    hours = int(time_str[:2])
    minutes = int(time_str[2:])
    
    if 6 <= hours < 12:
        return "Morning"
    elif 12 <= hours < 18:
        return "Day"
    elif 18 <= hours < 24:
        return "Evening"
    else:
        return "Night"

def target_encode(train_df, valid_df, test_df, column, target):
    # Вычисляем среднее значение целевой переменной для каждой категории в обучающей выборке
    mean_target = train_df.groupby(column)[target].mean()
    
    # Применяем кодирование к обучающей и тестовой выборкам
    train_df[column] = train_df[column].map(mean_target)
    test_df[column] = test_df[column].map(mean_target)
    valid_df[column] = valid_df[column].map(mean_target)
    
    # Заполняем пропуски (если есть категории, которые не встречались в обучающей выборке)
    train_df[column].fillna(train_df[target].mean(), inplace=True)
    test_df[column].fillna(train_df[target].mean(), inplace=True)
    valid_df[column].fillna(train_df[target].mean(), inplace=True)
    
    return train_df, valid_df, test_df, 

In [33]:
parquet_files = glob("../input/flight-delay-dataset-20182022/*.parquet")

In [66]:
column_subset = ['Month', 'DayOfWeek', 'DayofMonth', 'DOT_ID_Operating_Airline','Distance',
          'DistanceGroup', 'DestAirportSeqID','CRSDepTime', 'IATA_Code_Operating_Airline',
          'Airline', 'OriginAirportID', 'DestCityName', 'DestAirportID', 'Diverted', 'Cancelled',
          'IATA_Code_Marketing_Airline', 'DepDelayMinutes']

dfs = []
for f in parquet_files:
    dfs.append(pd.read_parquet(f, columns=column_subset))
df = pd.concat(dfs).reset_index(drop=True)

In [67]:
df = df[df['CRSDepTime'].isna() == False]

df['daytime'] = df['CRSDepTime'].apply(get_time_category)

cat_cols = ['DOT_ID_Operating_Airline', 'Airline', 'OriginAirportID','DistanceGroup', 'IATA_Code_Operating_Airline',
                'DestCityName', 'DestAirportID','IATA_Code_Marketing_Airline', 'daytime']

for c in df.columns:
    print(df[c].isna().value_counts(), end='\n\n')

False    29193782
Name: Month, dtype: int64

False    29193782
Name: DayOfWeek, dtype: int64

False    29193782
Name: DayofMonth, dtype: int64

False    29193782
Name: DOT_ID_Operating_Airline, dtype: int64

False    29193782
Name: Distance, dtype: int64

False    29193782
Name: DistanceGroup, dtype: int64

False    29193782
Name: DestAirportSeqID, dtype: int64

False    29193782
Name: CRSDepTime, dtype: int64

False    29193782
Name: IATA_Code_Operating_Airline, dtype: int64

False    29193782
Name: Airline, dtype: int64

False    29193782
Name: OriginAirportID, dtype: int64

False    29193782
Name: DestCityName, dtype: int64

False    29193782
Name: DestAirportID, dtype: int64

False    29193782
Name: Diverted, dtype: int64

False    29193782
Name: Cancelled, dtype: int64

False    29193782
Name: IATA_Code_Marketing_Airline, dtype: int64

False    28430698
True       763084
Name: DepDelayMinutes, dtype: int64

False    29193782
Name: daytime, dtype: int64



**Dropping samples with Nan values.**

In [68]:
df.dropna(how='any', inplace=True)

In [51]:
df.shape

(28430698, 18)

**Matching each sample to the delay group.**

In [69]:
df["DelayGroup"] = None

df.loc[df["DepDelayMinutes"] == 0, "DelayGroup"] = "OnTime_Early"

df.loc[
    (df["DepDelayMinutes"] > 0) & (df["DepDelayMinutes"] <= 120), "DelayGroup"
] = "Small_Delay"

df.loc[
    (df["DepDelayMinutes"] > 120) & (df["DepDelayMinutes"] <= 240), "DelayGroup"
] = "Medium_Delay"

df.loc[
df["DepDelayMinutes"] > 240 & (df["DepDelayMinutes"] <= 480), "DelayGroup"] = "Large_Delay"

df.loc[df["DepDelayMinutes"] > 480, "DelayGroup"] = "Very_Large_Delay"

df.loc[df["Cancelled"], "DelayGroup"] = "Cancelled"

**Lets encode target variable**

In [70]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

df['DelayGroup'] = encoder.fit_transform(df['DelayGroup'])

In [71]:
print("Categories:", encoder.classes_)
print("Matching:", dict(zip(encoder.classes_, range(len(encoder.classes_)))))

Categories: ['Cancelled' 'Large_Delay' 'OnTime_Early' 'Very_Large_Delay']
Matching: {'Cancelled': 0, 'Large_Delay': 1, 'OnTime_Early': 2, 'Very_Large_Delay': 3}


In [72]:
df['DelayGroup'].value_counts()

2    19240509
1     9136152
3       38741
0       15296
Name: DelayGroup, dtype: int64

In [77]:
X = df[['Month', 'DayOfWeek', 'DayofMonth', 'DOT_ID_Operating_Airline', 'Distance',
          'DistanceGroup', 'DestAirportSeqID', 'daytime', 'DepDelayMinutes',
          'Airline', 'OriginAirportID', 'DestCityName', 'DestAirportID', 'Diverted', 'Cancelled',
          'IATA_Code_Operating_Airline', 'IATA_Code_Marketing_Airline']]
y = df['DelayGroup']

**Split data on train, validation and test sets.**

In [78]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

**Using target encoder to encode categorical features.**

In [79]:
X_train_enc, X_valid_enc, X_test_enc = X_train, X_valid, X_test

for col in cat_cols:
    X_train_enc, X_valid_enc, X_test_enc = target_encode(X_train, X_valid, X_test, col, 'DepDelayMinutes')

X_train_enc.drop('DepDelayMinutes', axis=1, inplace=True)
X_valid_enc.drop('DepDelayMinutes', axis=1, inplace=True)
X_test_enc.drop('DepDelayMinutes', axis=1, inplace=True)

In [80]:
X_train_enc.head()

Unnamed: 0,Month,DayOfWeek,DayofMonth,DOT_ID_Operating_Airline,Distance,DistanceGroup,DestAirportSeqID,daytime,Airline,OriginAirportID,DestCityName,DestAirportID,Diverted,Cancelled,IATA_Code_Operating_Airline,IATA_Code_Marketing_Airline
16001364,9,3,12,15.807315,429.0,12.222966,1226603,14.351621,15.807315,13.131755,12.194356,12.522428,False,False,15.807315,15.007885
12931245,2,1,26,13.708226,602.0,12.817629,1129202,8.676203,13.708226,11.80927,12.922747,12.922747,False,False,13.708226,15.007885
17802687,10,6,24,13.708226,1436.0,12.631136,1449202,8.676203,13.708226,14.81044,13.280874,13.280874,False,False,13.708226,15.007885
20339395,2,5,21,13.708226,2565.0,13.419121,1477104,8.676203,13.708226,19.078779,15.520079,15.520079,False,False,13.708226,15.007885
4295015,3,3,24,10.856823,231.0,11.849335,1233904,14.351621,10.856823,11.143907,12.891093,12.891093,False,False,10.856823,10.633631
