In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno 
import math
from tqdm import tqdm
from scipy.stats import uniform
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (
    LinearRegression,
    Lasso,
    Ridge,
    ElasticNet
)
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import (
    KFold,
    ShuffleSplit,
    cross_val_score,
    cross_validate,
    RandomizedSearchCV,
    GridSearchCV
)
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    median_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
    make_scorer
    
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# --- train dataset prepare ---

In [2]:
# Load the raw training flights; the path is relative to the notebook's working directory.
train_csv_path = "2024/delays_train.csv"
delay_reg = pd.read_csv(train_csv_path)

In [3]:
# Keep only flights whose Arrival_Delay is strictly below 1900.
# Rows with a missing Arrival_Delay fail the comparison and are dropped too.
arrival_delay_in_range = delay_reg['Arrival_Delay'].lt(1900)
delay_reg = delay_reg[arrival_delay_in_range]

In [4]:
# Keep rows whose (Arrival_Delay - Departure_Delay) lies strictly inside (-61, 120).
# A missing value in either column yields a NaN difference, which fails both
# comparisons, so such rows are removed as well.
delay_change = delay_reg['Arrival_Delay'] - delay_reg['Departure_Delay']
delay_reg = delay_reg[(delay_change > -61) & (delay_change < 120)]

In [5]:
# Work on a copy so the helper columns below never land in delay_reg.
delay_time = delay_reg.copy()

# Each clock column is split into its last two digits plus 60 x the leading
# digits -- presumably an HHMM encoding converted to minutes; TODO confirm.
clock_columns = (
    ('Scheduled_Departure_Time', 'Scheduled_Departure_Time_min'),
    ('Actual_Departure_Time', 'Actual_Departure_Time_min'),
)
for hhmm_col, minutes_col in clock_columns:
    hhmm = delay_time[hhmm_col]
    delay_time[minutes_col] = hhmm % 100 + np.floor(hhmm / 100) * 60

# Raw actual-minus-scheduled gap; used later as a stand-in for missing Departure_Delay.
delay_time['Departure_Delay_fornan'] = (
    delay_time['Actual_Departure_Time_min'] - delay_time['Scheduled_Departure_Time_min']
)

In [6]:
def custom_function(x):
    """Undo a wrap past midnight in an actual-minus-scheduled time gap.

    Values strictly below -62 are shifted up by one day (24 * 60); every
    other value, including NaN, is returned unchanged.
    """
    minutes_per_day = 24 * 60
    return minutes_per_day + x if x < -62 else x
# Apply the midnight wrap-around correction to every raw gap, element by element.
raw_departure_gap = delay_time['Departure_Delay_fornan']
delay_time['Departure_Delay_fornan'] = raw_departure_gap.apply(custom_function)

In [7]:
# Fill missing Departure_Delay values with the clock-derived gap computed in delay_time
# (same index, because delay_time was copied from delay_reg).
#
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that form is chained assignment, which raises a warning on
# pandas 2.x and silently leaves delay_reg unchanged under Copy-on-Write.
#
# Note: the earlier Arrival_Delay - Departure_Delay range filters already drop
# rows whose Departure_Delay is missing (a NaN difference fails both comparisons),
# so on the training frame this only has an effect if those filters change.
delay_reg = delay_reg.assign(
    Departure_Delay=delay_reg['Departure_Delay'].fillna(delay_time['Departure_Delay_fornan'])
)

In [8]:
# Replace missing wind speeds with the median of the same (month, arrival state) group.
# A group with no observed value keeps its NaNs, since its median is NaN too.
# NOTE(review): Origin_Wind_Speed is also grouped by Arrival_State -- confirm this is
# intended rather than an origin-side key.
wind_group_keys = ['Month_of_Year', 'Arrival_State']
for wind_col in ('Destination_Wind_Speed', 'Origin_Wind_Speed'):
    grouped_wind = delay_reg.groupby(by=wind_group_keys)[wind_col]
    delay_reg[wind_col] = grouped_wind.transform(lambda s: s.fillna(s.median()))

In [9]:
# Narrow the training frame to the target plus the five model inputs.
training_columns = [
    'Arrival_Delay',
    'Departure_Delay',
    'Taxi_Out_Time',
    'Taxi_In_Time',
    'Origin_Wind_Speed',
    'Destination_Wind_Speed',
]
delay_reg = delay_reg.loc[:, training_columns].copy()

In [10]:
# Model input columns, in the order they are fed to the regressor.
features = [
    'Departure_Delay',
    'Taxi_Out_Time',
    'Taxi_In_Time',
    'Origin_Wind_Speed',
    'Destination_Wind_Speed',
]

In [11]:
# Design matrix and target as independent copies of the training frame.
# The target is kept as a one-column DataFrame (not a Series).
target_columns = ['Arrival_Delay']
x_delay = delay_reg.loc[:, features].copy()
y_delay = delay_reg.loc[:, target_columns].copy()

# --- test dataset prepare ---

In [12]:
# Load the raw test flights; the path is relative to the notebook's working directory.
test_csv_path = "2024/delays_test.csv"
delay_test = pd.read_csv(test_csv_path)

In [13]:
# Number the rows 0..n-1 before any filtering, so each ID is the row's position
# in the test file as loaded.
test_row_count = len(delay_test)
delay_test['ID'] = range(test_row_count)

In [15]:
# Keep only flights explicitly marked as not cancelled; the removed rows
# receive no prediction further down.
not_cancelled = delay_test['Flight_Cancelled'].eq(False)
delay_test = delay_test[not_cancelled]

In [16]:
# Work on a copy so the helper columns below never land in delay_test.
# This rebinds delay_time, replacing the frame built for the training data.
delay_time = delay_test.copy()

# Each clock value is split into its last two digits plus 60 x the leading
# digits -- presumably an HHMM encoding converted to minutes; TODO confirm.
scheduled_hhmm = delay_time['Scheduled_Departure_Time']
actual_hhmm = delay_time['Actual_Departure_Time']
delay_time['Scheduled_Departure_Time_min'] = scheduled_hhmm % 100 + np.floor(scheduled_hhmm / 100) * 60
delay_time['Actual_Departure_Time_min'] = actual_hhmm % 100 + np.floor(actual_hhmm / 100) * 60

# Raw actual-minus-scheduled gap; used later as a stand-in for missing Departure_Delay.
delay_time['Departure_Delay_fornan'] = (
    delay_time['Actual_Departure_Time_min'] - delay_time['Scheduled_Departure_Time_min']
)

In [17]:
def custom_function(x):
    """Shift a time gap below -62 forward by one day (24 * 60); pass anything else through.

    This repeats the definition used for the training data earlier in the
    notebook. NaN fails the comparison and is therefore returned unchanged.
    """
    if not x < -62:
        return x
    return 24 * 60 + x
# Apply the midnight wrap-around correction to every raw gap, element by element.
raw_departure_gap = delay_time['Departure_Delay_fornan']
delay_time['Departure_Delay_fornan'] = raw_departure_gap.apply(custom_function)

In [None]:
# Fill missing Departure_Delay values with the clock-derived gap computed in delay_time
# (same index, because delay_time was copied from delay_test).
#
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that form is chained assignment, which raises a warning on
# pandas 2.x and silently leaves delay_test unchanged under Copy-on-Write --
# the missing values would then reach the model's input unfilled.
delay_test = delay_test.assign(
    Departure_Delay=delay_test['Departure_Delay'].fillna(delay_time['Departure_Delay_fornan'])
)

In [19]:
# Keep only rows with a recorded Taxi_In_Time; the removed rows receive no
# prediction further down.
has_taxi_in = delay_test['Taxi_In_Time'].notna()
delay_test = delay_test[has_taxi_in]

In [20]:
# Replace missing wind speeds with the median of the same (month, arrival state) group,
# computed on the test frame itself. A group with no observed value keeps its NaNs.
# NOTE(review): Origin_Wind_Speed is also grouped by Arrival_State -- confirm this is
# intended rather than an origin-side key.
wind_group_keys = ['Month_of_Year', 'Arrival_State']
for wind_col in ('Destination_Wind_Speed', 'Origin_Wind_Speed'):
    grouped_wind = delay_test.groupby(by=wind_group_keys)[wind_col]
    delay_test[wind_col] = grouped_wind.transform(lambda s: s.fillna(s.median()))

In [21]:
# Narrow the test frame to the five model inputs plus the row ID.
kept_test_columns = [
    'Departure_Delay',
    'Taxi_Out_Time',
    'Taxi_In_Time',
    'Origin_Wind_Speed',
    'Destination_Wind_Speed',
    'ID',
]
delay_test = delay_test.loc[:, kept_test_columns].copy()

# --- model ---

In [23]:
# Ordinary least squares with an intercept, fitted on the training matrix and target.
reg = LinearRegression(fit_intercept=True)
reg.fit(X=x_delay, y=y_delay)

# Predict on exactly the columns (and column order) the model was trained on.
# Reusing `features` instead of a second hand-typed list keeps training and
# inference inputs from drifting apart when the feature set is edited.
# y_delay is a one-column DataFrame, so y_pred comes back with shape (n_rows, 1).
y_pred = reg.predict(X=delay_test[features])

In [24]:
# Store the predictions next to the inputs and ID of the rows they were computed for.
delay_test = delay_test.assign(Arrival_Delay_pred=y_pred)

In [25]:
# Result table: one row per surviving test flight, ID plus predicted arrival delay.
result_columns = ['ID', 'Arrival_Delay_pred']
final_res = delay_test[result_columns]

In [26]:
# Write the results next to the notebook. The DataFrame index is written as an
# unnamed first column (the to_csv default, spelled out here) in addition to ID.
final_res.to_csv("delay_results.csv", index=True)