<a href="https://colab.research.google.com/github/AdamRidene/Zindi_Challenges_submissions/blob/main/Flight_Delay_Prediction_Zindi_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
pip install -q lightgbm xgboost

In [None]:
# Load the training CSV; the /content/ location presumably refers to a Colab upload.
dataset = pd.read_csv(filepath_or_buffer="/content/Train (1).csv")

### Checking Data

In [None]:
# Print the column names, non-null counts and dtypes as a quick schema check.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107833 entries, 0 to 107832
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      107833 non-null  object 
 1   DATOP   107833 non-null  object 
 2   FLTID   107833 non-null  object 
 3   DEPSTN  107833 non-null  object 
 4   ARRSTN  107833 non-null  object 
 5   STD     107833 non-null  object 
 6   STA     107833 non-null  object 
 7   STATUS  107833 non-null  object 
 8   AC      107833 non-null  object 
 9   target  107833 non-null  float64
dtypes: float64(1), object(9)
memory usage: 8.2+ MB


In [None]:
# Peek at the last five rows to eyeball the raw STD / STA formats.
dataset.tail(n=5)

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
107828,train_id_107828,2018-07-05,WKL 0000,TUN,TUN,2018-07-05 23:00:00,2018-07-06 02.00.00,SCH,TU 32AIML,0.0
107829,train_id_107829,2018-01-13,UG 0003,DJE,TUN,2018-01-13 08:00:00,2018-01-13 09.00.00,SCH,UG AT7AT7,0.0
107830,train_id_107830,2018-11-07,SGT 0000,TUN,TUN,2018-11-07 05:00:00,2018-11-07 12.50.00,SCH,TU 736IOK,0.0
107831,train_id_107831,2018-01-23,UG 0010,TUN,DJE,2018-01-23 18:00:00,2018-01-23 18.45.00,ATA,TU CR9ISA,0.0
107832,train_id_107832,2018-11-13,UG 0002,TUN,DJE,2018-11-13 06:15:00,2018-11-13 07.05.00,SCH,TU CR9ISA,0.0


### Data Preprocessing

In [None]:
# Count missing values per column.
dataset.isna().sum()

Unnamed: 0,0
ID,0
DATOP,0
FLTID,0
DEPSTN,0
ARRSTN,0
STD,0
STA,0
STATUS,0
AC,0
target,0


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
class FlightDelayTransformer(BaseEstimator, TransformerMixin):
    """Feature-engineering step for the flight-delay frames.

    The transformer parses ``STD`` and ``STA`` as datetimes, derives
    ``expected_duration`` (STA minus STD, in minutes), drops the configured
    columns and converts every remaining ``object`` column to ``category``.

    The category levels are learned in :meth:`fit` and reused in
    :meth:`transform`, so the integer codes behind each category column are
    identical for the training frame and for any later validation/test frame.
    Labels that were not present during ``fit`` are mapped to
    ``UNKNOWN_CATEGORY`` instead of silently receiving their own codes.
    NOTE(review): estimators that consume the raw category codes may otherwise
    mis-read validation/test rows -- confirm for the library versions in use.

    Parameters
    ----------
    drop_columns : list of str, optional
        Columns removed after ``expected_duration`` has been computed.
        ``None`` (the default) means ``DEFAULT_DROP_COLUMNS``.

    Attributes
    ----------
    category_dtypes_ : dict
        Set by :meth:`fit`. Maps each categorical output column to the
        ``pandas.CategoricalDtype`` applied by :meth:`transform`.
    """

    # Columns removed when ``drop_columns`` is left as None.
    DEFAULT_DROP_COLUMNS = ('STD', 'STA', 'DATOP', 'ID')
    # Stand-in level for labels that were not present in the data given to fit().
    UNKNOWN_CATEGORY = '__unknown__'

    def __init__(self, drop_columns=None):
        # scikit-learn convention: keep constructor arguments exactly as given so
        # get_params / set_params / clone round-trip; the default is resolved lazily.
        self.drop_columns = drop_columns

    def _engineer(self, X):
        """Return a copy of ``X`` with ``expected_duration`` added and columns dropped."""
        # Work on a copy so the caller's DataFrame is never modified.
        X_ = X.copy()
        X_['STD'] = pd.to_datetime(X_['STD'])
        # STA uses '.' as the time separator (e.g. '02.00.00'); normalise to ':'
        # before parsing.
        X_['STA'] = pd.to_datetime(X_['STA'].str.replace('.', ':', regex=False))
        # Scheduled flight duration in minutes.
        X_["expected_duration"] = (X_['STA'] - X_['STD']).dt.total_seconds() / 60
        if self.drop_columns is None:
            to_drop = list(self.DEFAULT_DROP_COLUMNS)
        else:
            to_drop = self.drop_columns
        return X_.drop(columns=to_drop)

    def fit(self, X, y=None):
        """Learn the category levels of every ``object`` column of the engineered frame.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw frame containing at least ``STD`` and ``STA``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        FlightDelayTransformer
            ``self``, to allow chaining.
        """
        engineered = self._engineer(X)
        category_dtypes = {}
        for col in engineered.columns:
            if engineered[col].dtype == 'object':
                levels = list(engineered[col].astype('category').cat.categories)
                # Reserve one extra level so unseen labels have a stable code.
                if self.UNKNOWN_CATEGORY not in levels:
                    levels.append(self.UNKNOWN_CATEGORY)
                category_dtypes[col] = pd.CategoricalDtype(categories=levels)
        self.category_dtypes_ = category_dtypes
        return self

    def transform(self, X, y=None):
        """Apply the feature engineering and the categorical encoding.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw frame containing at least ``STD`` and ``STA``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            Engineered copy of ``X``; the input is left untouched.
        """
        X_ = self._engineer(X)
        learned = getattr(self, 'category_dtypes_', None)
        if learned is None:
            # Not fitted: keep the earlier stateless behaviour and derive the
            # levels from the frame at hand.
            for col in X_.columns:
                if X_[col].dtype == 'object':
                    X_[col] = X_[col].astype('category')
            return X_
        for col, cat_dtype in learned.items():
            values = X_[col]
            # Missing values stay missing; only real-but-unseen labels are remapped.
            unseen = values.notna() & ~values.isin(cat_dtype.categories)
            X_[col] = values.mask(unseen, self.UNKNOWN_CATEGORY).astype(cat_dtype)
        return X_



### Model Training

Here we will use LightGBM

In [None]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import numpy as np

# Separate the regression target from the predictors.
y_train = dataset['target']
X_train = dataset.drop(columns='target')

# Chain feature engineering and the LightGBM regressor so that cross-validation
# runs the transformer inside every fold.
full_pipeline = Pipeline(
    steps=[
        ('feature_engineering', FlightDelayTransformer()),
        ('model', LGBMRegressor(random_state=42)),
    ]
)

# The `model__` prefix routes each hyper-parameter to the pipeline's 'model' step.
param_grid = dict(
    model__n_estimators=[100, 200],
    model__learning_rate=[0.05, 0.1],
    model__num_leaves=[31, 50],
)

# The score is the negated MSE, so a higher value is better.
grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
# Flip the sign back and take the square root to report an RMSE.
best_rmse = np.sqrt(-grid_search.best_score_)
print("Best RMSE found: ", best_rmse)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1508
[LightGBM] [Info] Number of data points in the train set: 107833, number of used features: 6
[LightGBM] [Info] Start training from score 48.733013
Best parameters found:  {'model__learning_rate': 0.05, 'model__n_estimators': 100, 'model__num_leaves': 31}
Best RMSE found:  116.47203733063424


Here we will use XGBRegressor

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import numpy as np

# Separate the regression target from the predictors.
y_train = dataset['target']
X_train = dataset.drop(columns='target')

# enable_categorical=True lets XGBoost take the pandas 'category' columns
# produced by the transformer directly.
full_pipeline_xgb = Pipeline(
    steps=[
        ('feature_engineering', FlightDelayTransformer()),
        ('model', XGBRegressor(random_state=42, enable_categorical=True)),
    ]
)

# The `model__` prefix routes each hyper-parameter to the pipeline's 'model' step.
param_grid_xgb = dict(
    model__n_estimators=[100, 200],
    model__learning_rate=[0.05, 0.1],
    model__max_depth=[3, 5, 7],
)

# The score is the negated MSE, so a higher value is better.
grid_search_xgb = GridSearchCV(
    estimator=full_pipeline_xgb,
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
grid_search_xgb.fit(X_train, y_train)

print("Best parameters found for XGBoost: ", grid_search_xgb.best_params_)
# Flip the sign back and take the square root to report an RMSE.
best_rmse_xgb = np.sqrt(-grid_search_xgb.best_score_)
print("Best RMSE found for XGBoost: ", best_rmse_xgb)

Best parameters found for XGBoost:  {'model__learning_rate': 0.05, 'model__max_depth': 5, 'model__n_estimators': 100}
Best RMSE found for XGBoost:  146.33613100325127


Here we will use CatBoost

In [None]:
pip install -q catboost

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
# Separate the regression target from the predictors.
y_train = dataset['target']
X_train = dataset.drop(columns='target')

# CatBoost must be told explicitly which columns are categorical, so run the
# transformer once up front purely to discover their names.
feature_transformer = FlightDelayTransformer()
X_train_transformed = feature_transformer.fit_transform(X_train)
categorical_features_names = (
    X_train_transformed.select_dtypes(include=['category', 'object']).columns.tolist()
)

# verbose=0 makes CatBoost train silently.
catboost_model = CatBoostRegressor(
    cat_features=categorical_features_names,
    random_state=42,
    verbose=0,
)
full_pipeline_catboost = Pipeline(
    steps=[
        ('feature_engineering', FlightDelayTransformer()),
        ('model', catboost_model),
    ]
)

# The `model__` prefix routes each hyper-parameter to the pipeline's 'model' step.
param_grid_catboost = dict(
    model__iterations=[100, 200],
    model__learning_rate=[0.05, 0.1],
    model__depth=[6, 8],
)

# The score is the negated MSE, so a higher value is better.
grid_search_catboost = GridSearchCV(
    estimator=full_pipeline_catboost,
    param_grid=param_grid_catboost,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
grid_search_catboost.fit(X_train, y_train)

print("Best parameters found for CatBoost: ", grid_search_catboost.best_params_)
# Flip the sign back and take the square root to report an RMSE.
best_rmse_catboost = np.sqrt(-grid_search_catboost.best_score_)
print("Best RMSE found for CatBoost: ", best_rmse_catboost)



Best parameters found for CatBoost:  {'model__depth': 8, 'model__iterations': 200, 'model__learning_rate': 0.05}
Best RMSE found for CatBoost:  114.52786747392142


### Testing

In [None]:
# Read the test file and score it with the tuned LightGBM search object; the raw
# frame is passed in, so the pipeline's transformer handles the feature engineering.
test_df = pd.read_csv(filepath_or_buffer="/content/Test.csv")
predictions = grid_search.predict(test_df)

print("Predictions:", predictions)

Predictions: [21.34765048 74.13684447 27.48454434 ...  1.44471374  0.28900093
 -0.37193996]


In [None]:
# Pair each test ID with its prediction rounded to an integer, then write the CSV.
submission_df = test_df[['ID']].assign(target=predictions.round().astype(int))
submission_df.to_csv('submission.csv', index=False)

Test using CatBoost results

In [None]:
# Score the test frame with the tuned CatBoost search object.
predictions_catboost = grid_search_catboost.predict(test_df)
# Pair each test ID with its prediction rounded to an integer, then write the CSV.
submission_df_catboost = test_df[['ID']].assign(
    target=predictions_catboost.round().astype(int)
)
submission_df_catboost.to_csv('submission_catboost.csv', index=False)

Test using XGBoost results

In [None]:
# Score the test frame with the tuned XGBoost search object.
predictions_xgb = grid_search_xgb.predict(test_df)
# Pair each test ID with its prediction rounded to an integer, then write the CSV.
submission_df_xgb = test_df[['ID']].assign(
    target=predictions_xgb.round().astype(int)
)
submission_df_xgb.to_csv('submission_xgb.csv', index=False)