In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV


# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read each CSV file into separate DataFrames
df1 = pd.read_csv('1996_data.csv')
df2 = pd.read_csv('2000_data.csv')
df3 = pd.read_csv('2002_data.csv')
df4 = pd.read_csv('2003_data.csv')
df5 = pd.read_csv('2007_data.csv')

# Merge the DataFrames
df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

# Save the merged DataFrame to a new CSV file
df.to_csv('merged_file.csv', index=False)

In [3]:
df.dtypes

year                   int64
month                  int64
dayofmonth             int64
dayofweek              int64
deptime              float64
crsdeptime             int64
arrtime              float64
crsarrtime             int64
uniquecarrier         object
flightnum              int64
tailnum               object
actualelapsedtime    float64
crselapsedtime       float64
airtime              float64
arrdelay             float64
depdelay             float64
origin                object
dest                  object
distance               int64
taxiin                 int64
taxiout                int64
cancelled              int64
cancellationcode      object
diverted               int64
carrierdelay         float64
weatherdelay         float64
nasdelay             float64
securitydelay        float64
lateaircraftdelay    float64
delayed               object
dtype: object

In [4]:
df.isnull().sum()

year                      0
month                     0
dayofmonth                0
dayofweek                 0
deptime                3167
crsdeptime                0
arrtime                3495
crsarrtime                0
uniquecarrier             0
flightnum                 0
tailnum                 284
actualelapsedtime      3495
crselapsedtime           23
airtime                3495
arrdelay               3495
depdelay               3167
origin                    0
dest                      0
distance                  0
taxiin                    0
taxiout                   0
cancelled                 0
cancellationcode     149124
diverted                  0
carrierdelay         102288
weatherdelay         102288
nasdelay             102288
securitydelay        102288
lateaircraftdelay    102288
delayed                   0
dtype: int64

In [5]:
#Droping columns with a large number of missing values
df.drop(columns=['carrierdelay', 'weatherdelay', 'nasdelay', 'securitydelay', 'lateaircraftdelay','cancellationcode', 'arrdelay', 'depdelay'], inplace=True)
#To Check if there are any remaining missing values
print(df.isnull().sum())

year                    0
month                   0
dayofmonth              0
dayofweek               0
deptime              3167
crsdeptime              0
arrtime              3495
crsarrtime              0
uniquecarrier           0
flightnum               0
tailnum               284
actualelapsedtime    3495
crselapsedtime         23
airtime              3495
origin                  0
dest                    0
distance                0
taxiin                  0
taxiout                 0
cancelled               0
diverted                0
delayed                 0
dtype: int64


In [6]:
from sklearn.impute import SimpleImputer

# Separate numerical and categorical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute missing values for numerical columns with the median
numerical_imputer = SimpleImputer(strategy='median')
df[numerical_cols] = numerical_imputer.fit_transform(df[numerical_cols])

# Impute missing values for categorical columns with the most frequent category
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

# Check if there are any remaining missing values
print(df.isnull().sum())

year                 0
month                0
dayofmonth           0
dayofweek            0
deptime              0
crsdeptime           0
arrtime              0
crsarrtime           0
uniquecarrier        0
flightnum            0
tailnum              0
actualelapsedtime    0
crselapsedtime       0
airtime              0
origin               0
dest                 0
distance             0
taxiin               0
taxiout              0
cancelled            0
diverted             0
delayed              0
dtype: int64


In [20]:
df_encoded = df.copy()

label_encoder = LabelEncoder()
for column in df_encoded.columns:
    if df_encoded[column].dtype == 'object':  #To Check if the column is categorical
        df_encoded[column] = label_encoder.fit_transform(df_encoded[column])

In [21]:
df_encoded

Unnamed: 0,year,month,dayofmonth,dayofweek,deptime,crsdeptime,arrtime,crsarrtime,uniquecarrier,flightnum,...,crselapsedtime,airtime,origin,dest,distance,taxiin,taxiout,cancelled,diverted,delayed
0,1996.0,12.0,15.0,7.0,1605.0,0.0,1804.0,0.0,1,1481.0,...,114.0,80.0,19,187,595.0,23.0,16.0,0.0,0.0,1
1,1996.0,2.0,5.0,1.0,1242.0,1240.0,1533.0,1541.0,19,286.0,...,121.0,96.0,81,248,794.0,5.0,10.0,0.0,0.0,1
2,1996.0,12.0,28.0,6.0,1335.0,1330.0,1606.0,1610.0,7,1183.0,...,100.0,75.0,272,19,483.0,8.0,8.0,0.0,0.0,1
3,1996.0,12.0,10.0,2.0,857.0,0.0,1117.0,0.0,1,1689.0,...,262.0,242.0,212,269,1726.0,6.0,12.0,0.0,0.0,0
4,1996.0,10.0,5.0,6.0,1420.0,1420.0,1459.0,1505.0,21,1364.0,...,105.0,85.0,49,35,588.0,5.0,9.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,2007.0,2.0,19.0,1.0,825.0,835.0,950.0,1008.0,8,4558.0,...,93.0,56.0,104,19,331.0,17.0,12.0,0.0,0.0,0
149996,2007.0,5.0,3.0,4.0,1242.0,1246.0,1425.0,1428.0,15,5561.0,...,102.0,82.0,122,164,461.0,10.0,11.0,0.0,0.0,0
149997,2007.0,5.0,19.0,6.0,1000.0,1005.0,1058.0,1055.0,13,4220.0,...,50.0,34.0,119,211,174.0,14.0,10.0,0.0,0.0,1
149998,2007.0,6.0,17.0,7.0,2143.0,2000.0,2231.0,2055.0,13,4554.0,...,55.0,35.0,38,29,201.0,3.0,10.0,0.0,0.0,1


In [22]:
df_encoded.dtypes

year                 float64
month                float64
dayofmonth           float64
dayofweek            float64
deptime              float64
crsdeptime           float64
arrtime              float64
crsarrtime           float64
uniquecarrier          int32
flightnum            float64
tailnum                int32
actualelapsedtime    float64
crselapsedtime       float64
airtime              float64
origin                 int32
dest                   int32
distance             float64
taxiin               float64
taxiout              float64
cancelled            float64
diverted             float64
delayed                int32
dtype: object

In [23]:
df_encoded.isnull().sum()

year                 0
month                0
dayofmonth           0
dayofweek            0
deptime              0
crsdeptime           0
arrtime              0
crsarrtime           0
uniquecarrier        0
flightnum            0
tailnum              0
actualelapsedtime    0
crselapsedtime       0
airtime              0
origin               0
dest                 0
distance             0
taxiin               0
taxiout              0
cancelled            0
diverted             0
delayed              0
dtype: int64

In [24]:
#numerical_cols = ['deptime', 'arrtime', 'actualelapsedtime', 'crselapsedtime', 'airtime', 'arrdelay', 'depdelay']
#for col in numerical_cols:
    #df_encoded[col].fillna(df_encoded[col].median(), inplace=True)

In [25]:
df_encoded

Unnamed: 0,year,month,dayofmonth,dayofweek,deptime,crsdeptime,arrtime,crsarrtime,uniquecarrier,flightnum,...,crselapsedtime,airtime,origin,dest,distance,taxiin,taxiout,cancelled,diverted,delayed
0,1996.0,12.0,15.0,7.0,1605.0,0.0,1804.0,0.0,1,1481.0,...,114.0,80.0,19,187,595.0,23.0,16.0,0.0,0.0,1
1,1996.0,2.0,5.0,1.0,1242.0,1240.0,1533.0,1541.0,19,286.0,...,121.0,96.0,81,248,794.0,5.0,10.0,0.0,0.0,1
2,1996.0,12.0,28.0,6.0,1335.0,1330.0,1606.0,1610.0,7,1183.0,...,100.0,75.0,272,19,483.0,8.0,8.0,0.0,0.0,1
3,1996.0,12.0,10.0,2.0,857.0,0.0,1117.0,0.0,1,1689.0,...,262.0,242.0,212,269,1726.0,6.0,12.0,0.0,0.0,0
4,1996.0,10.0,5.0,6.0,1420.0,1420.0,1459.0,1505.0,21,1364.0,...,105.0,85.0,49,35,588.0,5.0,9.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,2007.0,2.0,19.0,1.0,825.0,835.0,950.0,1008.0,8,4558.0,...,93.0,56.0,104,19,331.0,17.0,12.0,0.0,0.0,0
149996,2007.0,5.0,3.0,4.0,1242.0,1246.0,1425.0,1428.0,15,5561.0,...,102.0,82.0,122,164,461.0,10.0,11.0,0.0,0.0,0
149997,2007.0,5.0,19.0,6.0,1000.0,1005.0,1058.0,1055.0,13,4220.0,...,50.0,34.0,119,211,174.0,14.0,10.0,0.0,0.0,1
149998,2007.0,6.0,17.0,7.0,2143.0,2000.0,2231.0,2055.0,13,4554.0,...,55.0,35.0,38,29,201.0,3.0,10.0,0.0,0.0,1


In [26]:
#Spliting the dataset into Training and Testing sets
X = df_encoded.drop(columns=['delayed'])
y = df_encoded['delayed']

In [27]:
# split the data in train validate test in 70:15:15

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=123)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=123)

In [28]:
import xgboost as xgb
# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost model
model = xgb.XGBClassifier(
    learning_rate=0.01,
    max_depth=15,
    n_estimators=1500,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    gamma=0.1,
    objective='binary:logistic'
)

model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, predictions)
print("Accuracy Score:", accuracy)

# Calculate precision score
precision = precision_score(y_test, predictions)
print("Precision Score:", precision)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

Accuracy Score: 0.9059333333333334
Precision Score: 0.9329253214638972
Confusion Matrix:
[[12087  1085]
 [ 1737 15091]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.92      0.90     13172
           1       0.93      0.90      0.91     16828

    accuracy                           0.91     30000
   macro avg       0.90      0.91      0.90     30000
weighted avg       0.91      0.91      0.91     30000



In [29]:
gradient_boost = GradientBoostingClassifier(
    n_estimators=100,       # Number of boosting stages
    learning_rate=0.1,      # Learning rate
    max_depth=3,            # Maximum depth of individual trees
    min_samples_split=2,    # Minimum number of samples required to split an internal node
    min_samples_leaf=1,     # Minimum number of samples required to be at a leaf node
    random_state=42         # Random state for reproducibility
)

# Train the classifier
gradient_boost.fit(X_train, y_train)

# Make predictions on the validation set
predictions_val = gradient_boost.predict(X_val)

# Calculate accuracy on the validation set
accuracy_val = accuracy_score(y_val, predictions_val)
print("Validation Accuracy:", accuracy_val)

# Make predictions on the testing set
predictions_test = gradient_boost.predict(X_test)

# Calculate accuracy on the testing set
accuracy_test = accuracy_score(y_test, predictions_test)
print("Test Accuracy:", accuracy_test)

# Generate and print classification report
print("Classification Report:")
print(classification_report(y_test, predictions_test))

# Generate and print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions_test))

Validation Accuracy: 0.7275294117647059
Test Accuracy: 0.7232
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.66      0.68     13172
           1       0.74      0.77      0.76     16828

    accuracy                           0.72     30000
   macro avg       0.72      0.72      0.72     30000
weighted avg       0.72      0.72      0.72     30000

Confusion Matrix:
[[ 8715  4457]
 [ 3847 12981]]


In [30]:
# Initialize the decision tree classifier
decision_tree = DecisionTreeClassifier(random_state=123)

# Train the decision tree classifier on the training set
decision_tree.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_val = decision_tree.predict(X_val)

# Calculate the accuracy score for the validation set
accuracy_val = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", accuracy_val)

# Fine-tune the decision tree classifier using hyperparameter tuning
param_grid = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(decision_tree, param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Make predictions on the validation set using the best model
y_pred_val = grid_search.best_estimator_.predict(X_val)

# Calculate the accuracy score for the validation set using the best model
accuracy_val = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy after hyperparameter tuning:", accuracy_val)

# Calculate the precision score for the validation set using the best model
precision_val = precision_score(y_val, y_pred_val)
print("Validation Precision:", precision_val)

# Generate confusion matrix for validation set
conf_matrix_val = confusion_matrix(y_val, y_pred_val)
print("Confusion Matrix for Validation Set:")
print(conf_matrix_val)

# Generate classification report for validation set
class_report_val = classification_report(y_val, y_pred_val)
print("Classification Report for Validation Set:")
print(class_report_val)

# Make predictions on the test set using the best model
y_pred_test = grid_search.best_estimator_.predict(X_test)

# Calculate the accuracy score for the test set using the best model
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Test Accuracy after hyperparameter tuning:", accuracy_test)

# Calculate the precision score for the test set using the best model
precision_test = precision_score(y_test, y_pred_test)
print("Test Precision:", precision_test)

# Generate confusion matrix for test set
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix for Test Set:")
print(conf_matrix_test)

# Generate classification report for test set
class_report_test = classification_report(y_test, y_pred_test)
print("Classification Report for Test Set:")
print(class_report_test)

Validation Accuracy: 0.9468235294117647
Validation Accuracy after hyperparameter tuning: 0.8855424836601307
Validation Precision: 0.9109987598969761
Confusion Matrix for Validation Set:
[[7386  933]
 [1256 9550]]
Classification Report for Validation Set:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      8319
           1       0.91      0.88      0.90     10806

    accuracy                           0.89     19125
   macro avg       0.88      0.89      0.88     19125
weighted avg       0.89      0.89      0.89     19125

Test Accuracy after hyperparameter tuning: 0.7442
Test Precision: 0.7808320039268621
Confusion Matrix for Test Set:
[[ 9600  3572]
 [ 4102 12726]]
Classification Report for Test Set:
              precision    recall  f1-score   support

           0       0.70      0.73      0.71     13172
           1       0.78      0.76      0.77     16828

    accuracy                           0.74     30000
   macro avg     

In [31]:
random_forest = RandomForestClassifier(
    n_estimators=200,       # Number of trees in the forest
    max_depth=10,            # Maximum depth of the tree
    min_samples_split=4,    # Minimum number of samples required to split an internal node
    min_samples_leaf=2,     # Minimum number of samples required to be at a leaf node
    random_state=84         # Random state for reproducibility
)

# Train the classifier
random_forest.fit(X_train, y_train)

# Make predictions on the validation set
predictions_val = random_forest.predict(X_val)

# Calculate accuracy on the validation set
accuracy_val = accuracy_score(y_val, predictions_val)
print("Validation Accuracy:", accuracy_val)

# Make predictions on the testing set
predictions_test = random_forest.predict(X_test)

# Calculate accuracy on the testing set
accuracy_test = accuracy_score(y_test, predictions_test)
print("Test Accuracy:", accuracy_test)

# Generate and print classification report
print("Classification Report:")
print(classification_report(y_test, predictions_test))

# Generate and print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions_test))

Validation Accuracy: 0.7134640522875817
Test Accuracy: 0.6969666666666666
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.58      0.63     13172
           1       0.71      0.79      0.74     16828

    accuracy                           0.70     30000
   macro avg       0.69      0.68      0.69     30000
weighted avg       0.70      0.70      0.69     30000

Confusion Matrix:
[[ 7654  5518]
 [ 3573 13255]]


In [42]:
# Initialize the Logistic Regression model with specified parameters
logistic_regression = LogisticRegression(
    C=2.0,                   # Inverse of regularization strength
    penalty='l2',            # Regularization penalty ('l1' or 'l2')
    solver='newton-cg',      # Algorithm to use in the optimization problem
    max_iter=200,            # Maximum number of iterations
    random_state=123          # Random state for reproducibility
)

# Train the model on the training set
logistic_regression.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_val = logistic_regression.predict(X_val)

# Calculate the accuracy score for the validation set
accuracy_val = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", accuracy_val)

# Calculate the precision score for the validation set
precision_val = precision_score(y_val, y_pred_val)
print("Validation Precision:", precision_val)

# Generate confusion matrix for validation set
conf_matrix_val = confusion_matrix(y_val, y_pred_val)
print("Confusion Matrix for Validation Set:")
print(conf_matrix_val)

# Generate classification report for validation set
class_report_val = classification_report(y_val, y_pred_val)
print("Classification Report for Validation Set:")
print(class_report_val)

# Make predictions on the test set
y_pred_test = logistic_regression.predict(X_test)

# Calculate the accuracy score for the test set
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", accuracy_test)

# Calculate the precision score for the test set
precision_test = precision_score(y_test, y_pred_test)
print("Test Precision:", precision_test)

# Generate confusion matrix for test set
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix for Test Set:")
print(conf_matrix_test)

# Generate classification report for test set
class_report_test = classification_report(y_test, y_pred_test)
print("Classification Report for Test Set:")
print(class_report_test)

Validation Accuracy: 0.7194771241830066
Validation Precision: 0.747340667333394
Confusion Matrix for Validation Set:
[[5540 2779]
 [2586 8220]]
Classification Report for Validation Set:
              precision    recall  f1-score   support

           0       0.68      0.67      0.67      8319
           1       0.75      0.76      0.75     10806

    accuracy                           0.72     19125
   macro avg       0.71      0.71      0.71     19125
weighted avg       0.72      0.72      0.72     19125

Test Accuracy: 0.7183666666666667
Test Precision: 0.7445566516840815
Confusion Matrix for Test Set:
[[ 8796  4376]
 [ 4073 12755]]
Classification Report for Test Set:
              precision    recall  f1-score   support

           0       0.68      0.67      0.68     13172
           1       0.74      0.76      0.75     16828

    accuracy                           0.72     30000
   macro avg       0.71      0.71      0.71     30000
weighted avg       0.72      0.72      0.72     

### Target Data Prediction

In [43]:
data = pd.read_csv('Target_data.csv')

In [44]:
data

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,Delayed
0,1995,1,20,5,644,645,957,937,UA,482,...,17.0,0,,0,,,,,,?
1,1988,1,22,5,700,700,747,755,AS,60,...,,0,,0,,,,,,?
2,2006,1,27,5,803,815,924,1000,US,103,...,9.0,0,,0,0.0,0.0,0.0,0.0,0.0,?
3,1995,1,27,5,645,645,929,937,UA,482,...,15.0,0,,0,,,,,,?
4,1988,1,29,5,700,700,750,755,AS,60,...,,0,,0,,,,,,?
5,1988,1,2,6,705,700,759,755,AS,60,...,,0,,0,,,,,,?
6,2006,1,7,6,1450,1455,1640,1650,WN,2742,...,10.0,0,,0,0.0,0.0,0.0,0.0,0.0,?
7,1995,1,7,6,648,645,938,937,UA,482,...,17.0,0,,0,,,,,,?
8,1988,1,9,6,700,700,750,755,AS,60,...,,0,,0,,,,,,?
9,1995,1,14,6,644,645,938,937,UA,482,...,15.0,0,,0,,,,,,?


In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               10 non-null     int64  
 1   Month              10 non-null     int64  
 2   DayofMonth         10 non-null     int64  
 3   DayOfWeek          10 non-null     int64  
 4   DepTime            10 non-null     int64  
 5   CRSDepTime         10 non-null     int64  
 6   ArrTime            10 non-null     int64  
 7   CRSArrTime         10 non-null     int64  
 8   UniqueCarrier      10 non-null     object 
 9   FlightNum          10 non-null     int64  
 10  TailNum            6 non-null      object 
 11  ActualElapsedTime  10 non-null     int64  
 12  CRSElapsedTime     10 non-null     int64  
 13  AirTime            6 non-null      float64
 14  ArrDelay           0 non-null      float64
 15  DepDelay           0 non-null      float64
 16  Origin             10 non-nul

In [47]:
data.columns = [
    'year', 'month', 'dayofmonth', 'dayofweek', 'deptime', 'crsdeptime', 'arrtime', 'crsarrtime', 
    'uniquecarrier', 'flightnum', 'tailnum', 'actualelapsedtime', 'crselapsedtime', 'airtime', 
    'arrdelay', 'depdelay', 'origin', 'dest', 'distance', 'taxiin', 'taxiout', 'cancelled', 
    'cancellationcode', 'diverted', 'carrierdelay', 'weatherdelay', 'nasdelay', 'securitydelay', 
    'lateaircraftdelay', 'delayed'
]

In [48]:
#Droping columns with a large number of missing values
data.drop(columns=['carrierdelay', 'weatherdelay', 'nasdelay', 'securitydelay', 'lateaircraftdelay','cancellationcode', 'arrdelay', 'depdelay'], inplace=True)

In [49]:
# Separate numerical and categorical columns
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Impute missing values for numerical columns with the median
numerical_imputer = SimpleImputer(strategy='median')
data[numerical_cols] = numerical_imputer.fit_transform(data[numerical_cols])

# Impute missing values for categorical columns with the most frequent category
categorical_imputer = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = categorical_imputer.fit_transform(data[categorical_cols])

# Check if there are any remaining missing values
print(data.isnull().sum())

year                 0
month                0
dayofmonth           0
dayofweek            0
deptime              0
crsdeptime           0
arrtime              0
crsarrtime           0
uniquecarrier        0
flightnum            0
tailnum              0
actualelapsedtime    0
crselapsedtime       0
airtime              0
origin               0
dest                 0
distance             0
taxiin               0
taxiout              0
cancelled            0
diverted             0
delayed              0
dtype: int64


In [50]:
df_encoded = data.copy()

label_encoder = LabelEncoder()
for column in df_encoded.columns:
    if df_encoded[column].dtype == 'object':  #To Check if the column is categorical
        df_encoded[column] = label_encoder.fit_transform(df_encoded[column])

In [51]:
df_encoded.dtypes

year                 float64
month                float64
dayofmonth           float64
dayofweek            float64
deptime              float64
crsdeptime           float64
arrtime              float64
crsarrtime           float64
uniquecarrier          int32
flightnum            float64
tailnum                int32
actualelapsedtime    float64
crselapsedtime       float64
airtime              float64
origin                 int32
dest                   int32
distance             float64
taxiin               float64
taxiout              float64
cancelled            float64
diverted             float64
delayed                int32
dtype: object

In [52]:
df_encoded

Unnamed: 0,year,month,dayofmonth,dayofweek,deptime,crsdeptime,arrtime,crsarrtime,uniquecarrier,flightnum,...,crselapsedtime,airtime,origin,dest,distance,taxiin,taxiout,cancelled,diverted,delayed
0,1995.0,1.0,20.0,5.0,644.0,645.0,957.0,937.0,1,482.0,...,112.0,110.0,3,3,678.0,6.0,17.0,0.0,0.0,0
1,1988.0,1.0,22.0,5.0,700.0,700.0,747.0,755.0,0,60.0,...,55.0,85.5,2,2,0.0,5.0,15.0,0.0,0.0,0
2,2006.0,1.0,27.0,5.0,803.0,815.0,924.0,1000.0,2,103.0,...,105.0,67.0,0,1,481.0,5.0,9.0,0.0,0.0,0
3,1995.0,1.0,27.0,5.0,645.0,645.0,929.0,937.0,1,482.0,...,112.0,83.0,3,3,678.0,6.0,15.0,0.0,0.0,0
4,1988.0,1.0,29.0,5.0,700.0,700.0,750.0,755.0,0,60.0,...,55.0,85.5,2,2,0.0,5.0,15.0,0.0,0.0,0
5,1988.0,1.0,2.0,6.0,705.0,700.0,759.0,755.0,0,60.0,...,55.0,85.5,2,2,0.0,5.0,15.0,0.0,0.0,0
6,2006.0,1.0,7.0,6.0,1450.0,1455.0,1640.0,1650.0,3,2742.0,...,55.0,37.0,1,0,277.0,3.0,10.0,0.0,0.0,0
7,1995.0,1.0,7.0,6.0,648.0,645.0,938.0,937.0,1,482.0,...,112.0,88.0,3,3,678.0,5.0,17.0,0.0,0.0,0
8,1988.0,1.0,9.0,6.0,700.0,700.0,750.0,755.0,0,60.0,...,55.0,85.5,2,2,0.0,5.0,15.0,0.0,0.0,0
9,1995.0,1.0,14.0,6.0,644.0,645.0,938.0,937.0,1,482.0,...,112.0,94.0,3,3,678.0,5.0,15.0,0.0,0.0,0


In [115]:
#numerical_cols = ['deptime', 'arrtime', 'actualelapsedtime', 'crselapsedtime', 'airtime', 'arrdelay', 'depdelay']
#for col in numerical_cols:
    #df_encoded[col].fillna(df_encoded[col].median(), inplace=True)

In [53]:
print(df_encoded.isnull().sum())

year                 0
month                0
dayofmonth           0
dayofweek            0
deptime              0
crsdeptime           0
arrtime              0
crsarrtime           0
uniquecarrier        0
flightnum            0
tailnum              0
actualelapsedtime    0
crselapsedtime       0
airtime              0
origin               0
dest                 0
distance             0
taxiin               0
taxiout              0
cancelled            0
diverted             0
delayed              0
dtype: int64


In [54]:
#Spliting the dataset into Training and Testing sets
X = df_encoded.drop(columns=['delayed'])
y = df_encoded['delayed']

In [55]:
# Save the second trained model to a file
joblib.dump(model, 'second_xgboost_model.pkl')

print("Second model saved successfully.")

loaded_model = joblib.load('second_xgboost_model.pkl')
predictions = loaded_model.predict(X)
# Create a DataFrame with original features and predictions
predictions_df_1 = pd.DataFrame(X, columns=X.columns)  
predictions_df_1['predicted_delayed'] = predictions
predictions_df_1

Second model saved successfully.


Unnamed: 0,year,month,dayofmonth,dayofweek,deptime,crsdeptime,arrtime,crsarrtime,uniquecarrier,flightnum,...,crselapsedtime,airtime,origin,dest,distance,taxiin,taxiout,cancelled,diverted,predicted_delayed
0,1995.0,1.0,20.0,5.0,644.0,645.0,957.0,937.0,1,482.0,...,112.0,110.0,3,3,678.0,6.0,17.0,0.0,0.0,1
1,1988.0,1.0,22.0,5.0,700.0,700.0,747.0,755.0,0,60.0,...,55.0,85.5,2,2,0.0,5.0,15.0,0.0,0.0,0
2,2006.0,1.0,27.0,5.0,803.0,815.0,924.0,1000.0,2,103.0,...,105.0,67.0,0,1,481.0,5.0,9.0,0.0,0.0,0
3,1995.0,1.0,27.0,5.0,645.0,645.0,929.0,937.0,1,482.0,...,112.0,83.0,3,3,678.0,6.0,15.0,0.0,0.0,1
4,1988.0,1.0,29.0,5.0,700.0,700.0,750.0,755.0,0,60.0,...,55.0,85.5,2,2,0.0,5.0,15.0,0.0,0.0,0
5,1988.0,1.0,2.0,6.0,705.0,700.0,759.0,755.0,0,60.0,...,55.0,85.5,2,2,0.0,5.0,15.0,0.0,0.0,1
6,2006.0,1.0,7.0,6.0,1450.0,1455.0,1640.0,1650.0,3,2742.0,...,55.0,37.0,1,0,277.0,3.0,10.0,0.0,0.0,0
7,1995.0,1.0,7.0,6.0,648.0,645.0,938.0,937.0,1,482.0,...,112.0,88.0,3,3,678.0,5.0,17.0,0.0,0.0,1
8,1988.0,1.0,9.0,6.0,700.0,700.0,750.0,755.0,0,60.0,...,55.0,85.5,2,2,0.0,5.0,15.0,0.0,0.0,0
9,1995.0,1.0,14.0,6.0,644.0,645.0,938.0,937.0,1,482.0,...,112.0,94.0,3,3,678.0,5.0,15.0,0.0,0.0,1


In [58]:
predictions_df_1['predicted_delayed'] = predictions_df_1['predicted_delayed'].map({1: 'Y', 0: 'N'})

path = 'C:\\Users\\shiva\\Downloads\\'
csv_file = path + 'Predicted_flightsdelay.csv'
predictions_df_1.to_csv(csv_file, index=False)

In [59]:
predictions_df_1

Unnamed: 0,year,month,dayofmonth,dayofweek,deptime,crsdeptime,arrtime,crsarrtime,uniquecarrier,flightnum,...,crselapsedtime,airtime,origin,dest,distance,taxiin,taxiout,cancelled,diverted,predicted_delayed
0,1995.0,1.0,20.0,5.0,644.0,645.0,957.0,937.0,1,482.0,...,112.0,110.0,3,3,678.0,6.0,17.0,0.0,0.0,Y
1,1988.0,1.0,22.0,5.0,700.0,700.0,747.0,755.0,0,60.0,...,55.0,85.5,2,2,0.0,5.0,15.0,0.0,0.0,N
2,2006.0,1.0,27.0,5.0,803.0,815.0,924.0,1000.0,2,103.0,...,105.0,67.0,0,1,481.0,5.0,9.0,0.0,0.0,N
3,1995.0,1.0,27.0,5.0,645.0,645.0,929.0,937.0,1,482.0,...,112.0,83.0,3,3,678.0,6.0,15.0,0.0,0.0,Y
4,1988.0,1.0,29.0,5.0,700.0,700.0,750.0,755.0,0,60.0,...,55.0,85.5,2,2,0.0,5.0,15.0,0.0,0.0,N
5,1988.0,1.0,2.0,6.0,705.0,700.0,759.0,755.0,0,60.0,...,55.0,85.5,2,2,0.0,5.0,15.0,0.0,0.0,Y
6,2006.0,1.0,7.0,6.0,1450.0,1455.0,1640.0,1650.0,3,2742.0,...,55.0,37.0,1,0,277.0,3.0,10.0,0.0,0.0,N
7,1995.0,1.0,7.0,6.0,648.0,645.0,938.0,937.0,1,482.0,...,112.0,88.0,3,3,678.0,5.0,17.0,0.0,0.0,Y
8,1988.0,1.0,9.0,6.0,700.0,700.0,750.0,755.0,0,60.0,...,55.0,85.5,2,2,0.0,5.0,15.0,0.0,0.0,N
9,1995.0,1.0,14.0,6.0,644.0,645.0,938.0,937.0,1,482.0,...,112.0,94.0,3,3,678.0,5.0,15.0,0.0,0.0,Y


In [64]:
predictions_df = predictions_df_1.copy()
predictions_df['depdelay'] = predictions_df['deptime'] - predictions_df['crsdeptime']
predictions_df['arrdelay'] = predictions_df['arrtime'] - predictions_df['crsarrtime']
predictions_df

Unnamed: 0,year,month,dayofmonth,dayofweek,deptime,crsdeptime,arrtime,crsarrtime,uniquecarrier,flightnum,...,origin,dest,distance,taxiin,taxiout,cancelled,diverted,predicted_delayed,depdelay,arrdelay
0,1995.0,1.0,20.0,5.0,644.0,645.0,957.0,937.0,1,482.0,...,3,3,678.0,6.0,17.0,0.0,0.0,Y,-1.0,20.0
1,1988.0,1.0,22.0,5.0,700.0,700.0,747.0,755.0,0,60.0,...,2,2,0.0,5.0,15.0,0.0,0.0,N,0.0,-8.0
2,2006.0,1.0,27.0,5.0,803.0,815.0,924.0,1000.0,2,103.0,...,0,1,481.0,5.0,9.0,0.0,0.0,N,-12.0,-76.0
3,1995.0,1.0,27.0,5.0,645.0,645.0,929.0,937.0,1,482.0,...,3,3,678.0,6.0,15.0,0.0,0.0,Y,0.0,-8.0
4,1988.0,1.0,29.0,5.0,700.0,700.0,750.0,755.0,0,60.0,...,2,2,0.0,5.0,15.0,0.0,0.0,N,0.0,-5.0
5,1988.0,1.0,2.0,6.0,705.0,700.0,759.0,755.0,0,60.0,...,2,2,0.0,5.0,15.0,0.0,0.0,Y,5.0,4.0
6,2006.0,1.0,7.0,6.0,1450.0,1455.0,1640.0,1650.0,3,2742.0,...,1,0,277.0,3.0,10.0,0.0,0.0,N,-5.0,-10.0
7,1995.0,1.0,7.0,6.0,648.0,645.0,938.0,937.0,1,482.0,...,3,3,678.0,5.0,17.0,0.0,0.0,Y,3.0,1.0
8,1988.0,1.0,9.0,6.0,700.0,700.0,750.0,755.0,0,60.0,...,2,2,0.0,5.0,15.0,0.0,0.0,N,0.0,-5.0
9,1995.0,1.0,14.0,6.0,644.0,645.0,938.0,937.0,1,482.0,...,3,3,678.0,5.0,15.0,0.0,0.0,Y,-1.0,1.0


In [67]:
# Select arrdelay, depdelay, and delayed columns
selected_columns = predictions_df.loc[:, ['arrdelay', 'depdelay', 'predicted_delayed']]

# Display the selected columns
selected_columns

Unnamed: 0,arrdelay,depdelay,predicted_delayed
0,20.0,-1.0,Y
1,-8.0,0.0,N
2,-76.0,-12.0,N
3,-8.0,0.0,Y
4,-5.0,0.0,N
5,4.0,5.0,Y
6,-10.0,-5.0,N
7,1.0,3.0,Y
8,-5.0,0.0,N
9,1.0,-1.0,Y
