# Importing Necessary Libraries

In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sb
import matplotlib.pyplot as plt
from matplotlib import rcParams
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats

import warnings
warnings.filterwarnings('ignore')

sb.set()
sb.set_style("white")
%matplotlib inline
rcParams['figure.figsize'] = [9,6]

# Loading the dataset

In [20]:
# Read the airline delay CSV into the working DataFrame.
# NOTE(review): this is an absolute Kaggle input path, so the cell only runs
# where that dataset is mounted at the same location.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/airline-delay/Airline_Delay_Cause.csv'
)

In [21]:
# Preview the first five rows (the default row count, spelled out).
data.head(n=5)

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,...,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2023,8,9E,Endeavor Air Inc.,ABE,"Allentown/Bethlehem/Easton, PA: Lehigh Valley ...",89.0,13.0,2.25,1.6,...,0.0,5.99,2.0,1.0,1375.0,71.0,761.0,118.0,0.0,425.0
1,2023,8,9E,Endeavor Air Inc.,ABY,"Albany, GA: Southwest Georgia Regional",62.0,10.0,1.97,0.04,...,0.0,7.42,0.0,1.0,799.0,218.0,1.0,62.0,0.0,518.0
2,2023,8,9E,Endeavor Air Inc.,AEX,"Alexandria, LA: Alexandria International",62.0,10.0,2.73,1.18,...,0.0,4.28,1.0,0.0,766.0,56.0,188.0,78.0,0.0,444.0
3,2023,8,9E,Endeavor Air Inc.,AGS,"Augusta, GA: Augusta Regional at Bush Field",66.0,12.0,3.69,2.27,...,0.0,1.57,1.0,1.0,1397.0,471.0,320.0,388.0,0.0,218.0
4,2023,8,9E,Endeavor Air Inc.,ALB,"Albany, NY: Albany International",92.0,22.0,7.76,0.0,...,0.0,11.28,2.0,0.0,1530.0,628.0,0.0,134.0,0.0,768.0


In [22]:
# Show the column labels of the loaded frame.
data.columns

Index(['year', 'month', 'carrier', 'carrier_name', 'airport', 'airport_name',
       'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
       'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
       'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay'],
      dtype='object')

In [23]:
# Print each column's dtype and non-null count; gaps against the total row
# count are the first sign of missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171666 entries, 0 to 171665
Data columns (total 21 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   year                 171666 non-null  int64  
 1   month                171666 non-null  int64  
 2   carrier              171666 non-null  object 
 3   carrier_name         171666 non-null  object 
 4   airport              171666 non-null  object 
 5   airport_name         171666 non-null  object 
 6   arr_flights          171426 non-null  float64
 7   arr_del15            171223 non-null  float64
 8   carrier_ct           171426 non-null  float64
 9   weather_ct           171426 non-null  float64
 10  nas_ct               171426 non-null  float64
 11  security_ct          171426 non-null  float64
 12  late_aircraft_ct     171426 non-null  float64
 13  arr_cancelled        171426 non-null  float64
 14  arr_diverted         171426 non-null  float64
 15  arr_delay        

In [37]:
# Descriptive statistics of the numeric columns, one row per column.
# NOTE(review): this cell's execution count (37) is later than the imputation
# cell's (26), and the saved output reports a full count for columns that
# data.info() listed with missing values -- the saved numbers appear to
# describe the imputed data. Re-run the notebook top to bottom to confirm.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,171666.0,2018.551361,2.890006,2013.0,2016.0,2019.0,2021.0,2023.0
month,171666.0,6.493633,3.440908,1.0,4.0,7.0,9.0,12.0
arr_flights,171666.0,362.161436,992.248838,1.0,50.0,100.0,249.0,21977.0
arr_del15,171666.0,66.306817,179.326421,0.0,6.0,17.0,47.0,4176.0
carrier_ct,171666.0,20.776488,50.282869,0.0,2.16,6.4,17.22,1293.91
weather_ct,171666.0,2.24776,7.309464,0.0,0.0,0.4,1.86,266.42
nas_ct,171666.0,19.359517,61.634827,0.0,1.0,3.91,11.68,1884.42
security_ct,171666.0,0.156877,0.716927,0.0,0.0,0.0,0.0,58.69
late_aircraft_ct,171666.0,23.744312,72.346254,0.0,1.24,5.0,15.23,2069.07
arr_cancelled,171666.0,7.521367,43.625035,0.0,0.0,1.0,4.0,4951.0


In [24]:
# Read the (rows, columns) tuple once and report it in both formats.
frame_shape = data.shape
print(frame_shape, '\n')
print("Shape of Airline_Delay_Cause.csv: ", frame_shape)

(171666, 21) 

Shape of Airline_Delay_Cause.csv:  (171666, 21)


# Data Cleaning
The cleaning stage has three steps:
- a. Check for Missing Values
- b. Handle Missing Values
- c. Remove Duplicates

#### a. Check for Missing Values

In [25]:
# Count missing entries per column.
missing_values = data.isna().sum()

# Show only the columns that have gaps, largest count first.
display(
    missing_values.loc[missing_values > 0]
    .sort_values(ascending=False)
    .to_frame(name='Missing Values')
)

Unnamed: 0,Missing Values
arr_del15,443
arr_flights,240
carrier_ct,240
weather_ct,240
nas_ct,240
security_ct,240
late_aircraft_ct,240
arr_cancelled,240
arr_diverted,240
arr_delay,240


#### b. Handle Missing Values

In [26]:
# Numeric columns: fill each gap with that column's median.
# NOTE(review): most affected columns share the same missing count, which
# suggests whole rows without any metrics -- confirm that imputing them is
# preferable to dropping them.
num_cols = data.select_dtypes(include=np.number).columns
numeric_medians = data[num_cols].median()
data[num_cols] = data[num_cols].fillna(numeric_medians)

# Text columns: fill each gap with that column's most frequent value
# (the first row of mode() holds one modal value per column).
cat_cols = data.select_dtypes(include='object').columns
categorical_modes = data[cat_cols].mode().iloc[0]
data[cat_cols] = data[cat_cols].fillna(categorical_modes)

In [29]:
# Re-count missing entries per column to confirm the imputation left none.
data.isna().sum()

year                   0
month                  0
carrier                0
carrier_name           0
airport                0
airport_name           0
arr_flights            0
arr_del15              0
carrier_ct             0
weather_ct             0
nas_ct                 0
security_ct            0
late_aircraft_ct       0
arr_cancelled          0
arr_diverted           0
arr_delay              0
carrier_delay          0
weather_delay          0
nas_delay              0
security_delay         0
late_aircraft_delay    0
dtype: int64

#### c. Remove Duplicates

In [30]:
# Drop fully identical rows, keeping the first occurrence of each.
# The surviving rows keep their original index labels (no renumbering).
data = data.drop_duplicates(keep='first', ignore_index=False)