In [1]:
import pandas as pd
import numpy as np

In [2]:
# low_memory=False makes pandas read the whole file before inferring column
# dtypes instead of inferring them chunk by chunk; per-chunk inference can
# leave a column with mixed types and emit a DtypeWarning.
df = pd.read_csv("airline_2m.csv", encoding="ISO-8859-1", low_memory=False)

  df = pd.read_csv("airline_2m.csv", encoding="ISO-8859-1")


In [3]:
# Inspect the column labels of the freshly loaded frame.
df.columns

Index(['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
       'Reporting_Airline', 'DOT_ID_Reporting_Airline',
       'IATA_CODE_Reporting_Airline', 'Tail_Number',
       ...
       'Div4WheelsOff', 'Div4TailNum', 'Div5Airport', 'Div5AirportID',
       'Div5AirportSeqID', 'Div5WheelsOn', 'Div5TotalGTime',
       'Div5LongestGTime', 'Div5WheelsOff', 'Div5TailNum'],
      dtype='object', length=109)

## Cleaning and preprocessing (based on the data exploration)

dropping columns in which all values are missing

In [4]:
# Keep only the columns that contain at least one non-missing value.
df = df.dropna(axis='columns', how='all')

filling missing delay values with 0

In [5]:
# Delay columns whose missing entries are replaced with 0.
delay_cols = (
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay',
)
for col in delay_cols:
    if col not in df.columns:
        continue  # column is absent from the frame; nothing to fill
    df[col] = df[col].fillna(0)

filling missing categorical values with 'unknown'

In [6]:
# Categorical columns whose missing entries are replaced with "unknown".
unknown_fill_cols = ('CancellationCode',)
for col in unknown_fill_cols:
    if col not in df.columns:
        continue  # column is absent from the frame; nothing to fill
    df[col] = df[col].fillna("unknown")

converting FlightDate to datetime

In [7]:
# Parse the FlightDate column into pandas datetime values.
flight_dates = df['FlightDate']
df['FlightDate'] = pd.to_datetime(flight_dates)

converting categorical columns to category type

In [8]:
# Columns to store with the 'category' dtype; any that are absent from the
# frame are skipped.
categorical_cols = [
    'Reporting_Airline',
    'IATA_CODE_Reporting_Airline',
    'Origin',
    'Dest',
    'CancellationCode',
]
for col in categorical_cols:
    if col not in df.columns:
        continue
    df[col] = df[col].astype('category')

standardizing column values (converting OriginState and DestState to uppercase values for consistency)

In [9]:
# Upper-case the state values of both the origin and the destination column.
for state_col in ('OriginState', 'DestState'):
    df[state_col] = df[state_col].str.upper()

converting time columns from hhmm to "minutes since midnight"

In [10]:
def convert_to_minutes(time):
    """Convert a clock time encoded as an hhmm number to minutes since midnight.

    Parameters
    ----------
    time : int or float
        Clock time encoded as ``hh * 100 + mm`` (e.g. ``930`` for 09:30,
        ``5`` for 00:05). Fractional parts are truncated.

    Returns
    -------
    int or float
        ``hh * 60 + mm`` for a well-formed value. ``numpy.nan`` when the
        input is missing, negative, or not a valid hhmm encoding (minute
        part of 60 or more, or a time past 24:00). ``2400`` is accepted and
        maps to ``1440``.
    """
    if pd.isna(time) or time < 0:
        return np.nan
    hours, minutes = divmod(int(time), 100)
    # Reject malformed encodings instead of silently turning them into a
    # plausible-looking minute count (e.g. 1275 or a five-digit value).
    if minutes >= 60 or hours > 24 or (hours == 24 and minutes > 0):
        return np.nan
    return hours * 60 + minutes

In [11]:
# Time-of-day columns to pass through convert_to_minutes, element by element.
hhmm_cols = (
    'CRSDepTime',
    'DepTime',
    'WheelsOff',
    'WheelsOn',
    'CRSArrTime',
    'ArrTime',
)
for col in hhmm_cols:
    if col not in df.columns:
        continue  # column is absent from the frame; nothing to convert
    df[col] = df[col].apply(convert_to_minutes)

column check after cleaning

In [12]:
# Inspect the column labels again now that the cleaning cells have run.
df.columns

Index(['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
       'Reporting_Airline', 'DOT_ID_Reporting_Airline',
       'IATA_CODE_Reporting_Airline', 'Tail_Number',
       'Flight_Number_Reporting_Airline', 'OriginAirportID',
       'OriginAirportSeqID', 'OriginCityMarketID', 'Origin', 'OriginCityName',
       'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac',
       'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'Dest',
       'DestCityName', 'DestState', 'DestStateFips', 'DestStateName',
       'DestWac', 'CRSDepTime', 'DepTime', 'DepDelay', 'DepDelayMinutes',
       'DepDel15', 'DepartureDelayGroups', 'DepTimeBlk', 'TaxiOut',
       'WheelsOff', 'WheelsOn', 'TaxiIn', 'CRSArrTime', 'ArrTime', 'ArrDelay',
       'ArrDelayMinutes', 'ArrDel15', 'ArrivalDelayGroups', 'ArrTimeBlk',
       'Cancelled', 'CancellationCode', 'Diverted', 'CRSElapsedTime',
       'ActualElapsedTime', 'AirTime', 'Flights', 'Distance', 'DistanceGroup',
       'CarrierD

In [13]:
# Display the cleaned frame (the notebook renders a truncated preview).
df

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,...,Div1WheelsOff,Div1TailNum,Div2Airport,Div2AirportID,Div2AirportSeqID,Div2WheelsOn,Div2TotalGTime,Div2LongestGTime,Div2WheelsOff,Div2TailNum
0,1998,1,1,2,5,1998-01-02,NW,19386,NW,N297US,...,,,,,,,,,,
1,2009,2,5,28,4,2009-05-28,FL,20437,FL,N946AT,...,,,,,,,,,,
2,2013,2,6,29,6,2013-06-29,MQ,20398,MQ,N665MQ,...,,,,,,,,,,
3,2010,3,8,31,2,2010-08-31,DL,19790,DL,N6705Y,...,,,,,,,,,,
4,2006,1,1,15,7,2006-01-15,US,20355,US,N504AU,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,2008,1,3,23,7,2008-03-23,WN,19393,WN,N712SW,...,,,,,,,,,,
1999996,1999,1,1,5,2,1999-01-05,CO,19704,CO,N14308,...,,,,,,,,,,
1999997,2003,4,11,14,5,2003-11-14,US,20355,US,N528AU,...,,,,,,,,,,
1999998,2012,2,5,15,2,2012-05-15,WN,19393,WN,N281WN,...,,,,,,,,,,


saving cleaned data

In [14]:
# Write the cleaned frame to disk without the index column.
output_path = "cleaned_flight_data.csv"
df.to_csv(output_path, index=False)