In [1]:
import pandas as pd
import numpy
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# Notebook-wide pandas display settings: show every column and render floats with two decimals
pd.options.display.max_columns = None
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Load the input CSV; the path is relative to the notebook's working directory
df = pd.read_csv('../../data/processed/df_reg.csv')

In [4]:
# Work on an independent copy so the frame loaded into `df` is not modified by the cleaning steps
df_cleaned = df.copy()

# Preprocess the data like Dris

In [5]:
# Columns retained for preprocessing; 'totalFare' is the target, the rest are candidate predictors
relevant_features = [
    'startingAirport',          # Categorical
    'destinationAirport',       # Categorical
    'travelDuration',           # Duration as time string, needs conversion
    'totalTravelDistance',      # Numerical
    'isNonStop',                # Binary
    'isBasicEconomy',           # Binary
    'segmentsDepartureTimeRaw', # Timestamp string(s), '||'-separated when there are several segments
    'segmentsCabinCode',        # Categorical for cabin class
    'segmentsAirlineName',      # Categorical
    'searchDate',               # Date, needs feature extraction
    'flightDate',               # Date, needs feature extraction
    'isRefundable',             # Binary
    'totalFare'                 # Target variable
]

In [6]:
# Keep only the columns listed in relevant_features. The explicit .copy() makes the
# result an independent frame, so the column assignments performed in later cells
# cannot raise pandas' SettingWithCopyWarning (which may fire when new columns are
# added to the result of a column-subset selection).
df_cleaned = df_cleaned[relevant_features].copy()
df_cleaned

Unnamed: 0,startingAirport,destinationAirport,travelDuration,totalTravelDistance,isNonStop,isBasicEconomy,segmentsDepartureTimeRaw,segmentsCabinCode,segmentsAirlineName,searchDate,flightDate,isRefundable,totalFare
0,ATL,EWR,PT2H5M,762.00,True,False,2022-06-09T09:00:00.000-04:00,coach,Delta,2022-04-21,2022-06-09,False,208.60
1,ATL,LAX,PT4H46M,,True,False,2022-05-22T17:41:00.000-04:00,coach,Spirit Airlines,2022-05-17,2022-05-22,False,448.59
2,ATL,ORD,PT8H5M,,False,False,2022-06-12T10:40:00.000-04:00||2022-06-12T15:3...,coach||coach,Spirit Airlines||Spirit Airlines,2022-05-02,2022-06-12,False,211.58
3,ATL,DEN,PT3H18M,1207.00,True,False,2022-06-21T20:50:00.000-04:00,coach,Delta,2022-05-12,2022-06-21,False,296.61
4,ATL,DEN,PT7H4M,2312.00,False,False,2022-05-17T09:15:00.000-04:00||2022-05-17T12:0...,coach||coach,American Airlines||American Airlines,2022-04-23,2022-05-17,False,257.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
675994,SFO,LGA,PT9H4M,2897.00,False,False,2022-06-20T22:40:00.000-07:00||2022-06-21T08:2...,coach||coach,Delta||Delta,2022-05-06,2022-06-20,False,747.60
675995,SFO,BOS,PT5H48M,2698.00,True,False,2022-04-30T14:56:00.000-07:00,coach,JetBlue Airways,2022-04-22,2022-04-30,False,484.60
675996,SFO,LAX,PT1H35M,,True,False,2022-06-10T10:51:00.000-07:00,coach,Delta,2022-05-13,2022-06-10,False,268.60
675997,SFO,DEN,PT2H33M,954.00,True,False,2022-05-27T20:14:00.000-07:00,coach,United,2022-04-22,2022-05-27,False,205.60


In [7]:
# Inspect dtypes and non-null counts of the selected columns
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 675999 entries, 0 to 675998
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   startingAirport           675999 non-null  object 
 1   destinationAirport        675999 non-null  object 
 2   travelDuration            675999 non-null  object 
 3   totalTravelDistance       627982 non-null  float64
 4   isNonStop                 675999 non-null  bool   
 5   isBasicEconomy            675999 non-null  bool   
 6   segmentsDepartureTimeRaw  675999 non-null  object 
 7   segmentsCabinCode         675999 non-null  object 
 8   segmentsAirlineName       675999 non-null  object 
 9   searchDate                675999 non-null  object 
 10  flightDate                675999 non-null  object 
 11  isRefundable              675999 non-null  bool   
 12  totalFare                 675999 non-null  float64
dtypes: bool(3), float64(2), object(8)
memory usa

In [8]:
# Function to convert duration from format 'PT#H#M' to total hours in float
def convert_duration_to_hours(duration):
    """Convert an ISO-8601 style duration string into total hours.

    Supports the plain 'PT#H#M' form as well as durations that carry a day
    component ('P#DT#H#M') and/or whole seconds ('...#S'). Any component that
    is absent counts as zero.

    Parameters
    ----------
    duration : str
        Duration string such as 'PT2H5M' or 'P1DT3H'.

    Returns
    -------
    float
        Total duration in hours. NaN is returned when `duration` is not a
        string (e.g. a missing value), so the function can be used with
        ``Series.apply`` on columns that contain nulls.
    """
    # Missing values arrive as float NaN / None; propagate them as NaN instead of crashing
    if not isinstance(duration, str):
        return float('nan')

    def _component(unit):
        # Integer that directly precedes the given unit letter, or 0 if the unit is absent
        found = re.search(r'(\d+)' + unit, duration)
        return int(found.group(1)) if found else 0

    days = _component('D')
    hours = _component('H')
    minutes = _component('M')
    seconds = _component('S')

    # Fold every component into hours; days were previously ignored, which
    # under-reported itineraries longer than 24 hours
    total_hours = days * 24 + hours + minutes / 60.0 + seconds / 3600.0
    return total_hours

# Derive a float-hours column from the raw duration strings
duration_hours = df_cleaned['travelDuration'].map(convert_duration_to_hours)
df_cleaned['travelDurationHours'] = duration_hours

# Peek at the first rows as a sanity check
df_cleaned[['travelDurationHours']].head()

Unnamed: 0,travelDurationHours
0,2.08
1,4.77
2,8.08
3,3.3
4,7.07


In [9]:
# The raw duration string is superseded by travelDurationHours, so remove it
df_cleaned = df_cleaned.drop('travelDuration', axis=1)
df_cleaned.head()

Unnamed: 0,startingAirport,destinationAirport,totalTravelDistance,isNonStop,isBasicEconomy,segmentsDepartureTimeRaw,segmentsCabinCode,segmentsAirlineName,searchDate,flightDate,isRefundable,totalFare,travelDurationHours
0,ATL,EWR,762.0,True,False,2022-06-09T09:00:00.000-04:00,coach,Delta,2022-04-21,2022-06-09,False,208.6,2.08
1,ATL,LAX,,True,False,2022-05-22T17:41:00.000-04:00,coach,Spirit Airlines,2022-05-17,2022-05-22,False,448.59,4.77
2,ATL,ORD,,False,False,2022-06-12T10:40:00.000-04:00||2022-06-12T15:3...,coach||coach,Spirit Airlines||Spirit Airlines,2022-05-02,2022-06-12,False,211.58,8.08
3,ATL,DEN,1207.0,True,False,2022-06-21T20:50:00.000-04:00,coach,Delta,2022-05-12,2022-06-21,False,296.61,3.3
4,ATL,DEN,2312.0,False,False,2022-05-17T09:15:00.000-04:00||2022-05-17T12:0...,coach||coach,American Airlines||American Airlines,2022-04-23,2022-05-17,False,257.6,7.07


In [10]:
# Function to count the number of segments in the 'segmentsCabinCode' column
def count_segments(cabin_code):
    """Return how many '||'-separated entries the given string contains."""
    # Splitting on the delimiter yields exactly one piece per segment
    segment_entries = cabin_code.split('||')
    return len(segment_entries)

# Add the per-row segment count as a new column
segment_counts = df_cleaned['segmentsCabinCode'].map(count_segments)
df_cleaned['numSegments'] = segment_counts

# Quick visual check of the new column next to its source
df_cleaned[['segmentsCabinCode', 'numSegments']].head()

Unnamed: 0,segmentsCabinCode,numSegments
0,coach,1
1,coach,1
2,coach||coach,2
3,coach,1
4,coach||coach,2


In [11]:
# Function to create a binary feature indicating if there are mixed cabin classes across segments
def has_mixed_cabin_classes(cabin_code):
    """Return 1 if the '||'-separated cabin codes are not all identical, else 0."""
    distinct_cabins = set(cabin_code.split('||'))
    # More than one distinct value means at least two segments differ
    if len(distinct_cabins) > 1:
        return 1
    return 0

# Flag rows whose segments do not all share the same cabin code
mixed_cabin_flags = df_cleaned['segmentsCabinCode'].map(has_mixed_cabin_classes)
df_cleaned['mixedCabinClasses'] = mixed_cabin_flags

# Function to create a binary feature indicating if multiple airlines are involved in the flight
def has_multiple_airlines(airline_name):
    """Return 1 if the '||'-separated airline names contain more than one distinct name, else 0."""
    carriers = {name for name in airline_name.split('||')}
    # bool -> int gives the 0/1 indicator
    return int(len(carriers) > 1)

# Flag rows whose segments involve more than one airline
multi_airline_flags = df_cleaned['segmentsAirlineName'].map(has_multiple_airlines)
df_cleaned['multipleAirlines'] = multi_airline_flags

# Show both indicator columns beside their source columns for the first 10 rows
df_cleaned[['segmentsCabinCode', 'mixedCabinClasses', 'segmentsAirlineName', 'multipleAirlines']].head(10)

Unnamed: 0,segmentsCabinCode,mixedCabinClasses,segmentsAirlineName,multipleAirlines
0,coach,0,Delta,0
1,coach,0,Spirit Airlines,0
2,coach||coach,0,Spirit Airlines||Spirit Airlines,0
3,coach,0,Delta,0
4,coach||coach,0,American Airlines||American Airlines,0
5,coach||coach,0,United||United,0
6,coach||coach,0,American Airlines||American Airlines,0
7,coach||coach,0,Spirit Airlines||Spirit Airlines,0
8,coach||coach,0,American Airlines||American Airlines,0
9,coach||coach,0,United||United,0


In [12]:
# Function to categorize cabin codes based on similarity or mixed types
def categorize_cabin_code(cabin_code):
    """Collapse a '||'-separated cabin string into one label.

    Returns the shared cabin code when every segment has the same one,
    otherwise the literal 'mixed'.
    """
    first_cabin, *other_cabins = cabin_code.split('||')
    # Any segment that differs from the first makes the row 'mixed'
    if any(cabin != first_cabin for cabin in other_cabins):
        return 'mixed'
    return first_cabin

# Build the single-label cabin category from the per-segment cabin codes
cabin_labels = df_cleaned['segmentsCabinCode'].map(categorize_cabin_code)
df_cleaned['cabinType'] = cabin_labels

# Compare source and derived columns for the first 10 rows
df_cleaned[['segmentsCabinCode', 'cabinType']].head(10)


Unnamed: 0,segmentsCabinCode,cabinType
0,coach,coach
1,coach,coach
2,coach||coach,coach
3,coach,coach
4,coach||coach,coach
5,coach||coach,coach
6,coach||coach,coach
7,coach||coach,coach
8,coach||coach,coach
9,coach||coach,coach


In [13]:
# The raw '||'-joined segment strings have been summarised into derived features; remove them
raw_segment_columns = ['segmentsCabinCode', 'segmentsAirlineName']
df_cleaned = df_cleaned.drop(columns=raw_segment_columns)

In [14]:
# Preview the frame with the derived segment features in place
df_cleaned.head()

Unnamed: 0,startingAirport,destinationAirport,totalTravelDistance,isNonStop,isBasicEconomy,segmentsDepartureTimeRaw,searchDate,flightDate,isRefundable,totalFare,travelDurationHours,numSegments,mixedCabinClasses,multipleAirlines,cabinType
0,ATL,EWR,762.0,True,False,2022-06-09T09:00:00.000-04:00,2022-04-21,2022-06-09,False,208.6,2.08,1,0,0,coach
1,ATL,LAX,,True,False,2022-05-22T17:41:00.000-04:00,2022-05-17,2022-05-22,False,448.59,4.77,1,0,0,coach
2,ATL,ORD,,False,False,2022-06-12T10:40:00.000-04:00||2022-06-12T15:3...,2022-05-02,2022-06-12,False,211.58,8.08,2,0,0,coach
3,ATL,DEN,1207.0,True,False,2022-06-21T20:50:00.000-04:00,2022-05-12,2022-06-21,False,296.61,3.3,1,0,0,coach
4,ATL,DEN,2312.0,False,False,2022-05-17T09:15:00.000-04:00||2022-05-17T12:0...,2022-04-23,2022-05-17,False,257.6,7.07,2,0,0,coach


In [15]:
# Parse both date columns into datetime64 values
for date_col in ('searchDate', 'flightDate'):
    df_cleaned[date_col] = pd.to_datetime(df_cleaned[date_col])

# Lead time between the search and the flight, in whole days
lead_time = df_cleaned['flightDate'] - df_cleaned['searchDate']
df_cleaned['daysUntilDeparture'] = lead_time.dt.days

# Print the inputs next to the derived column for a quick check
print(df_cleaned[['searchDate', 'flightDate', 'daysUntilDeparture']].head(10))

  searchDate flightDate  daysUntilDeparture
0 2022-04-21 2022-06-09                  49
1 2022-05-17 2022-05-22                   5
2 2022-05-02 2022-06-12                  41
3 2022-05-12 2022-06-21                  40
4 2022-04-23 2022-05-17                  24
5 2022-04-17 2022-05-04                  17
6 2022-04-29 2022-05-02                   3
7 2022-05-08 2022-05-19                  11
8 2022-04-23 2022-06-16                  54
9 2022-05-04 2022-05-06                   2


In [16]:
# Both dates are now summarised by daysUntilDeparture, so drop them
date_columns = ['searchDate', 'flightDate']
df_cleaned = df_cleaned.drop(columns=date_columns)

In [17]:
# Number of null entries in each column
missing_data_summary = df_cleaned.isna().sum()

# Report only the columns that actually contain nulls
has_missing = missing_data_summary > 0
print(missing_data_summary[has_missing])

totalTravelDistance    48017
dtype: int64


In [18]:
# Discard every row that has no value for 'totalTravelDistance'
df_cleaned = df_cleaned.dropna(subset=['totalTravelDistance'])

# Re-count the nulls in that column to confirm the removal worked
remaining_missing = df_cleaned['totalTravelDistance'].isna().sum()
print("Missing values in 'totalTravelDistance' after removal:", remaining_missing)

Missing values in 'totalTravelDistance' after removal: 0


In [19]:
# Function to extract the hour and classify it into a time block
def get_time_block(departure_time_raw):
    """Classify the first departure time of an itinerary into a time-of-day block.

    Parameters
    ----------
    departure_time_raw : str
        One or more timestamps separated by '||'
        (e.g. '2022-06-09T09:00:00.000-04:00'); only the first one is used.

    Returns
    -------
    str
        'morning' (05-11h), 'afternoon' (12-16h), 'evening' (17-20h),
        'night' (every other hour), or 'unknown' when the input is not a
        string (missing value) or no 'THH:' hour can be found in it.
    """
    # Missing values (NaN / None) have no .split(); report them as 'unknown'
    # rather than raising, as the fallback label is meant to cover missing data
    if not isinstance(departure_time_raw, str):
        return 'unknown'

    # Only the first segment's departure defines the block
    first_departure_time = departure_time_raw.split('||')[0]

    # The hour is the two digits between the 'T' separator and the first ':'
    match = re.search(r'T(\d{2}):', first_departure_time)
    if match is None:
        return 'unknown'  # Unrecognised format

    hour = int(match.group(1))
    if 5 <= hour < 12:
        return 'morning'
    if 12 <= hour < 17:
        return 'afternoon'
    if 17 <= hour < 21:
        return 'evening'
    return 'night'

# Label every row with the time-of-day block of its first departure
time_blocks = df_cleaned['segmentsDepartureTimeRaw'].map(get_time_block)
df_cleaned['timeBlock'] = time_blocks

# Show the raw timestamps beside the derived label
df_cleaned[['segmentsDepartureTimeRaw', 'timeBlock']].head()

Unnamed: 0,segmentsDepartureTimeRaw,timeBlock
0,2022-06-09T09:00:00.000-04:00,morning
3,2022-06-21T20:50:00.000-04:00,evening
4,2022-05-17T09:15:00.000-04:00||2022-05-17T12:0...,morning
5,2022-05-04T13:50:00.000-04:00||2022-05-04T18:0...,afternoon
6,2022-05-02T17:57:00.000-04:00||2022-05-02T21:1...,evening


In [20]:
# Number of rows falling into each departure time block
df_cleaned['timeBlock'].value_counts()

timeBlock
morning      309263
afternoon    167766
evening       96574
night         54379
Name: count, dtype: int64

In [21]:
# The raw timestamp string is now represented by timeBlock, so remove it
df_cleaned = df_cleaned.drop('segmentsDepartureTimeRaw', axis=1)

In [22]:
# Continue with an independent copy of the cleaned frame for the encoding steps.
# NOTE(review): this rebinds `df`, which until now held the frame loaded from CSV;
# re-running an earlier cell that reads `df` after this point will see the cleaned data instead.
df = df_cleaned.copy()

In [23]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder on the union of origin and destination codes so that a given
# airport receives the same integer in both encoded columns
all_airports = pd.concat([df['startingAirport'], df['destinationAirport']]).unique()
airport_encoder = LabelEncoder().fit(all_airports)

# Encode both airport columns with the shared encoder
for airport_col in ('startingAirport', 'destinationAirport'):
    df[f'{airport_col}_encoded'] = airport_encoder.transform(df[airport_col])

# Print originals next to their codes to check that the encoding is consistent
print(df[['startingAirport', 'startingAirport_encoded', 'destinationAirport', 'destinationAirport_encoded']].head())

  startingAirport  startingAirport_encoded destinationAirport  \
0             ATL                        0                EWR   
3             ATL                        0                DEN   
4             ATL                        0                DEN   
5             ATL                        0                BOS   
6             ATL                        0                CLT   

   destinationAirport_encoded  
0                           6  
3                           3  
4                           3  
5                           1  
6                           2  


In [24]:
# Cast each boolean flag to a 0/1 integer column named '<flag>_encoded'
for flag_col in ('isNonStop', 'isBasicEconomy', 'isRefundable'):
    df[f'{flag_col}_encoded'] = df[flag_col].astype(int)

In [25]:
# Row counts per value of the encoded non-stop flag (1 = True, 0 = False)
df['isNonStop_encoded'].value_counts()

isNonStop_encoded
0    436655
1    191327
Name: count, dtype: int64

In [26]:
# Integer-encode the cabin category; the fitted encoder is kept in cabin_type_encoder
cabin_type_encoder = LabelEncoder().fit(df['cabinType'])
df['cabinType_encoded'] = cabin_type_encoder.transform(df['cabinType'])

# Show labels and codes side by side for the first rows
df[['cabinType', 'cabinType_encoded']].head()

Unnamed: 0,cabinType,cabinType_encoded
0,coach,1
3,coach,1
4,coach,1
5,coach,1
6,coach,1


In [27]:
# Custom ordinal codes for the departure time blocks
time_block_mapping = {'morning': 0, 'afternoon': 1, 'evening': 2, 'night': 3}

# Series.map() turns any label missing from the mapping into NaN without warning
# (get_time_block can emit 'unknown', which has no code here). Fail loudly instead
# of letting NaN slip silently into the encoded column.
unmapped_blocks = set(df['timeBlock'].unique()) - set(time_block_mapping)
if unmapped_blocks:
    raise ValueError(
        f"timeBlock contains values without an encoding: {sorted(map(str, unmapped_blocks))}"
    )

# Apply the mapping to create the encoded 'timeBlock_encoded' column
df['timeBlock_encoded'] = df['timeBlock'].map(time_block_mapping)

# Display the encoded values to verify
print(df[['timeBlock', 'timeBlock_encoded']].head())

   timeBlock  timeBlock_encoded
0    morning                  0
3    evening                  2
4    morning                  0
5  afternoon                  1
6    evening                  2


In [28]:
# Remove the original columns that now have an '*_encoded' counterpart
encoded_source_columns = [
    'startingAirport', 'destinationAirport',
    'isNonStop', 'isBasicEconomy', 'isRefundable',
    'cabinType', 'timeBlock',
]
df_num = df.drop(columns=encoded_source_columns)

In [29]:
# These two indicator features are left out of the exported dataset
df_num = df_num.drop(['mixedCabinClasses', 'multipleAirlines'], axis=1)

In [30]:
# Preview the numeric-only frame
df_num.head()

Unnamed: 0,totalTravelDistance,totalFare,travelDurationHours,numSegments,daysUntilDeparture,startingAirport_encoded,destinationAirport_encoded,isNonStop_encoded,isBasicEconomy_encoded,isRefundable_encoded,cabinType_encoded,timeBlock_encoded
0,762.0,208.6,2.08,1,49,0,6,1,0,0,1,0
3,1207.0,296.61,3.3,1,40,0,3,1,0,0,1,2
4,2312.0,257.6,7.07,2,24,0,3,0,0,0,1,0
5,947.0,127.6,5.58,2,17,0,1,0,0,0,1,1
6,1248.0,417.6,5.42,2,3,0,2,0,0,0,1,2


In [31]:
# Check the dtypes and non-null counts of the columns that will be exported
df_num.info()

<class 'pandas.core.frame.DataFrame'>
Index: 627982 entries, 0 to 675998
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   totalTravelDistance         627982 non-null  float64
 1   totalFare                   627982 non-null  float64
 2   travelDurationHours         627982 non-null  float64
 3   numSegments                 627982 non-null  int64  
 4   daysUntilDeparture          627982 non-null  int64  
 5   startingAirport_encoded     627982 non-null  int64  
 6   destinationAirport_encoded  627982 non-null  int64  
 7   isNonStop_encoded           627982 non-null  int64  
 8   isBasicEconomy_encoded      627982 non-null  int64  
 9   isRefundable_encoded        627982 non-null  int64  
 10  cabinType_encoded           627982 non-null  int64  
 11  timeBlock_encoded           627982 non-null  int64  
dtypes: float64(3), int64(9)
memory usage: 62.3 MB


In [32]:
# Display the final frame (the rendering is truncated to the first and last rows)
df_num

Unnamed: 0,totalTravelDistance,totalFare,travelDurationHours,numSegments,daysUntilDeparture,startingAirport_encoded,destinationAirport_encoded,isNonStop_encoded,isBasicEconomy_encoded,isRefundable_encoded,cabinType_encoded,timeBlock_encoded
0,762.00,208.60,2.08,1,49,0,6,1,0,0,1,0
3,1207.00,296.61,3.30,1,40,0,3,1,0,0,1,2
4,2312.00,257.60,7.07,2,24,0,3,0,0,0,1,0
5,947.00,127.60,5.58,2,17,0,1,0,0,0,1,1
6,1248.00,417.60,5.42,2,3,0,2,0,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
675993,1468.00,458.60,3.57,1,17,15,4,1,0,0,1,0
675994,2897.00,747.60,9.07,2,45,15,10,0,0,0,1,3
675995,2698.00,484.60,5.80,1,8,15,1,1,0,0,1,1
675997,954.00,205.60,2.55,1,35,15,3,1,0,0,1,2


In [33]:
# Destination of the preprocessed dataset, relative to the notebook's working directory
output_path = '../../data/processed/df_prep_reg.csv'

# Write without the index column
df_num.to_csv(path_or_buf=output_path, index=False)
print("Data saved to:", output_path)

Data saved to: ../../data/processed/df_prep_reg.csv
