# Predicting Airline Ticket Prices Using Machine Learning
Author: Chikire Aku-Ibe

In [17]:
import os

import altair as alt
import altair_ally as aly
import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandera.pandas as pa
from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import (
    OutlierSampleDetection,
    IsSingleValue,
    StringMismatch,
    ClassImbalance,
    FeatureLabelCorrelation,
    FeatureFeatureCorrelation
)
from kagglehub import KaggleDatasetAdapter
from scipy.stats import loguniform, randint, uniform
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.svm import SVC
# Switch Altair's data transformer to VegaFusion. `alt` is the altair module
# imported above, so there is no need to reach it through altair_ally.
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

## Loading The Data and Data Cleaning

In [18]:


# Location and file name of the local copy written at the end of this cell
RAW_DATA_DIR = "../data/raw_data"
RAW_DATA_FILE = "Scraped_dataset.csv"

# Load the Scraped_dataset.csv directly into a pandas DataFrame
raw_dataset = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "yashdharme36/airfare-ml-predicting-flight-fares",
    RAW_DATA_FILE,
)

# Convert date columns to datetime format
# NOTE(review): no explicit format is passed, so pandas infers it. If the raw
# strings are day-first (e.g. DD/MM/YYYY), pass `format=`/`dayfirst=True`
# explicitly -- confirm against the raw file.
for date_column in ("Date of Booking", "Date of Journey"):
    raw_dataset[date_column] = pd.to_datetime(raw_dataset[date_column])

# Save to data/raw_data (the directory is created if it does not exist yet).
# The saved copy already contains the parsed dates from the step above.
os.makedirs(RAW_DATA_DIR, exist_ok=True)
raw_dataset.to_csv(os.path.join(RAW_DATA_DIR, RAW_DATA_FILE))








In [19]:
# Peek at the first five rows to see the raw column layout before cleaning
raw_dataset.head(n=5)

Unnamed: 0,Date of Booking,Date of Journey,Airline-Class,Departure Time,Arrival Time,Duration,Total Stops,Price
0,2023-01-15,2023-01-16,SpiceJet \nSG-8169\nECONOMY,20:00\nDelhi,22:05\nMumbai,02h 05m,non-stop,5335
1,2023-01-15,2023-01-16,Indigo \n6E-2519\nECONOMY,23:00\nDelhi,01:20\nMumbai,02h 20m,non-stop,5899
2,2023-01-15,2023-01-16,GO FIRST \nG8- 354\nECONOMY,22:30\nDelhi,00:40\nMumbai,02h 10m,non-stop,5801
3,2023-01-15,2023-01-16,SpiceJet \nSG-8709\nECONOMY,18:50\nDelhi,20:55\nMumbai,02h 05m,non-stop,5794
4,2023-01-15,2023-01-16,Air India \nAI-805\nECONOMY,20:00\nDelhi,22:10\nMumbai,02h 10m,non-stop,5955


In [20]:
# Raw header -> snake_case name used by the cleaning steps below
COLUMN_RENAMES = {
    "Date of Booking": "date_of_booking",
    "Date of Journey": "date_of_journey",
    "Airline-Class": "airline_class",
    "Departure Time": "departure_time",
    "Arrival Time": "arrival_time",
    "Duration": "duration",
    "Total Stops": "total_stops",
    "Price": "price",
}
raw_dataset = raw_dataset.rename(COLUMN_RENAMES, axis="columns")

raw_dataset.head()

Unnamed: 0,date_of_booking,date_of_journey,airline_class,departure_time,arrival_time,duration,total_stops,price
0,2023-01-15,2023-01-16,SpiceJet \nSG-8169\nECONOMY,20:00\nDelhi,22:05\nMumbai,02h 05m,non-stop,5335
1,2023-01-15,2023-01-16,Indigo \n6E-2519\nECONOMY,23:00\nDelhi,01:20\nMumbai,02h 20m,non-stop,5899
2,2023-01-15,2023-01-16,GO FIRST \nG8- 354\nECONOMY,22:30\nDelhi,00:40\nMumbai,02h 10m,non-stop,5801
3,2023-01-15,2023-01-16,SpiceJet \nSG-8709\nECONOMY,18:50\nDelhi,20:55\nMumbai,02h 05m,non-stop,5794
4,2023-01-15,2023-01-16,Air India \nAI-805\nECONOMY,20:00\nDelhi,22:10\nMumbai,02h 10m,non-stop,5955


In [21]:
# Derive the weekday name (e.g. "Monday") of every journey date
journey_dates = raw_dataset["date_of_journey"]
raw_dataset = raw_dataset.assign(day_of_flight=journey_dates.dt.day_name())

# Show the new column next to the date it was derived from
raw_dataset.loc[:, ["date_of_journey", "day_of_flight"]].head()

Unnamed: 0,date_of_journey,day_of_flight
0,2023-01-16,Monday
1,2023-01-16,Monday
2,2023-01-16,Monday
3,2023-01-16,Monday
4,2023-01-16,Monday


In [22]:
# Each cell holds "<airline> \n<flight code>\n<class>": split on the newlines
# and swallow any whitespace around them (covers both " \n" and "\n")
NEWLINE_WITH_PADDING = r'\s*\n\s*'
split_result = raw_dataset["airline_class"].str.split(NEWLINE_WITH_PADDING, expand=True)

# Store the three pieces under their own column names
raw_dataset[["airline", "flight_code", "class"]] = split_result

# The combined "airline_class" column is now redundant
raw_dataset = raw_dataset.drop("airline_class", axis="columns")

# View the result
raw_dataset.head()

Unnamed: 0,date_of_booking,date_of_journey,departure_time,arrival_time,duration,total_stops,price,day_of_flight,airline,flight_code,class
0,2023-01-15,2023-01-16,20:00\nDelhi,22:05\nMumbai,02h 05m,non-stop,5335,Monday,SpiceJet,SG-8169,ECONOMY
1,2023-01-15,2023-01-16,23:00\nDelhi,01:20\nMumbai,02h 20m,non-stop,5899,Monday,Indigo,6E-2519,ECONOMY
2,2023-01-15,2023-01-16,22:30\nDelhi,00:40\nMumbai,02h 10m,non-stop,5801,Monday,GO FIRST,G8- 354,ECONOMY
3,2023-01-15,2023-01-16,18:50\nDelhi,20:55\nMumbai,02h 05m,non-stop,5794,Monday,SpiceJet,SG-8709,ECONOMY
4,2023-01-15,2023-01-16,20:00\nDelhi,22:10\nMumbai,02h 10m,non-stop,5955,Monday,Air India,AI-805,ECONOMY


In [23]:
# Each cell looks like "HH:MM\n<city>": split it into a clock time and a city,
# then drop the combined column. Departure is handled before arrival so the
# new columns land in the order departure, source, arrival, destination.
for combined_col, time_col, city_col in (
    ("departure_time", "departure", "source"),
    ("arrival_time", "arrival", "destination"),
):
    raw_dataset[[time_col, city_col]] = raw_dataset[combined_col].str.split(r'\s*\n\s*', expand=True)
    raw_dataset = raw_dataset.drop(columns=[combined_col])

# Parse the "HH:MM" strings and keep only the time-of-day component
for time_col in ("departure", "arrival"):
    raw_dataset[time_col] = pd.to_datetime(raw_dataset[time_col], format='%H:%M').dt.time

raw_dataset.head()

Unnamed: 0,date_of_booking,date_of_journey,duration,total_stops,price,day_of_flight,airline,flight_code,class,departure,source,arrival,destination
0,2023-01-15,2023-01-16,02h 05m,non-stop,5335,Monday,SpiceJet,SG-8169,ECONOMY,20:00:00,Delhi,22:05:00,Mumbai
1,2023-01-15,2023-01-16,02h 20m,non-stop,5899,Monday,Indigo,6E-2519,ECONOMY,23:00:00,Delhi,01:20:00,Mumbai
2,2023-01-15,2023-01-16,02h 10m,non-stop,5801,Monday,GO FIRST,G8- 354,ECONOMY,22:30:00,Delhi,00:40:00,Mumbai
3,2023-01-15,2023-01-16,02h 05m,non-stop,5794,Monday,SpiceJet,SG-8709,ECONOMY,18:50:00,Delhi,20:55:00,Mumbai
4,2023-01-15,2023-01-16,02h 10m,non-stop,5955,Monday,Air India,AI-805,ECONOMY,20:00:00,Delhi,22:10:00,Mumbai


In [24]:
# Function to categorize time into periods
def categorize_time(time_obj):
    """Bucket a time of day into a coarse period label.

    Parameters
    ----------
    time_obj : datetime.time or missing value
        Any object exposing an ``hour`` attribute. ``None``, ``NaN`` and
        ``pd.NaT`` are all treated as missing.

    Returns
    -------
    str or None
        ``"6 AM - 12 PM"`` for hours 6-11, ``"12 PM - 6 PM"`` for hours 12-17,
        ``"others"`` for every remaining hour (18-23 and 0-5), and ``None``
        when the input is missing.
    """
    # pandas represents missing times as NaT/NaN rather than None. An identity
    # check against None lets those through: NaT.hour is NaN, which fails both
    # range tests below and would be mislabelled as "others".
    if pd.isna(time_obj):
        return None
    hour = time_obj.hour
    if 6 <= hour < 12:
        return "6 AM - 12 PM"
    if 12 <= hour < 18:
        return "12 PM - 6 PM"
    return "others"  # 18-23 and 0-5

# Label each flight's departure and arrival with its time-of-day bucket
# (new period column -> clock-time column it is derived from)
period_sources = {"departure_period": "departure", "arrival_period": "arrival"}
for period_col, time_col in period_sources.items():
    raw_dataset[period_col] = raw_dataset[time_col].apply(categorize_time)

# The exact clock times are dropped once they have been bucketed
raw_dataset = raw_dataset.drop(columns=["arrival", "departure"])
# View the result
raw_dataset.head()

Unnamed: 0,date_of_booking,date_of_journey,duration,total_stops,price,day_of_flight,airline,flight_code,class,source,destination,departure_period,arrival_period
0,2023-01-15,2023-01-16,02h 05m,non-stop,5335,Monday,SpiceJet,SG-8169,ECONOMY,Delhi,Mumbai,others,others
1,2023-01-15,2023-01-16,02h 20m,non-stop,5899,Monday,Indigo,6E-2519,ECONOMY,Delhi,Mumbai,others,others
2,2023-01-15,2023-01-16,02h 10m,non-stop,5801,Monday,GO FIRST,G8- 354,ECONOMY,Delhi,Mumbai,others,others
3,2023-01-15,2023-01-16,02h 05m,non-stop,5794,Monday,SpiceJet,SG-8709,ECONOMY,Delhi,Mumbai,others,others
4,2023-01-15,2023-01-16,02h 10m,non-stop,5955,Monday,Air India,AI-805,ECONOMY,Delhi,Mumbai,others,others


In [None]:
# One seed for both the global NumPy RNG and the split itself, so the split
# does not depend on what ran before this cell.
SEED = 522
N_PRICE_BINS = 10
PROCESSED_DIR = "../data/processed"

np.random.seed(SEED)
# Ask scikit-learn transformers to return DataFrames instead of NumPy arrays
set_config(transform_output="pandas")

# `price` is a continuous target: stratifying on the raw values makes
# train_test_split raise as soon as any price occurs only once. Stratify on
# quantile bins of the price instead so both splits cover the same price range.
price_bins = pd.qcut(
    raw_dataset["price"], q=N_PRICE_BINS, labels=False, duplicates="drop"
)

# create the split
airline_ticket_train, airline_ticket_test = train_test_split(
    raw_dataset, train_size=0.70, stratify=price_bins, random_state=SEED
)
assert len(airline_ticket_train) + len(airline_ticket_test) == len(raw_dataset)

# The output directory is not guaranteed to exist on a fresh checkout
os.makedirs(PROCESSED_DIR, exist_ok=True)
airline_ticket_train.to_csv(os.path.join(PROCESSED_DIR, "airline_ticket_train.csv"))
airline_ticket_test.to_csv(os.path.join(PROCESSED_DIR, "airline_ticket_test.csv"))

## EDA and Visualization