In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

In [2]:
# Display configuration: never truncate columns or rows when rendering frames,
# and allow up to 1000 characters per line in text output.
# Caution: with max_rows unset, displaying a full large frame renders every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = 1000

In [3]:
# Read the cleaned flight data into `df` and preview the first five rows
df = pd.read_csv(filepath_or_buffer='flight-price-prediction/clean_data_v2.csv')
df.head(n=5)

Unnamed: 0,Airline,Source,Departure Time,Stops,Arrival Time,Destination,Class,Duration,Days Left,Price
0,SpiceJet,Delhi,Evening,0,Night,Mumbai,Economy,130,1,5953
1,SpiceJet,Delhi,Early_Morning,0,Morning,Mumbai,Economy,140,1,5953
2,AirAsia,Delhi,Early_Morning,0,Early_Morning,Mumbai,Economy,130,1,5956
3,Vistara,Delhi,Morning,0,Afternoon,Mumbai,Economy,135,1,5955
4,Vistara,Delhi,Morning,0,Morning,Mumbai,Economy,140,1,5955


In [4]:
# Print a per-column summary: dtype, non-null count, and overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Airline         300153 non-null  object
 1   Source          300153 non-null  object
 2   Departure Time  300153 non-null  object
 3   Stops           300153 non-null  int64 
 4   Arrival Time    300153 non-null  object
 5   Destination     300153 non-null  object
 6   Class           300153 non-null  object
 7   Duration        300153 non-null  int64 
 8   Days Left       300153 non-null  int64 
 9   Price           300153 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 22.9+ MB


In [6]:
# List the distinct values present in the 'Stops' column
df['Stops'].unique()

array([0, 1, 2])

In [9]:
# Cardinality check: report how many distinct values each column holds
# (this prints nunique per column, not the per-value frequency table)
for column_name in df.columns:
    distinct_total = df[column_name].nunique()
    print(
        f"\n--- {column_name} ---",
        f"Number of unique values: {distinct_total}",
        sep="\n",
    )


--- Airline ---
Number of unique values: 6

--- Source ---
Number of unique values: 6

--- Departure Time ---
Number of unique values: 6

--- Stops ---
Number of unique values: 3

--- Arrival Time ---
Number of unique values: 6

--- Destination ---
Number of unique values: 6

--- Class ---
Number of unique values: 2

--- Duration ---
Number of unique values: 476

--- Days Left ---
Number of unique values: 49

--- Price ---
Number of unique values: 12157


In [10]:
# Encode the binary 'Class' feature explicitly: Economy -> 0, Business -> 1
class_mapping = {'Economy': 0, 'Business': 1}

# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on the already-encoded column would wipe it out.
# Skip the mapping when the column already holds only the target codes.
already_encoded = df['Class'].isin(list(class_mapping.values())).all()
if not already_encoded:
    # Fail loudly on labels the mapping does not cover instead of silently writing NaN
    unknown_labels = set(df['Class'].unique()) - set(class_mapping)
    if unknown_labels:
        raise ValueError(f"Unexpected values in 'Class': {unknown_labels}")
    df['Class'] = df['Class'].map(class_mapping)

print("Class column after encoding:")
print(df['Class'].value_counts())

Class column after encoding:
Class
0    206666
1     93487
Name: count, dtype: int64


In [23]:
# Nominal (unordered) categorical features to expand into indicator columns
categorical_cols = [
    'Airline',
    'Source',
    'Destination',
    'Departure Time',
    'Arrival Time',
]

# drop_first=True drops one level per feature (avoiding the "dummy variable trap");
# dtype=int yields 0/1 integers for the indicator columns
encoding_options = {'columns': categorical_cols, 'drop_first': True, 'dtype': int}
df_encoded = pd.get_dummies(df, **encoding_options)

print("Shape after One-Hot Encoding:", df_encoded.shape)
display(df_encoded.head())

Shape after One-Hot Encoding: (300153, 30)


Unnamed: 0,Stops,Class,Duration,Days Left,Price,Airline_Air_India,Airline_GO_FIRST,Airline_Indigo,Airline_SpiceJet,Airline_Vistara,Source_Chennai,Source_Delhi,Source_Hyderabad,Source_Kolkata,Source_Mumbai,Destination_Chennai,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_Mumbai,Departure Time_Early_Morning,Departure Time_Evening,Departure Time_Late_Night,Departure Time_Morning,Departure Time_Night,Arrival Time_Early_Morning,Arrival Time_Evening,Arrival Time_Late_Night,Arrival Time_Morning,Arrival Time_Night
0,0,0,130,1,5953,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1
1,0,0,140,1,5953,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0
2,0,0,130,1,5956,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0
3,0,0,135,1,5955,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
4,0,0,140,1,5955,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0


In [24]:
# Standardise the continuous features
numerical_cols = ['Duration', 'Days Left']

# NOTE(review): the scaler is fitted on the full dataset here; if a train/test
# split happens later, fitting before the split leaks test statistics - confirm.
scaler = StandardScaler()
scaler.fit(df_encoded[numerical_cols])

# Overwrite the original columns with their standardised values
df_encoded[numerical_cols] = scaler.transform(df_encoded[numerical_cols])

# Preview the fully processed frame
print("Final Processed Data:")
display(df_encoded.head())

Final Processed Data:


Unnamed: 0,Stops,Class,Duration,Days Left,Price,Airline_Air_India,Airline_GO_FIRST,Airline_Indigo,Airline_SpiceJet,Airline_Vistara,Source_Chennai,Source_Delhi,Source_Hyderabad,Source_Kolkata,Source_Mumbai,Destination_Chennai,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_Mumbai,Departure Time_Early_Morning,Departure Time_Evening,Departure Time_Late_Night,Departure Time_Morning,Departure Time_Night,Arrival Time_Early_Morning,Arrival Time_Evening,Arrival Time_Late_Night,Arrival Time_Morning,Arrival Time_Night
0,0,0,-1.397994,-1.843875,5953,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1
1,0,0,-1.37482,-1.843875,5953,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0
2,0,0,-1.397994,-1.843875,5956,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0
3,0,0,-1.386407,-1.843875,5955,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
4,0,0,-1.37482,-1.843875,5955,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0


In [21]:
# Show the (rows, columns) shape of the encoded frame
df_encoded.shape

(300153, 30)

In [26]:
# Transpose the five-row preview so each feature becomes a row, which is easier
# to read for a wide frame; values render as floats in the transposed view
display(df_encoded.head().transpose())

Unnamed: 0,0,1,2,3,4
Stops,0.0,0.0,0.0,0.0,0.0
Class,0.0,0.0,0.0,0.0,0.0
Duration,-1.397994,-1.37482,-1.397994,-1.386407,-1.37482
Days Left,-1.843875,-1.843875,-1.843875,-1.843875,-1.843875
Price,5953.0,5953.0,5956.0,5955.0,5955.0
Airline_Air_India,0.0,0.0,0.0,0.0,0.0
Airline_GO_FIRST,0.0,0.0,0.0,0.0,0.0
Airline_Indigo,0.0,0.0,0.0,0.0,0.0
Airline_SpiceJet,1.0,1.0,0.0,0.0,0.0
Airline_Vistara,0.0,0.0,0.0,1.0,1.0


In [27]:
# Persist the processed frame as CSV; index=False keeps the row index out of
# the file so no extra unnamed column is written
output_path = 'flight-price-prediction/flight_price_processed.csv'
df_encoded.to_csv(output_path, index=False)

print("Dataset saved successfully!")

Dataset saved successfully!
