In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import os
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor


In [3]:
# Location of the flight dataset CSV. Set the FLIGHT_DATA_CSV environment
# variable to point at your own copy of the file; when it is unset, the
# original author's local path is used as a fallback so existing runs keep working.
DATA_PATH = os.environ.get(
    "FLIGHT_DATA_CSV",
    r"C:\Users\Windows\Downloads\Clean_Dataset.csv\Clean_Dataset.csv",
)
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [5]:
# Summarise column names, dtypes, non-null counts and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        300153 non-null  int64  
 1   airline           300153 non-null  object 
 2   flight            300153 non-null  object 
 3   source_city       300153 non-null  object 
 4   departure_time    300153 non-null  object 
 5   stops             300153 non-null  object 
 6   arrival_time      300153 non-null  object 
 7   destination_city  300153 non-null  object 
 8   class             300153 non-null  object 
 9   duration          300153 non-null  float64
 10  days_left         300153 non-null  int64  
 11  price             300153 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 27.5+ MB


In [6]:
# (rows, columns) of the raw frame, before any renaming or column drops.
df.shape

(300153, 12)

## DATA CLEANING 

In [7]:
# Relabel the raw lower-case headers with the Title_Case names used in the
# rest of the notebook; the CSV's leftover 'Unnamed: 0' column becomes 'index'.
df = df.rename(
    columns={
        'Unnamed: 0': 'index',
        'flight': 'Flight_Number',
        'airline': 'Airline',
        'class': 'Class',
        'stops': 'Stops',
        'source_city': 'Source_City',
        'destination_city': 'Destination_City',
        'departure_time': 'Departure_Time',
        'arrival_time': 'Arrival_Time',
        'duration': 'Duration',
        'days_left': 'Days_Left',
        'price': 'Price',
    }
)

In [8]:
# Discard the two columns that are not used as model inputs: 'index'
# (appears to be a plain row counter in the preview) and the flight identifier.
df = df.drop(['index', 'Flight_Number'], axis='columns')

In [9]:
# Preview the first five rows after renaming and dropping columns.
df.head(n=5)

Unnamed: 0,Airline,Source_City,Departure_Time,Stops,Arrival_Time,Destination_City,Class,Duration,Days_Left,Price
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [10]:
# Separate the prediction target (Price) from the remaining feature columns.
y = df['Price']
X = df.drop('Price', axis='columns')

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [12]:
# Column groups consumed by the preprocessing transformer:
# numeric columns are scaled, categorical columns are one-hot encoded.
numeric_features = [
    'Duration',
    'Days_Left',
]
categorical_features = [
    'Airline',
    'Source_City',
    'Departure_Time',
    'Stops',
    'Arrival_Time',
    'Destination_City',
    'Class',
]

In [13]:
# Standardise the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' means a category not seen during fit does not raise
# at prediction time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
)


In [14]:
# Gradient-boosted regressor: 2000 boosting rounds, fixed seed for repeatable fits.
xgb = XGBRegressor(random_state=42, n_estimators=2000)

In [15]:
# Chain preprocessing and the regressor so both are fitted and applied as one estimator.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', xgb),
    ],
)

In [16]:
# Fit the preprocessing steps and the regressor on the training split only.
pipeline.fit(X_train, y=y_train)

In [17]:
# Score the fitted pipeline on the held-out split using mean absolute error.
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'XGBRegressor MAE: {mae}')

XGBRegressor MAE: 1267.5528245177877


In [18]:
# List the distinct values of every categorical feature. Iterating over
# `categorical_features` (the same list that feeds the one-hot encoder) instead
# of a second hand-written copy keeps this report in step with the columns the
# model is actually trained on.
for col in categorical_features:
    print(f"Unique values in {col}: {df[col].unique()}\n")
    

Unique values in Airline: ['SpiceJet' 'AirAsia' 'Vistara' 'GO_FIRST' 'Indigo' 'Air_India']

Unique values in Source_City: ['Delhi' 'Mumbai' 'Bangalore' 'Kolkata' 'Hyderabad' 'Chennai']

Unique values in Departure_Time: ['Evening' 'Early_Morning' 'Morning' 'Afternoon' 'Night' 'Late_Night']

Unique values in Stops: ['zero' 'one' 'two_or_more']

Unique values in Arrival_Time: ['Night' 'Morning' 'Early_Morning' 'Afternoon' 'Evening' 'Late_Night']

Unique values in Destination_City: ['Mumbai' 'Bangalore' 'Kolkata' 'Hyderabad' 'Chennai' 'Delhi']

Unique values in Class: ['Economy' 'Business']



In [19]:
# Persist the fitted pipeline (preprocessing + regressor) to disk.
# NOTE(review): the message below says "Tuned", but no hyper-parameter search or
# model comparison is visible in this notebook — confirm the wording.
with open('flight_price_pipeline.pkl', mode='wb') as file:
    pickle.dump(obj=pipeline, file=file)

print("Tuned model saved as flight_price_pipeline.pkl")


Tuned model saved as flight_price_pipeline.pkl


In [20]:
# Reload the pipeline from the pickle file on disk.
# NOTE: unpickling can execute arbitrary code — only load files you created or trust.
with open("flight_price_pipeline.pkl", mode="rb") as file:
    pipeline = pickle.load(file)

# One-row frame whose column names match the features the pipeline was fitted on.
sample_input = pd.DataFrame(
    data={
        "Airline": ["SpiceJet"],
        "Source_City": ["Delhi"],
        "Departure_Time": ["Early_Morning"],
        "Stops": ["zero"],
        "Arrival_Time": ["Morning"],
        "Destination_City": ["Mumbai"],
        "Class": ["Economy"],
        "Duration": [2.33],
        "Days_Left": [1],
    }
)

# Predict the price for that single itinerary.
predicted_price = pipeline.predict(sample_input)

print(f"Predicted Flight Price: ₹{predicted_price[0]:.2f}")

Predicted Flight Price: ₹6405.79
