In [105]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression



In [106]:
# Load the flight fare dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/flightPrice.csv')
# Show the first rows as a quick sanity check of the parsed columns.
data.head()

Unnamed: 0,Date_of_journey,Journey_day,Airline,Flight_code,Class,Source,Departure,Total_stops,Arrival,Destination,Duration_in_hours,Days_left,Fare
0,2023-01-16,Monday,SpiceJet,SG-8169,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5335
1,2023-01-16,Monday,Indigo,6E-2519,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.3333,1,5899
2,2023-01-16,Monday,GO FIRST,G8-354,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.1667,1,5801
3,2023-01-16,Monday,SpiceJet,SG-8709,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5794
4,2023-01-16,Monday,Air India,AI-805,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.1667,1,5955


In [107]:
# Summary statistics (count, mean, std, quartiles, extremes) for the numeric columns.
data.describe()

Unnamed: 0,Duration_in_hours,Days_left,Fare
count,452088.0,452088.0,452088.0
mean,12.349222,25.627902,22840.10089
std,7.431478,14.300846,20307.963002
min,0.75,1.0,1307.0
25%,6.5833,13.0,8762.75
50%,11.3333,26.0,13407.0
75%,16.5,38.0,35587.0
max,43.5833,50.0,143019.0


In [108]:
# Per-column dtype and non-null count: shows which columns are text and whether any values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452088 entries, 0 to 452087
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Date_of_journey    452088 non-null  object 
 1   Journey_day        452088 non-null  object 
 2   Airline            452088 non-null  object 
 3   Flight_code        452088 non-null  object 
 4   Class              452088 non-null  object 
 5   Source             452088 non-null  object 
 6   Departure          452088 non-null  object 
 7   Total_stops        452088 non-null  object 
 8   Arrival            452088 non-null  object 
 9   Destination        452088 non-null  object 
 10  Duration_in_hours  452088 non-null  float64
 11  Days_left          452088 non-null  int64  
 12  Fare               452088 non-null  int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 44.8+ MB


In [109]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(452088, 13)

In [110]:
# Parse the journey date once and reuse the parsed values for every derived feature.
journey_dates = pd.to_datetime(data['Date_of_journey'])
data['Date_of_journey'] = journey_dates

# Calendar features: the day name of the journey and its month number.
data['Weekday'] = journey_dates.dt.day_name()
data['Month'] = journey_dates.dt.month
data.head()

Unnamed: 0,Date_of_journey,Journey_day,Airline,Flight_code,Class,Source,Departure,Total_stops,Arrival,Destination,Duration_in_hours,Days_left,Fare,Weekday,Month
0,2023-01-16,Monday,SpiceJet,SG-8169,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5335,Monday,1
1,2023-01-16,Monday,Indigo,6E-2519,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.3333,1,5899,Monday,1
2,2023-01-16,Monday,GO FIRST,G8-354,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.1667,1,5801,Monday,1
3,2023-01-16,Monday,SpiceJet,SG-8709,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5794,Monday,1
4,2023-01-16,Monday,Air India,AI-805,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.1667,1,5955,Monday,1


In [111]:
# Label-encode the text columns into integer codes.
#
# A separate LabelEncoder is fitted for each column and kept in `encoders`
# (keyed by column name). Re-fitting a single shared instance would retain only
# the mapping of the last column processed, making it impossible to translate
# codes back to labels (inverse_transform) or to encode other data consistently
# (transform) for the remaining columns.
#
# NOTE(review): the integer codes are arbitrary identifiers, not an ordering;
# models that treat features as numeric magnitudes will still read them as ordered.
categorical_columns = ['Airline', 'Class', 'Source', 'Destination', 'Departure', 'Arrival']
encoders = {}
for col in categorical_columns:
    # `encoder` ends up bound to the encoder of the last column, as before.
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    encoders[col] = encoder
data.head()

Unnamed: 0,Date_of_journey,Journey_day,Airline,Flight_code,Class,Source,Departure,Total_stops,Arrival,Destination,Duration_in_hours,Days_left,Fare,Weekday,Month
0,2023-01-16,Monday,6,SG-8169,1,3,2,non-stop,2,6,2.0833,1,5335,Monday,1
1,2023-01-16,Monday,5,6E-2519,1,3,2,non-stop,3,6,2.3333,1,5899,Monday,1
2,2023-01-16,Monday,4,G8-354,1,3,2,non-stop,3,6,2.1667,1,5801,Monday,1
3,2023-01-16,Monday,6,SG-8709,1,3,2,non-stop,2,6,2.0833,1,5794,Monday,1
4,2023-01-16,Monday,0,AI-805,1,3,2,non-stop,2,6,2.1667,1,5955,Monday,1


In [112]:
# List the distinct stop categories so that a numeric mapping can cover every value.
print(data['Total_stops'].unique())

['non-stop' '1-stop' '2+-stop']


In [113]:
# Stop categories become a small ordered integer; '2+-stop' is represented as 2.
stops_to_count = {'non-stop': 0, '1-stop': 1, '2+-stop': 2}

# Day names are numbered in calendar order, Monday=1 through Sunday=7.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_to_number = {day_name: position for position, day_name in enumerate(day_names, start=1)}

# Values missing from a lookup table would come out as NaN.
data['Total_stops'] = data['Total_stops'].map(stops_to_count)
data['Weekday'] = data['Weekday'].map(weekday_to_number)
data.head()

Unnamed: 0,Date_of_journey,Journey_day,Airline,Flight_code,Class,Source,Departure,Total_stops,Arrival,Destination,Duration_in_hours,Days_left,Fare,Weekday,Month
0,2023-01-16,Monday,6,SG-8169,1,3,2,0,2,6,2.0833,1,5335,1,1
1,2023-01-16,Monday,5,6E-2519,1,3,2,0,3,6,2.3333,1,5899,1,1
2,2023-01-16,Monday,4,G8-354,1,3,2,0,3,6,2.1667,1,5801,1,1
3,2023-01-16,Monday,6,SG-8709,1,3,2,0,2,6,2.0833,1,5794,1,1
4,2023-01-16,Monday,0,AI-805,1,3,2,0,2,6,2.1667,1,5955,1,1


In [114]:
# Count missing values per column.
data.isnull().sum()

Date_of_journey      0
Journey_day          0
Airline              0
Flight_code          0
Class                0
Source               0
Departure            0
Total_stops          0
Arrival              0
Destination          0
Duration_in_hours    0
Days_left            0
Fare                 0
Weekday              0
Month                0
dtype: int64

In [115]:
# Inspect the dtype of every column to see which ones are still non-numeric.
data.dtypes

Date_of_journey      datetime64[ns]
Journey_day                  object
Airline                       int32
Flight_code                  object
Class                         int32
Source                        int32
Departure                     int32
Total_stops                   int64
Arrival                       int32
Destination                   int32
Duration_in_hours           float64
Days_left                     int64
Fare                          int64
Weekday                       int64
Month                         int32
dtype: object

In [116]:
# Target is the fare; the features are every remaining column except the
# identifier/date/text columns listed below.
non_feature_columns = ['Fare', 'Flight_code', 'Journey_day', 'Date_of_journey']
y = data['Fare']
X = data.drop(columns=non_feature_columns)

In [117]:
# Two-stage split: first hold out 30% of the rows, then cut that hold-out in half
# to obtain the validation and test sets (70% / 15% / 15% overall).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Report the size of each split.
for split_label, split_features in (
    ('Training set shape:', X_train),
    ('Validation set shape:', X_val),
    ('Test set shape:', X_test),
):
    print(split_label, split_features.shape)

Training set shape: (316461, 11)
Validation set shape: (67813, 11)
Test set shape: (67814, 11)


In [118]:
# Create the linear regression model and fit it on the training split in one step
# (fit returns the fitted estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Display the fitted estimator as the cell output.
model



In [119]:
# Predict on both held-out splits with the fitted linear model.
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Validation metrics
val_mse, val_r2 = mean_squared_error(y_val, y_val_pred), r2_score(y_val, y_val_pred)
print('Validation MSE:', val_mse)
print('Validation R2:', val_r2)

# Test metrics
test_mse, test_r2 = mean_squared_error(y_test, y_test_pred), r2_score(y_test, y_test_pred)
print('Test MSE:', test_mse)
print('Test R2:', test_r2)

Validation MSE: 249375952.09356353
Validation R2: 0.3975256345629019
Test MSE: 246889108.98283064
Test R2: 0.39859270896383747


In [120]:
# Build a 100-tree random forest with a fixed seed and fit it on the training split
# (fit returns the fitted estimator itself).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Display the fitted estimator as the cell output.
rf_model

In [121]:
# Score the random forest on the validation split.
y_val_pred_rf = rf_model.predict(X_val)
val_mse_rf, val_r2_rf = (
    mean_squared_error(y_val, y_val_pred_rf),
    r2_score(y_val, y_val_pred_rf),
)

for metric_label, metric_value in (
    ('Validation MSE (Random Forest):', val_mse_rf),
    ('Validation R2 (Random Forest):', val_r2_rf),
):
    print(metric_label, metric_value)


Validation MSE (Random Forest): 19783856.434670188
Validation R2 (Random Forest): 0.952203625685188


In [122]:
# Score the random forest on the test split.
y_test_pred_rf = rf_model.predict(X_test)
test_mse_rf, test_r2_rf = (
    mean_squared_error(y_test, y_test_pred_rf),
    r2_score(y_test, y_test_pred_rf),
)

for metric_label, metric_value in (
    ('Test MSE (Random Forest):', test_mse_rf),
    ('Test R2 (Random Forest):', test_r2_rf),
):
    print(metric_label, metric_value)

Test MSE (Random Forest): 19063070.90363536
Test R2 (Random Forest): 0.953563484925603


# Predict using new data: train and evaluate a separate Random Forest on a second dataset

In [151]:
# Load the second flight dataset from an Excel workbook (path relative to the notebook's working directory).
ndf = pd.read_excel('../data/Dummy data.xlsx')

# Display the first few rows of the new dataframe to check the parsed columns.
ndf.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [152]:
# List the distinct raw duration strings to see which textual formats have to be parsed.
ndf['Duration'].unique()

array(['2h 50m', '7h 25m', '19h', '5h 25m', '4h 45m', '2h 25m', '15h 30m',
       '21h 5m', '25h 30m', '7h 50m', '13h 15m', '2h 35m', '2h 15m',
       '12h 10m', '26h 35m', '4h 30m', '22h 35m', '23h', '20h 35m',
       '5h 10m', '15h 20m', '2h 55m', '13h 20m', '15h 10m', '5h 45m',
       '5h 55m', '13h 25m', '22h', '5h 30m', '10h 25m', '5h 15m',
       '2h 30m', '6h 15m', '11h 55m', '11h 5m', '8h 30m', '22h 5m',
       '2h 45m', '12h', '16h 5m', '19h 55m', '3h 15m', '25h 20m', '3h',
       '16h 15m', '15h 5m', '6h 30m', '25h 5m', '12h 25m', '27h 20m',
       '10h 15m', '10h 30m', '1h 30m', '1h 25m', '26h 30m', '7h 20m',
       '13h 30m', '5h', '19h 5m', '14h 50m', '2h 40m', '22h 10m',
       '9h 35m', '10h', '21h 20m', '18h 45m', '12h 20m', '18h', '9h 15m',
       '17h 30m', '16h 35m', '12h 15m', '7h 30m', '24h', '8h 55m',
       '7h 10m', '14h 30m', '30h 20m', '15h', '12h 45m', '10h 10m',
       '15h 25m', '14h 5m', '20h 15m', '23h 10m', '18h 10m', '16h',
       '2h 20m', '8h', '16h 5

In [153]:
# Convert the journey date to datetime and the departure/arrival clock times to
# datetime.time objects. Every conversion uses an explicit format so pandas never
# has to guess the layout (guessing emits format-inference warnings and is slow).
ndf['Date_of_Journey'] = pd.to_datetime(ndf['Date_of_Journey'], format='%d/%m/%Y')

# Arrival_Time can carry a trailing date (e.g. "01:10 22 Mar"); only the leading
# HH:MM token is a clock time, so that token is extracted before parsing.
# Going through astype(str) also lets the cell be re-run on values that were
# already converted to time objects (their text form starts with HH:MM too).
# Entries without a leading HH:MM token (including missing values) become NaT.
clock_pattern = r'^\s*(\d{1,2}:\d{2})'
for time_col in ('Dep_Time', 'Arrival_Time'):
    clock_text = ndf[time_col].astype(str).str.extract(clock_pattern, expand=False)
    ndf[time_col] = pd.to_datetime(clock_text, format='%H:%M').dt.time

  ndf['Dep_Time'] = pd.to_datetime(ndf['Dep_Time']).dt.time
  ndf['Arrival_Time'] = pd.to_datetime(ndf['Arrival_Time']).dt.time


In [155]:
def parse_duration(duration):
    """Convert a duration string such as '2h 50m' into total minutes.

    The string is split on whitespace and each token is read by its unit suffix:
    a token ending in 'h' gives the hours, a token ending in 'm' gives the minutes.
    Either part may be absent, so '19h' yields 1140 and '5m' yields 5.
    Tokens with any other suffix are ignored.

    Parameters:
        duration: text made of an optional '<int>h' token and an optional
            '<int>m' token, separated by whitespace.

    Returns:
        int: hours * 60 + minutes.

    Raises:
        ValueError: if the numeric part of an 'h' or 'm' token is not an integer.
    """
    hours = 0
    minutes = 0
    # split() without an argument tolerates repeated or surrounding whitespace.
    for part in duration.split():
        if part.endswith('h'):
            hours = int(part[:-1])
        elif part.endswith('m'):
            # Handled independently of position so minutes-only values are not lost.
            minutes = int(part[:-1])
    return hours * 60 + minutes

# Replace each raw duration string with the value returned by parse_duration (total minutes).
# The column is overwritten in place, so the raw strings are no longer available afterwards.
ndf['Duration'] = ndf['Duration'].apply(parse_duration)

In [157]:
# Encode categorical variables as integer codes.
#
# Each column gets its own LabelEncoder, stored in `label_encoders` under the
# column name. A single encoder re-fitted column after column would remember
# only the last column's classes, so the other codes could never be decoded
# (inverse_transform) or applied consistently to further data (transform).
#
# NOTE(review): rows with missing values are only dropped in a later cell;
# confirm how missing entries in these columns should be treated before encoding.
label_encoders = {}
for column_name in ['Airline', 'Source', 'Destination', 'Route', 'Total_Stops', 'Additional_Info']:
    # `label_encoder` ends up bound to the encoder of the last column, as before.
    label_encoder = LabelEncoder()
    ndf[column_name] = label_encoder.fit_transform(ndf[column_name])
    label_encoders[column_name] = label_encoder

In [158]:
# Handle missing values (if any): drop every row that has a missing value in any column.
ndf = ndf.dropna()

# Helper used to turn 'Dep_Time' and 'Arrival_Time' into minutes past midnight
def time_to_minutes(t):
    """Return the minutes elapsed since midnight for ``t``.

    ``t`` must expose ``hour`` and ``minute`` attributes; nothing else is read.
    """
    hours_as_minutes = 60 * t.hour
    return hours_as_minutes + t.minute

# Replace both clock-time columns with minutes past midnight.
for clock_column in ('Dep_Time', 'Arrival_Time'):
    ndf[clock_column] = ndf[clock_column].apply(time_to_minutes)

# Represent the journey date as a plain integer (its proleptic Gregorian ordinal).
ndf['Date_of_Journey'] = ndf['Date_of_Journey'].apply(lambda journey_date: journey_date.toordinal())

In [159]:
# Separate the target (Price) from the features, then hold out 20% of the rows for testing.
# Note: this rebinds X, y, X_train, X_test, y_train and y_test, which earlier cells
# used for the first dataset.
y = ndf['Price']
X = ndf.drop(columns=['Price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [160]:
# Build a 100-tree random forest with a fixed seed and fit it on the training rows
# (fit returns the fitted estimator itself).
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Display the fitted estimator as the cell output.
rf_regressor

In [162]:
# Predict prices for the held-out rows.
y_pred = rf_regressor.predict(X_test)

# Mean squared error and R2 score of those predictions, computed before reporting.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics, one per line.
for metric_label, metric_value in (('Mean Squared Error:', mse), ('R2 Score:', r2)):
    print(metric_label, metric_value)

Mean Squared Error: 3068578.476734092
R2 Score: 0.8550092894374807
