In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Load the cleaned flight dataset; the first CSV column becomes the row index.
df = pd.read_csv(
    'Clean_Dataset.csv',
    index_col=0,
)
df.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [None]:
# Summarise the raw frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300153 entries, 0 to 300152
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   flight            300153 non-null  object 
 2   source_city       300153 non-null  object 
 3   departure_time    300153 non-null  object 
 4   stops             300153 non-null  object 
 5   arrival_time      300153 non-null  object 
 6   destination_city  300153 non-null  object 
 7   class             300153 non-null  object 
 8   duration          300153 non-null  float64
 9   days_left         300153 non-null  int64  
 10  price             300153 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 27.5+ MB


In [None]:
# List the column labels of the loaded frame.
df.columns

Index(['airline', 'flight', 'source_city', 'departure_time', 'stops',
       'arrival_time', 'destination_city', 'class', 'duration', 'days_left',
       'price'],
      dtype='object')

## Integer Mapping of stops

In [None]:
# Inspect the distinct raw labels in `stops` before encoding them.
df['stops'].unique()

array(['zero', 'one', 'two_or_more'], dtype=object)

In [None]:
# Encode the `stops` labels as integers.
stops_mapping = {'zero': 0, 'one': 1, 'two_or_more': 2}

# Series.map() turns every value that is not a key of the mapping into NaN, so
# re-running this cell on an already-encoded column would wipe it out. Only map
# while the column still holds the raw string labels.
if not pd.api.types.is_numeric_dtype(df['stops']):
    stops_encoded = df['stops'].map(stops_mapping)
    if stops_encoded.isna().any():
        # Fail loudly instead of letting NaN flow into the models.
        unknown_stops = sorted(
            df.loc[stops_encoded.isna(), 'stops'].astype(str).unique()
        )
        raise ValueError(f"Unmapped values in 'stops': {unknown_stops}")
    df['stops'] = stops_encoded.astype('int64')


In [None]:
# Count rows per encoded `stops` value to check the mapping result.
df['stops'].value_counts()

Unnamed: 0_level_0,count
stops,Unnamed: 1_level_1
1,250863
0,36004
2,13286


In [None]:
# Preview the frame after encoding `stops`.
# NOTE(review): the saved output under this cell shows label-encoded columns and
# NaN in `stops`, which a top-to-bottom run cannot produce at this point --
# presumably left over from an out-of-order execution. Re-run with
# Restart Kernel -> Run All before sharing.
df.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,4,660,0,3.0,,4.0,1.0,0,2.17,1.0,5953.0
1,4,640,0,0.0,,1.0,1.0,0,2.33,1.0,5953.0
2,0,568,0,0.0,,0.0,1.0,0,2.17,1.0,5956.0
3,5,765,0,1.0,,2.0,1.0,0,2.25,1.0,5955.0
4,5,755,0,1.0,,1.0,1.0,0,2.33,1.0,5955.0


In [None]:
# Re-check dtypes and non-null counts after encoding `stops`.
# NOTE(review): the saved output under this cell reports 65697 rows and several
# float columns, unlike the 300153-row frame reported earlier -- it appears to
# come from a different run. Re-execute to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 65697 entries, 0 to 65696
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           65697 non-null  object 
 1   flight            65697 non-null  object 
 2   source_city       65697 non-null  object 
 3   departure_time    65697 non-null  object 
 4   stops             65696 non-null  float64
 5   arrival_time      65696 non-null  object 
 6   destination_city  65696 non-null  object 
 7   class             65696 non-null  object 
 8   duration          65696 non-null  float64
 9   days_left         65696 non-null  float64
 10  price             65696 non-null  float64
dtypes: float64(4), object(7)
memory usage: 6.0+ MB


In [None]:
# Preview the first rows; `stops` is integer-coded, other categoricals are still strings.
df.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,0,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,0,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,0,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,0,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,0,Morning,Mumbai,Economy,2.33,1,5955


## Pre-processing




In [None]:
# Integer codes for the time-of-day buckets, shared by both time columns.
time_mapping = {
    'Early_Morning': 0,
    'Morning': 1,
    'Afternoon': 2,
    'Evening': 3,
    'Night': 4,
    'Late_Night': 5
}

for time_col in ('arrival_time', 'departure_time'):
    # Series.map() yields NaN for anything that is not a key of the mapping, so
    # mapping an already-encoded column (cell re-run) would destroy it. Skip
    # columns that are numeric already.
    if pd.api.types.is_numeric_dtype(df[time_col]):
        continue
    time_encoded = df[time_col].map(time_mapping)
    if time_encoded.isna().any():
        # Fail loudly instead of letting NaN flow into the models.
        unknown_times = sorted(
            df.loc[time_encoded.isna(), time_col].astype(str).unique()
        )
        raise ValueError(f"Unmapped values in '{time_col}': {unknown_times}")
    df[time_col] = time_encoded.astype('int64')


In [None]:
# Preview the frame after encoding the two time-of-day columns.
df.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,3,0,4,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,0,0,1,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,0,0,0,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,1,0,2,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,1,0,1,Mumbai,Economy,2.33,1,5955


In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder per column, so each keeps its own `classes_` and can later
# inverse-transform its column back to the original labels.
le_airline = LabelEncoder()
le_class = LabelEncoder()
le_flight = LabelEncoder()

# Fit and transform the airline, class and flight columns.
df['airline'] = le_airline.fit_transform(df['airline'])
df['class'] = le_class.fit_transform(df['class'])
# Use the dedicated flight encoder: fitting `le_class` on `flight` would
# overwrite the class labels it learned on the previous line and leave
# `le_flight` unfitted.
df['flight'] = le_flight.fit_transform(df['flight'])



In [None]:
# Preview the frame after label-encoding airline, class and flight.
df.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,4,1408,Delhi,3,0,4,Mumbai,1,2.17,1,5953
1,4,1387,Delhi,0,0,1,Mumbai,1,2.33,1,5953
2,0,1213,Delhi,0,0,0,Mumbai,1,2.17,1,5956
3,5,1559,Delhi,1,0,2,Mumbai,1,2.25,1,5955
4,5,1549,Delhi,1,0,1,Mumbai,1,2.33,1,5955


In [None]:
# Integer codes for the cities, shared by the source and destination columns.
city_mapping = {
    'Delhi': 0,
    'Mumbai': 1,
    'Bangalore': 2,
    'Kolkata': 3,
    'Hyderabad': 4,
    'Chennai': 5
}

for city_col in ('source_city', 'destination_city'):
    # Series.map() yields NaN for anything that is not a key of the mapping, so
    # mapping an already-encoded column (cell re-run) would destroy it. Skip
    # columns that are numeric already.
    if pd.api.types.is_numeric_dtype(df[city_col]):
        continue
    city_encoded = df[city_col].map(city_mapping)
    if city_encoded.isna().any():
        # Fail loudly instead of letting NaN flow into the models.
        unknown_cities = sorted(
            df.loc[city_encoded.isna(), city_col].astype(str).unique()
        )
        raise ValueError(f"Unmapped values in '{city_col}': {unknown_cities}")
    df[city_col] = city_encoded.astype('int64')


## Final Processed Dataset

In [None]:
# Preview the fully encoded frame.
df.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,4,1408,0,3,0,4,1,1,2.17,1,5953
1,4,1387,0,0,0,1,1,1,2.33,1,5953
2,0,1213,0,0,0,0,1,1,2.17,1,5956
3,5,1559,0,1,0,2,1,1,2.25,1,5955
4,5,1549,0,1,0,1,1,1,2.33,1,5955


In [None]:
# Check dtypes and non-null counts of the encoded frame before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300153 entries, 0 to 300152
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  int64  
 1   flight            300153 non-null  int64  
 2   source_city       300153 non-null  int64  
 3   departure_time    300153 non-null  int64  
 4   stops             300153 non-null  int64  
 5   arrival_time      300153 non-null  int64  
 6   destination_city  300153 non-null  int64  
 7   class             300153 non-null  int64  
 8   duration          300153 non-null  float64
 9   days_left         300153 non-null  int64  
 10  price             300153 non-null  int64  
dtypes: float64(1), int64(10)
memory usage: 27.5 MB


In [None]:
# Target (dependent variable) `price` goes in y; every other column is a
# feature (independent variable) and goes in X.
y = df['price']
X = df.drop(columns=['price'])

In [None]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)
# Display the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((210107, 10), (90046, 10), (210107,), (90046,))

In [None]:
# Preview the feature matrix (all columns except `price`).
X.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
0,4,1408,0,3,0,4,1,1,2.17,1
1,4,1387,0,0,0,1,1,1,2.33,1
2,0,1213,0,0,0,0,1,1,2.17,1
3,5,1559,0,1,0,2,1,1,2.25,1
4,5,1549,0,1,0,1,1,1,2.33,1


In [None]:
# Linear Regression: ordinary least squares fitted on the training split.
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=y_train)


In [None]:
# Predict prices for the held-out test features with the linear model.
y_pred = lin_reg.predict(X_test)

In [None]:
# Score the predictions currently held in `y_pred` against the test targets.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics; R-squared is printed as a percentage.
print(
    f"Mean Squared Error: {mse}",
    f"R-squared (Accuracy): {r2 * 100:.4f}%",
    sep="\n",
)

Mean Squared Error: 48157230.156861916
R-squared (Accuracy): 90.64%


In [None]:
# Random forest: 100 trees, seeded for reproducibility.
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)
rf_regressor.fit(X=X_train, y=y_train)


In [None]:
# Predict prices for the held-out test features with the random forest.
y_pred = rf_regressor.predict(X_test)

In [None]:
# Compute test-set error and R-squared for the predictions in `y_pred`.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

# Print the two metrics on separate lines.
for metric_line in (
    f"Mean Squared Error: {mse}",
    f"R-squared (Accuracy): {r2 * 100:.4f}%",
):
    print(metric_line)


Mean Squared Error: 5471880.385162119
R-squared: 0.9893681046526155


In [None]:
# Decision tree: a single seeded regression tree fitted on the training split.
from sklearn.tree import DecisionTreeRegressor

dt_regressor = DecisionTreeRegressor(
    random_state=42,
)
dt_regressor.fit(X=X_train, y=y_train)

In [None]:
# Predict prices for the held-out test features with the decision tree.
y_pred = dt_regressor.predict(X_test)

In [None]:
# Evaluate the predictions in `y_pred` on the test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Emit both metrics in one call; output is one metric per line.
print(
    f"Mean Squared Error: {mse}",
    f"R-squared (Accuracy): {r2 * 100:.4f}%",
    sep="\n",
)

Mean Squared Error: 8859011.072528362
R-squared: 0.9827868900680197


In [None]:
#KNN
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# KNN predicts from distances between rows, so features on large numeric scales
# (e.g. the label-encoded `flight` code, in the thousands) swamp small-range
# ones such as `class` or `stops`. Standardise every feature first; wrapping
# both steps in a pipeline means the scaler is fitted on the training split
# only and is applied automatically inside `predict`.
knn_regressor = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=5),
)
knn_regressor.fit(X_train, y_train)


In [None]:
# Predict on the test split with the KNN model and score the result.
y_pred_knn = knn_regressor.predict(X_test)

mse, r2 = (
    mean_squared_error(y_test, y_pred_knn),
    r2_score(y_test, y_pred_knn),
)

# Report both metrics; R-squared is printed as a percentage.
print(
    f"Mean Squared Error: {mse}",
    f"R-squared (Accuracy): {r2 * 100:.4f}%",
    sep="\n",
)

Mean Squared Error: 275181584.12686807
R-squared: 0.4653205848764782


In [None]:
# Extra-trees ensemble: 100 seeded trees; fit, predict and score in one cell.
from sklearn.ensemble import ExtraTreesRegressor

# fit() returns the estimator itself, so construction and fitting can be chained.
etr = ExtraTreesRegressor(
    random_state=42,
    n_estimators=100,
).fit(X_train, y_train)

y_pred = etr.predict(X_test)

# Test-set metrics.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(
    f"Mean Squared Error: {mse}",
    f"R-squared (Accuracy): {r2 * 100:.4f}%",
    sep="\n",
)


Mean Squared Error: 6363115.95315174
R^2 Score: 0.9876364287712445


In [None]:
!pip install scikit-learn --upgrade



In [None]:
# Bagging: 300 bootstrap-aggregated decision trees, all seeded.
from sklearn.ensemble import BaggingRegressor

base_estimator = DecisionTreeRegressor(random_state=42)
# fit() returns the estimator itself, so construction and fitting can be chained.
bagging_regressor = BaggingRegressor(
    estimator=base_estimator,
    n_estimators=300,
    random_state=42,
).fit(X_train, y_train)

y_pred = bagging_regressor.predict(X_test)

# Test-set metrics.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

for metric_line in (
    f"Mean Squared Error: {mse}",
    f"R-squared (Accuracy): {r2 * 100:.4f}%",
):
    print(metric_line)

Mean Squared Error: 5447887.383248699
R^2 Score: 0.9894147231945896


In [None]:
#XGB
import xgboost as xgb

# Gradient-boosted trees with squared-error loss. The seed is pinned to 42 like
# every other seeded estimator in this notebook, so results stay reproducible
# even if sampling options (e.g. subsample) are enabled later.
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xgb_reg.fit(X_train, y_train)

# Predict on the held-out test features.
y_pred = xgb_reg.predict(X_test)

# Test-set metrics; R-squared is printed as a percentage.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (Accuracy): {r2 * 100:.4f}%")


Mean Squared Error (MSE): 12439532.052669493
R-squared (R²) score: 0.9758681654930115


In [None]:
# Ridge regression (L2 penalty, alpha=1.0): fit, predict and score in one cell.
from sklearn.linear_model import Ridge

# fit() returns the estimator itself, so construction and fitting can be chained.
ridge_reg = Ridge(alpha=1.0).fit(X_train, y_train)

y_pred = ridge_reg.predict(X_test)

# Test-set metrics.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (Accuracy): {r2 * 100:.4f}%",
    sep="\n",
)


Mean Squared Error (MSE): 48328395.578171566
R-squared (R²) score: 0.9062462512171827


In [None]:
#Lasso Regression
from sklearn.linear_model import Lasso

# Reuse the X_train / X_test / y_train / y_test produced by the notebook's
# single train_test_split cell (test_size=0.30, random_state=42). Re-splitting
# here with a different test size would score Lasso on different rows than
# every other model and silently overwrite the shared split variables.
lasso_reg = Lasso(alpha=1.0)
lasso_reg.fit(X_train, y_train)

# Predict on the held-out test features.
y_pred = lasso_reg.predict(X_test)

# Test-set metrics; R-squared is printed as a percentage.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (Accuracy): {r2 * 100:.4f}%")


Mean Squared Error (MSE): 48328075.808948964
R-squared (R²) score: 0.9062468715473841
