In [1]:
import pandas as pd

In [2]:
# Load final.csv into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="final.csv")
df.head(n=5)

Unnamed: 0,Air India,GoAir,IndiGo,Jet Airways,Jet Airways Business,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Trujet,Vistara,...,day,Total_Stops,Dgree,Duration(Minutes),day_number,Price_USD,Dep_Time_hour,Dep_Time_min,Arrival_Time_hour,Arrival_Time_min
0,0,0,1,0,0,0,0,0,0,0,...,24,0.0,0,170,6,44.793103,22,20,1,10
1,1,0,0,0,0,0,0,0,0,0,...,1,2.0,1,445,2,88.068966,5,50,13,15
2,0,0,0,1,0,0,0,0,0,0,...,9,2.0,1,1140,6,159.563218,9,25,4,25
3,0,0,1,0,0,0,0,0,0,0,...,12,1.0,0,325,6,71.471264,18,5,23,30
4,0,0,1,0,0,0,0,0,0,0,...,1,1.0,1,285,4,152.896552,16,50,21,35


In [3]:
from sklearn.model_selection import train_test_split

# Separate the regression target (Price_USD) from the predictor columns.
y = df['Price_USD']
X = df.drop(columns=['Price_USD'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the two listed columns. The scaler is fitted on the training
# split only and the same fitted transform is then applied to both splits.
scaled_columns = ['Duration(Minutes)', 'Total_Stops']
scaler = MinMaxScaler()
scaler.fit(X_train[scaled_columns])
X_train[scaled_columns] = scaler.transform(X_train[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

In [5]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its test-split metrics.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so any previous fit is replaced.
    X_train, y_train
        Features and target passed to ``model.fit``.
    X_test, y_test
        Features passed to ``model.predict`` and the target the predictions
        are scored against.

    Returns
    -------
    None
        The model class name, MAE, RMSE and R² are printed, not returned.
    """
    # Local import: this function is called before the notebook's own
    # `import numpy as np` cell has run.
    import numpy as np

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    mae = mean_absolute_error(y_test, predictions)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # and removed in 1.6. Taking the square root of the per-target MSE and
    # averaging gives the same RMSE and works on both old and new releases.
    per_target_mse = mean_squared_error(y_test, predictions, multioutput="raw_values")
    rmse = float(np.mean(np.sqrt(per_target_mse)))
    r2 = r2_score(y_test, predictions)

    print(f"Model: {model.__class__.__name__}")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R² Score: {r2:.4f}")

In [6]:
# linear regression model
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least-squares regression with default settings.
model_lr = LinearRegression()
evaluate_model(
    model=model_lr,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Model: LinearRegression
MAE: 20.0463
RMSE: 27.2313
R² Score: 0.7397


In [7]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest hyperparameters, collected in one mapping.
rf_params = {
    "n_estimators": 100,     # number of trees in the forest
    "max_depth": 14,         # upper bound on the depth of each tree
    "min_samples_leaf": 5,   # minimum number of samples allowed in a leaf
    "max_features": "sqrt",  # features considered when looking for a split
    "bootstrap": True,       # draw a bootstrap sample of rows for each tree
    "random_state": 42,      # fixed seed
}
model_rf = RandomForestRegressor(**rf_params)
evaluate_model(model_rf, X_train, X_test, y_train, y_test)

Model: RandomForestRegressor
MAE: 13.6117
RMSE: 20.8861
R² Score: 0.8469


In [8]:
def check_overfitting(model, X_train, X_test, y_train, y_test):
    """Print the score of an already-fitted ``model`` on both data splits.

    ``model.score`` is called first on the training split and then on the
    test split; for scikit-learn regressors that score is R². A training
    score far above the testing score points to overfitting. Nothing is
    returned.
    """
    train_r2 = model.score(X_train, y_train)
    test_r2 = model.score(X_test, y_test)

    report_lines = [
        f"Model: {type(model).__name__}",
        f"Training R² Score: {train_r2:.4f}",
        f"Testing R² Score: {test_r2:.4f}",
    ]
    print("\n".join(report_lines))

In [9]:
# Compare train vs. test R² for the random forest fitted above.
check_overfitting(
    model=model_rf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Model: RandomForestRegressor
Training R² Score: 0.8570
Testing R² Score: 0.8469


In [10]:
import numpy as np
# Pair each training column with the importance the fitted forest assigned to it.
feature_importances = model_rf.feature_importances_
features = np.array(X_train.columns)

# argsort is ascending, so reverse it to list the most important feature first.
sorted_idx = np.argsort(feature_importances)[::-1]

print("🔹 Feature Importance Ranking:")
for name, score in zip(features[sorted_idx], feature_importances[sorted_idx]):
    print(f"{name}: {score:.4f}")

🔹 Feature Importance Ranking:
Dgree: 0.3544
Duration(Minutes): 0.1488
Total_Stops: 0.1073
Short: 0.0666
Jet Airways: 0.0483
day : 0.0375
month: 0.0296
IndiGo: 0.0253
Dest_Delhi: 0.0188
day_number: 0.0179
Arrival_Time_hour: 0.0150
SpiceJet: 0.0146
Dep_Time_hour: 0.0136
Delhi: 0.0136
Dest_New Delhi: 0.0126
Dep_Time_min: 0.0117
Arrival_Time_min: 0.0106
Medium: 0.0089
Dest_Cochin: 0.0081
Air India: 0.0076
Multiple carriers: 0.0067
Mumbai: 0.0053
Kolkata: 0.0048
Dest_Hyderabad: 0.0037
Jet Airways Business: 0.0026
Vistara: 0.0024
GoAir: 0.0020
Dest_Kolkata: 0.0010
Chennai: 0.0004
Multiple carriers Premium economy: 0.0001
Vistara Premium economy: 0.0000
Trujet: 0.0000


In [11]:
# Columns to remove from both splits: the seven lowest-ranked features in the
# importance listing printed by the previous cell.
un_important = [
    'GoAir',
    'Vistara',
    'Chennai',
    'Dest_Kolkata',
    'Multiple carriers Premium economy',
    'Vistara Premium economy',
    'Trujet',
]

X_train, X_test = (
    split.drop(columns=un_important) for split in (X_train, X_test)
)

In [12]:
# Refit a random forest with unchanged hyperparameters on the reduced feature set.
rf_params = {
    "n_estimators": 100,     # number of trees in the forest
    "max_depth": 14,         # upper bound on the depth of each tree
    "min_samples_leaf": 5,   # minimum number of samples allowed in a leaf
    "max_features": "sqrt",  # features considered when looking for a split
    "bootstrap": True,       # draw a bootstrap sample of rows for each tree
    "random_state": 42,      # fixed seed
}
model_rf = RandomForestRegressor(**rf_params)
evaluate_model(model_rf, X_train, X_test, y_train, y_test)

Model: RandomForestRegressor
MAE: 12.9757
RMSE: 20.1115
R² Score: 0.8580


In [13]:
# Compare train vs. test R² for the random forest refitted on the reduced columns.
check_overfitting(
    model=model_rf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Model: RandomForestRegressor
Training R² Score: 0.8742
Testing R² Score: 0.8580


In [14]:
from xgboost import XGBRegressor

# Hand-picked XGBoost settings used as the untuned baseline.
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    min_child_weight=5,
    colsample_bytree=0.6,
    random_state=42,
)
model_xgb = XGBRegressor(**xgb_params)
evaluate_model(model_xgb, X_train, X_test, y_train, y_test)

Model: XGBRegressor
MAE: 12.3565
RMSE: 17.5177
R² Score: 0.8923


In [15]:
# Compare train vs. test R² for the baseline XGBoost model.
check_overfitting(
    model=model_xgb,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Model: XGBRegressor
Training R² Score: 0.9169
Testing R² Score: 0.8923


In [16]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each XGBoost hyperparameter. The search samples
# n_iter=30 combinations from this grid rather than trying all of them.
param_dist = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.05, 0.08, 0.1],
    'max_depth': [6, 8, 10],
    'min_child_weight': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

random_search = RandomizedSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=30,
    cv=3,
    scoring='r2',
    verbose=1,
    n_jobs=-1,
    # Seed the candidate sampler. Without it a different set of 30 combinations
    # is drawn on every run, so the "best" model is not reproducible even though
    # every estimator in this notebook is seeded with 42.
    random_state=42
)

random_search.fit(X_train, y_train)

# Best model after tuning. The search already refits it on all of X_train
# (refit=True is the default); evaluate_model fits it once more before scoring.
best_xgb = random_search.best_estimator_
evaluate_model(best_xgb, X_train, X_test, y_train, y_test)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Model: XGBRegressor
MAE: 11.1117
RMSE: 16.7119
R² Score: 0.9020


In [17]:
# Compare train vs. test R² for the tuned XGBoost model.
check_overfitting(
    model=best_xgb,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Model: XGBRegressor
Training R² Score: 0.9403
Testing R² Score: 0.9020


In [19]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import  GradientBoostingRegressor


# Base learners for the stack: the random forest, a gradient-boosting model
# and the tuned XGBoost model.
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
base_models = [('rf', model_rf), ('gb', gb_model), ('xgb', best_xgb)]

# Meta-model that combines the base learners' predictions.
meta_model = XGBRegressor(n_estimators=50, learning_rate=0.1, random_state=42)

# Assemble the stacking ensemble.
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    n_jobs=-1,
)

# Fit and report test metrics, then compare train vs. test R².
evaluate_model(stacked_model, X_train, X_test, y_train, y_test)
check_overfitting(stacked_model, X_train, X_test, y_train, y_test)


Model: StackingRegressor
MAE: 11.0732
RMSE: 18.0506
R² Score: 0.8856
Model: StackingRegressor
Training R² Score: 0.9247
Testing R² Score: 0.8856


In [21]:
import joblib

# Persist the fitted MinMaxScaler alongside the model: the model was trained on
# scaled 'Duration(Minutes)' and 'Total_Stops', so any code that loads the model
# for prediction needs the same fitted transform for its inputs.
joblib.dump(scaler, 'minmax_scaler.pkl')

# Save the model (kept as the last expression so the cell still displays
# the model's file list).
joblib.dump(best_xgb, 'xgb_best_model.pkl')

['xgb_best_model.pkl']

In [26]:
# Show the feature columns that remain in the training frame.
X_train.columns

Index(['Air India', 'IndiGo', 'Jet Airways', 'Jet Airways Business',
       'Multiple carriers', 'SpiceJet', 'Delhi', 'Kolkata', 'Mumbai',
       'Dest_Cochin', 'Dest_Delhi', 'Dest_Hyderabad', 'Dest_New Delhi',
       'Medium', 'Short', 'month', 'day ', 'Total_Stops', 'Dgree',
       'Duration(Minutes)', 'day_number', 'Dep_Time_hour', 'Dep_Time_min',
       'Arrival_Time_hour', 'Arrival_Time_min'],
      dtype='object')