In [21]:
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Evaluation
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV

# Saving and loading models
import joblib

**1. Loading the dataset**

In [3]:
# Loading the dataset
df = pd.read_csv('data/Clean_Dataset_EDA_Processed.csv')
df

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,0,Night,Mumbai,economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,0,Morning,Mumbai,economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,0,Early_Morning,Mumbai,economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,0,Afternoon,Mumbai,economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,0,Morning,Mumbai,economy,2.33,1,5955
...,...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,UK-822,Chennai,Morning,1,Evening,Hyderabad,business,10.08,49,69265
300149,Vistara,UK-826,Chennai,Afternoon,1,Night,Hyderabad,business,10.42,49,77105
300150,Vistara,UK-832,Chennai,Early_Morning,1,Night,Hyderabad,business,13.83,49,79099
300151,Vistara,UK-828,Chennai,Early_Morning,1,Evening,Hyderabad,business,10.00,49,81585


In [5]:
#First we need to drop the flight column as it is a necessary feature for the model
df_model = df.drop(['flight'], axis=1)

# Defining features and target variable
X = df_model.drop('price', axis=1)
y = df_model['price']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data split into training and testing sets.")
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Data split into training and testing sets.
Training set size: (240122, 9)
Testing set size: (60031, 9)


**2. Feature Engineering**

In [9]:
# Defining categorical and numerical features
categorical_features = ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
numerical_features = ['duration', 'days_left']

# Creating a preprocessing pipeline for the categorical and numerical features
# Pipeline for numerical features, we will scale them making sure they have a mean of 0 and standard deviation of 1
numerical_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Pipeline for categorical features, we will one-hot encode them
categorical_pipeline = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combining both pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ],
    remainder='passthrough'
)

print("Preprocessing pipelines created for numerical and categorical features.")

Preprocessing pipelines created for numerical and categorical features.


In [10]:
# Fitting the preprocessor on the training data
X_train_transformed = preprocessor.fit_transform(X_train)

# Transforming the test data using the fitted preprocessor to ensure consistency
X_test_processed = preprocessor.transform(X_test)

print("Data preprocessing completed.")
print("Transformed training set shape:", X_train_transformed.shape)
print("Transformed testing set shape:", X_test_processed.shape)

Data preprocessing completed.
Transformed training set shape: (240122, 37)
Transformed testing set shape: (60031, 37)


**3. Model Building**

In [26]:
# Seelcting a small subset of the data for faster training and testing
sample_size = 150000
X_train_sample = X_train_transformed[:sample_size]
y_train_sample = y_train.iloc[:sample_size]

# log transforming the sampled target variable to handle skewness
y_train_sample_log = np.log1p(y_train_sample)

# Defining the models to be trained
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42, n_jobs=-1),
    'XGBoost': XGBRegressor(random_state=42, n_jobs=-1),
    'LightGBM': LGBMRegressor(random_state=42, n_jobs=-1)
}

#Creating a list to store the results
results = []

# Looping through each model, fitting it, and evaluating its performance
for model_name, model in models.items():
    # Fitting the model
    model.fit(X_train_sample, y_train_sample_log)
    
    # Making predictions on the test set
    y_pred_log = model.predict(X_test_processed)
    
    # Inverse transforming the predictions to get them back to original scale
    y_pred = np.expm1(y_pred_log)
    
    # Calculating evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Results for {model_name}:")
    print(f"  Mean Absolute Error (MAE): {mae:,.0f}")
    print(f"  R-squared (R²): {r2:.4f}")
    
    # Storing the results
    results.append({
        'Model': model_name,
        'MAE': mae,
        'R2 Score': r2
    })
    
# Converting the results to a DataFrame for better visualization   
results_df = pd.DataFrame(results)

print("\n\n--- Model Performance Results (from sample) ---")
display(results_df.sort_values(by='MAE'))

Results for Linear Regression:
  Mean Absolute Error (MAE): 4,567
  R-squared (R²): 0.8825
Results for Random Forest:
  Mean Absolute Error (MAE): 1,130
  R-squared (R²): 0.9843
Results for XGBoost:
  Mean Absolute Error (MAE): 2,108
  R-squared (R²): 0.9711
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034928 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 375
[LightGBM] [Info] Number of data points in the train set: 150000, number of used features: 37
[LightGBM] [Info] Start training from score 9.332795




Results for LightGBM:
  Mean Absolute Error (MAE): 2,469
  R-squared (R²): 0.9623


--- Model Performance Results (from sample) ---


Unnamed: 0,Model,MAE,R2 Score
1,Random Forest,1130.153957,0.984343
2,XGBoost,2108.286865,0.971106
3,LightGBM,2468.553142,0.962289
0,Linear Regression,4566.511716,0.882534


**4. Model Tuning**

In [27]:
'''# Hyperparameter tuning for the best model - Random Forest
# Providing the range of values for hyperparameters to be tuned
param_grid = {
    'n_estimators': [100, 200, 300, None],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', 1.0]
}

# Setting up the RandomizedSearchCV for hyperparameter tuning
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Performing Randomized Search with cross-validation
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=50,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fitting the RandomizedSearchCV on the sampled training data
random_search.fit(X_train_sample, y_train_sample_log)

# Getting the best model from the random search
best_rf_model = random_search.best_estimator_

# Evaluating the new, tuned model on the full test set
y_pred_log_tuned = best_rf_model.predict(X_test_processed)
y_pred_tuned = np.expm1(y_pred_log_tuned)

# Calculating evaluation metrics for the tuned model
mae_tuned = mean_absolute_error(y_test, y_pred_tuned)
r2_tuned = r2_score(y_test, y_pred_tuned)

print("\n--- Evaluation of Tuned Random Forest Model ---")
print(f"  Tuned Mean Absolute Error (MAE): {mae_tuned:,.0f}")
print(f"  Tuned R-squared (R²): {r2_tuned:.4f}")

# Compare with the default model's performance
default_mae = results_df[results_df['Model'] == 'Random Forest']['MAE'].values[0]
print(f"\n  Default MAE for comparison: {default_mae:,.0f}")

if mae_tuned < default_mae:
    print("\nSuccess! The tuned model is more accurate than the default model.")
else:
    print("\nThe tuned model did not improve upon the default model's performance.")'''

'# Hyperparameter tuning for the best model - Random Forest\n# Providing the range of values for hyperparameters to be tuned\nparam_grid = {\n    \'n_estimators\': [100, 200, 300, None],\n    \'max_depth\': [10, 20, 30, None],\n    \'min_samples_split\': [2, 5, 10],\n    \'min_samples_leaf\': [1, 2, 4],\n    \'max_features\': [\'sqrt\', \'log2\', 1.0]\n}\n\n# Setting up the RandomizedSearchCV for hyperparameter tuning\nrf_model = RandomForestRegressor(random_state=42, n_jobs=-1)\n\n# Performing Randomized Search with cross-validation\nrandom_search = RandomizedSearchCV(\n    estimator=rf_model,\n    param_distributions=param_grid,\n    n_iter=50,\n    cv=3,\n    verbose=2,\n    random_state=42,\n    n_jobs=-1\n)\n\n# Fitting the RandomizedSearchCV on the sampled training data\nrandom_search.fit(X_train_sample, y_train_sample_log)\n\n# Getting the best model from the random search\nbest_rf_model = random_search.best_estimator_\n\n# Evaluating the new, tuned model on the full test set\ny

**5. Model Training of Full Dataset**

In [28]:
# Model training on the full training set
champ_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Fitting the champion model on the entire processes training set
champ_model.fit(X_train_transformed, np.log1p(y_train))

print("\nChampion model trained on the full training set.")


Champion model trained on the full training set.


In [29]:
# Saving the champion model and the preprocessor
joblib.dump(champ_model, 'flight_price_model.joblib')
joblib.dump(preprocessor, 'preprocessor.joblib')

print("\nChampion model and preprocessor have been saved to files:")
print("- flight_price_model.joblib")
print("- preprocessor.joblib")


Champion model and preprocessor have been saved to files:
- flight_price_model.joblib
- preprocessor.joblib
