In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [2]:
# Load the pre-processed flight price table from the working directory
df = pd.read_csv(filepath_or_buffer='processed_flight_prices.csv')

In [3]:
# Preview the first three rows of the freshly loaded table
df.head(n=3)

Unnamed: 0,Flight Date,Airline,Flight Number,Class,Origin,Departure Time,Destination,Arrival Time,Duration (Minutes),Price (₹),Number of Stops,Date,Month,Year,Day
0,2023-06-26,SpiceJet,SG-8709,economy,Delhi,18:55,Mumbai,21:05,130,6013,0,26,6,2023,Monday
1,2023-06-26,SpiceJet,SG-8157,economy,Delhi,06:20,Mumbai,08:40,140,6013,0,26,6,2023,Monday
2,2023-06-26,AirAsia,I5-764,economy,Delhi,04:25,Mumbai,06:35,130,6016,0,26,6,2023,Monday


In [4]:
# Parse 'Flight Date' into datetime values; unparseable entries become NaT
flight_dates = pd.to_datetime(df['Flight Date'], errors='coerce')
df['Flight Date'] = flight_dates

In [5]:
# Drop Flight Number (not useful for prediction).
# errors='ignore' keeps this cell re-runnable: on a second execution the column
# is already gone and the call is a no-op instead of raising KeyError. This
# matches how the time columns are dropped further down.
df = df.drop(columns=['Flight Number'], errors='ignore')

In [6]:
# Derive an integer hour-of-day column from each "HH:MM" time column.
# Values that do not match the format are coerced to NaT, which yields a missing hour.
for time_col, hour_col in (
    ("Departure Time", "Departure Hour"),
    ("Arrival Time", "Arrival Hour"),
):
    parsed_times = pd.to_datetime(df[time_col], format="%H:%M", errors="coerce")
    df[hour_col] = parsed_times.dt.hour

In [7]:
# The raw time strings are no longer needed once the hour columns exist.
# Missing columns are ignored, so the cell can be executed repeatedly.
df = df.drop(columns=["Departure Time", "Arrival Time"], errors="ignore")

In [8]:
# Integer-encode 'Class'. LabelEncoder numbers the distinct values in sorted
# order, so the resulting codes identify a class rather than rank it.
label_encoder = LabelEncoder().fit(df['Class'])
df['Class'] = label_encoder.transform(df['Class'])

In [9]:
# One-hot encode the nominal columns. drop_first=True omits the first level of
# each column, which then acts as the implicit baseline category.
categorical_cols = ["Airline", "Day", "Origin", "Destination"]
df = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

print("✅ Encoding Complete!")

✅ Encoding Complete!


In [10]:
# Preview the first three rows after encoding
df.head(n=3)

Unnamed: 0,Flight Date,Class,Duration (Minutes),Price (₹),Number of Stops,Date,Month,Year,Departure Hour,Arrival Hour,...,Origin_Chennai,Origin_Delhi,Origin_Hyderabad,Origin_Kolkata,Origin_Mumbai,Destination_Chennai,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_Mumbai
0,2023-06-26,1,130,6013,0,26,6,2023,18,21,...,False,True,False,False,False,False,False,False,False,True
1,2023-06-26,1,140,6013,0,26,6,2023,6,8,...,False,True,False,False,False,False,False,False,False,True
2,2023-06-26,1,130,6016,0,26,6,2023,4,6,...,False,True,False,False,False,False,False,False,False,True


### Feature Scaling & Splitting

In [12]:
# Target: the ticket price column.
y = df['Price (₹)']
# Features: every other column except 'Flight Date'.
X = df.drop(['Price (₹)', 'Flight Date'], axis=1)

In [13]:
# Columns that will be standardized; list order is the order handed to the scaler
numerical_cols = [
    "Number of Stops",
    "Duration (Minutes)",
    "Date",
    "Month",
    "Year",
    "Departure Hour",
    "Arrival Hour",
]

In [14]:
# Split the feature frame in two: the columns to be standardized, and
# everything else (the encoded 'Class' column plus the one-hot indicators)
X_numerical = X.loc[:, numerical_cols]
X_categorical = X.drop(numerical_cols, axis=1)

In [15]:
# Standardize only numerical features
scaler = StandardScaler()
X_numerical_scaled = scaler.fit_transform(X_numerical)

In [16]:
# Recombine scaled numerical features with categorical features
X_scaled = np.hstack((X_numerical_scaled, X_categorical))

In [17]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("✅ Train-Test Split Complete!")

✅ Train-Test Split Complete!


### Model Training

In [19]:
# Candidate regressors, keyed by the label shown in the results table
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(
        n_estimators=100,
        max_depth=15,
        min_samples_split=10,
        random_state=42,
    ),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42,
    ),
}

results = {}

# Fit every candidate on the training split and score it on the held-out split
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    results[name] = dict(zip(("MAE", "RMSE", "R² Score"), (mae, rmse, r2)))


In [20]:
# Tabulate the scores: one row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
print("✅ Model Training Complete! Results:")
print(results_df)

✅ Model Training Complete! Results:
                           MAE         RMSE  R² Score
Linear Regression  4592.626051  6951.928228  0.909456
Random Forest      1777.790323  3373.447094  0.978680
Gradient Boosting  3038.886371  5074.414646  0.951759


### Hyperparameter Optimization

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [23]:
# Candidate values sampled by the randomized search for the random forest
param_dist = dict(
    n_estimators=[50, 100, 150],
    max_depth=[10, 15],
    min_samples_split=[10, 15],
    min_samples_leaf=[5, 10],
)

# Randomized Search: try 10 sampled parameter combinations with 5-fold
# cross-validation, ranking them by (negated) mean absolute error
rf_candidate = RandomForestRegressor(random_state=42)
search_options = dict(
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(rf_candidate, **search_options)

# Run the search on the training split and report the winning combination
random_search.fit(X_train, y_train)
print("✅ Best Hyperparameters:", random_search.best_params_)

✅ Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_depth': 15}


In [24]:
# Take the estimator exposed by the finished search (no further training
# happens in this cell) and predict on the held-out split
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Test Performance Evaluation
final_mae, final_rmse, final_r2 = (
    mean_absolute_error(y_test, y_pred),
    np.sqrt(mean_squared_error(y_test, y_pred)),
    r2_score(y_test, y_pred),
)
print(f"🎯 Final Model Performance: MAE = {final_mae:.2f}, RMSE = {final_rmse:.2f}, R² Score = {final_r2:.2f}")

🎯 Final Model Performance: MAE = 1791.72, RMSE = 3396.56, R² Score = 0.98


In [25]:
# Score the tuned model on the data it was fitted on; comparing these numbers
# with the test-split scores shows how large the generalization gap is
y_train_pred = best_model.predict(X_train)
train_mae, train_rmse, train_r2 = (
    mean_absolute_error(y_train, y_train_pred),
    np.sqrt(mean_squared_error(y_train, y_train_pred)),
    r2_score(y_train, y_train_pred),
)
print(f"🏋️‍♂️ Training Performance: MAE = {train_mae:.2f}, RMSE = {train_rmse:.2f}, R² Score = {train_r2:.2f}")

🏋️‍♂️ Training Performance: MAE = 1678.20, RMSE = 3174.02, R² Score = 0.98


### Save the Model

In [27]:
# Save the trained model
joblib.dump(best_model, "optimized_flight_model.pkl")

# Save feature column names in the order the model was actually trained on.
# The training matrix was assembled as [numerical_cols | columns of X_categorical],
# which is not the column order of X, so list(X.columns) would mislabel the
# model's inputs for anyone aligning new data against this file.
model_feature_order = list(numerical_cols) + list(X_categorical.columns)
joblib.dump(model_feature_order, "flight_features.pkl")

# Persist the fitted 'Class' encoder as well, so inference code can reproduce
# the same integer codes instead of re-deriving them
joblib.dump(label_encoder, "flight_class_encoder.pkl")

print("✅ Model saved successfully as optimized_flight_model.pkl")
print("👉 Feature Columns saved as flight_features.pkl")

✅ Model saved successfully as optimized_flight_model.pkl
👉 Feature Columns saved as flight_features.pkl


In [28]:
# Persist the fitted StandardScaler so new inputs can be scaled the same way
joblib.dump(value=scaler, filename="flight_scaler.pkl")
print("✅ Scaler saved successfully as flight_scaler.pkl")

✅ Scaler saved successfully as flight_scaler.pkl
