In [7]:
import pandas as pd
import numpy as np
from datetime import datetime

# Read the data
df = pd.read_csv('processed_airline_dataset.csv')

# Convert date columns to datetime and extract features
df['Date of Booking'] = pd.to_datetime(df['Date of Booking'], format='%d/%m/%Y')
df['Date of Journey'] = pd.to_datetime(df['Date of Journey'], format='%d/%m/%Y')

# Convert Duration to minutes
def convert_duration(duration):
    hours, minutes = 0, 0
    if 'h' in duration:
        hours = int(duration.split('h')[0])
        if 'm' in duration:
            minutes = int(duration.split('h')[1].replace('m', '').strip())
    elif 'm' in duration:
        minutes = int(duration.replace('m', '').strip())
    return hours * 60 + minutes

df['Duration'] = df['Duration'].apply(convert_duration)

# Create label encoders for categorical columns
from sklearn.preprocessing import LabelEncoder

categorical_columns = ['Company', 'Class', 'Source', 'Destination', 
                      'DayOfWeek_Journey', 'TravelSeason', 
                      'Departure_TimeOfDay', 'Arrival_TimeOfDay']

for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])

# Drop unnecessary columns
columns_to_drop = ['Date of Booking', 'Date of Journey', 'Airline-Class', 
                   'FlightNumber', 'DepartureTime', 'ArrivalTime', 
                   'DayOfWeek_Booking', 'Possible Routes']
df = df.drop(columns_to_drop, axis=1)

print("Data shape after preprocessing:", df.shape)
print("\
Columns after preprocessing:")
print(df.columns.tolist())
print("\
Sample of processed data:")
print(df.head())

Data shape after preprocessing: (445366, 20)
Columns after preprocessing:
['Duration', 'Total Stops', 'Price', 'Company', 'Class', 'Source', 'Destination', 'DayOfWeek_Journey', 'Month_Journey', 'Weekend', 'BookingLeadTime', 'DepartureHour', 'ArrivalHour', 'IsEconomyClass', 'IsBusinessClass', 'IsFirstClass', 'TravelSeason', 'Distance', 'Departure_TimeOfDay', 'Arrival_TimeOfDay']
Sample of processed data:
   Duration  Total Stops  Price  Company  Class  Source  Destination  \
0       125          0.0   5335        6      1       3            6   
1       140          0.0   5899        5      1       3            6   
2       130          0.0   5801        4      1       3            6   
3       125          0.0   5794        6      1       3            6   
4       130          0.0   5955        0      1       3            6   

   DayOfWeek_Journey  Month_Journey  Weekend  BookingLeadTime  DepartureHour  \
0                  1              1        0                1             20   


In [8]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Prepare features and target
X = df.drop('Price', axis=1)
y = df['Price']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize XGBoost model with optimized parameters
xgb_model = xgb.XGBRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=10,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',
    random_state=42
)

# Train the model
eval_set = [(X_test_scaled, y_test)]
xgb_model.fit(X_train_scaled, y_train, eval_set=eval_set)

# Make predictions
y_pred = xgb_model.predict(X_test_scaled)

# Calculate metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\
Model Performance Metrics:")
print("R-squared score:", r2)
print("RMSE:", rmse)

# Calculate accuracy within different error margins
def calculate_accuracy_within_margin(y_true, y_pred, margin):
    within_margin = np.abs(y_true - y_pred) <= (y_true * margin)
    return np.mean(within_margin) * 100

margins = [0.01, 0.02, 0.05, 0.10]
print("\
Accuracy within different error margins:")
for margin in margins:
    accuracy = calculate_accuracy_within_margin(y_test, y_pred, margin)
    print(f"Accuracy within {margin*100}% margin: {accuracy:.2f}%")

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\
Top 10 Most Important Features:")
print(feature_importance.head(10))

[0]	validation_0-rmse:30529.80408
[1]	validation_0-rmse:30238.96461
[2]	validation_0-rmse:29951.01826
[3]	validation_0-rmse:29665.67546
[4]	validation_0-rmse:29383.55302
[5]	validation_0-rmse:29103.52650
[6]	validation_0-rmse:28829.50370
[7]	validation_0-rmse:28555.55007
[8]	validation_0-rmse:28285.68379
[9]	validation_0-rmse:28017.05812
[10]	validation_0-rmse:27752.21571
[11]	validation_0-rmse:27489.68152
[12]	validation_0-rmse:27230.18828
[13]	validation_0-rmse:26972.37709
[14]	validation_0-rmse:26720.50287
[15]	validation_0-rmse:26468.18255
[16]	validation_0-rmse:26219.04056
[17]	validation_0-rmse:25975.72719
[18]	validation_0-rmse:25731.69857
[19]	validation_0-rmse:25489.37789
[20]	validation_0-rmse:25250.88660
[21]	validation_0-rmse:25016.60713
[22]	validation_0-rmse:24784.47698
[23]	validation_0-rmse:24553.08523
[24]	validation_0-rmse:24323.46376
[25]	validation_0-rmse:24096.30000
[26]	validation_0-rmse:23873.02359
[27]	validation_0-rmse:23650.59233
[28]	validation_0-rmse:23430.4

Hi! I see you have an airline dataset loaded. I can help you analyze flight pricing data and build predictive models. The dataset contains information about flight bookings including:

Flight details (airline, duration, stops)
Pricing
Route information
Booking and journey dates
Class types
Seasonal factors
I've initialized an XGBoost model to predict flight prices that achieved:

R² score of 0.965, indicating the model explains 96.5% of price variation
RMSE of ₹3,824, showing average prediction error
Price predictions within 5% margin of actual price for 41% of flights
Price predictions within 10% margin for 64% of flights
The most important factors affecting price were:

Economy class status
Business class status
Ticket class
First class status
Number of stops

In [9]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Define a function to train and evaluate models
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return r2, rmse

# Random Forest Model
rf_model = RandomForestRegressor(n_estimators=500, max_depth=15, random_state=42)
rf_r2, rf_rmse = train_and_evaluate_model(rf_model, X_train_scaled, X_test_scaled, y_train, y_test)

# Gradient Boosting Model
gb_model = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05, max_depth=10, random_state=42)
gb_r2, gb_rmse = train_and_evaluate_model(gb_model, X_train_scaled, X_test_scaled, y_train, y_test)



# Print results
print("Random Forest: R2 =", rf_r2, ", RMSE =", rf_rmse)
print("Gradient Boosting: R2 =", gb_r2, ", RMSE =", gb_rmse)


Random Forest: R2 = 0.9465376144659152 , RMSE = 4741.267611013511
Gradient Boosting: R2 = 0.96535059901992 , RMSE = 3816.96491163374
