Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder

df = pd.read_csv('Clean_Dataset.csv')
label = LabelEncoder()
target = TargetEncoder()

df['airline'] = target.fit_transform(df['airline'],df['price'])
df['source_city'] = target.fit_transform(df['source_city'], df['price'])
df['destination_city'] = target.fit_transform(df['destination_city'],df['price'])
df["departure_time"]= target.fit_transform(df['departure_time'], df['price'])
df["arrival_time"]= target.fit_transform(df['arrival_time'], df['price'])
df['stops'] = df['stops'].map({'zero':0, 'one':1, 'two_or_more':2})
df['class'] = df['class'].map({'Economy':0, 'Business':1})

new_df = df.drop(["Unnamed: 0", "flight"], axis=1)
new_df = new_df.astype('float32')
print(new_df)


             airline   source_city  departure_time  stops  arrival_time  \
0        6179.278809  18951.326172    21232.361328    0.0  21586.757812   
1        6179.278809  18951.326172    20370.675781    0.0  22231.076172   
2        4091.072754  18951.326172    20370.675781    0.0  14993.139648   
3       30396.537109  18951.326172    21630.759766    0.0  18494.599609   
4       30396.537109  18951.326172    21630.759766    0.0  22231.076172   
...              ...           ...             ...    ...           ...   
300148  30396.537109  21995.339844    21630.759766    1.0  23044.371094   
300149  30396.537109  21995.339844    18179.203125    1.0  21586.757812   
300150  30396.537109  21995.339844    20370.675781    1.0  21586.757812   
300151  30396.537109  21995.339844    20370.675781    1.0  23044.371094   
300152  30396.537109  21995.339844    21630.759766    1.0  23044.371094   

        destination_city  class  duration  days_left    price  
0           21372.529297    0.0    

Splitting Data for Future Prediction

In [2]:
future_df = new_df.tail(15)
training_df = new_df.iloc[:-15]


Random Selection

In [4]:
selected_features = np.random.choice(training_df.columns[:-1], size = 5, replace=False)
x = training_df[selected_features] # Assign selected columns to x 
print(x.columns)
print(selected_features)
print(training_df[selected_features].shape)


Index(['stops', 'airline', 'class', 'source_city', 'days_left'], dtype='object')
['stops' 'airline' 'class' 'source_city' 'days_left']
(300138, 5)


Dataset Splitting

In [5]:
from sklearn.model_selection import train_test_split  
y = training_df['price']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

Model 1: XGBOOST

In [6]:
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import matplotlib.pyplot as plt

# Initialize the XGBoost regressor
xgb_model = XGBRegressor(
    n_estimators=100,       # Number of trees
    max_depth=6,            # Maximum depth of trees
    learning_rate=0.1,      # Learning rate
    subsample=0.8,          # Subsampling ratio
    colsample_bytree=0.8,   # Feature subsampling ratio per tree
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
xg_acc = xgb_model.score(X_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Accuracy: {xg_acc:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")


Accuracy: 0.9454
RMSE: 5290.6157
R²: 0.9454


Model 2: Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sn 

random_forest = RandomForestClassifier(n_estimators=5, max_depth=5,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42)
random_forest.fit(X_train, y_train)
random_accuracy = random_forest.score(X_test, y_test)
print(f"Accuracy: {random_accuracy:.4f}")


Accuracy: 0.1148


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

naive = GaussianNB()
naive.fit(X_train,y_train)
y_prediction = naive.predict(X_test)
accuracy = accuracy_score(X_test,y_test)
print(f"accuracy = : {accuracy:.4f}")
