In [None]:
# 1. Imports
# ==============================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif,RFE
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pickle

In [None]:
# 2. Load Dataset
# ==============================================
dataset = pd.read_csv("/content/flight_prediction.csv")

In [None]:
x = dataset[['duration', 'days_left', 'airline', 'source_city',
             'departure_time', 'stops', 'arrival_time',
             'destination_city', 'class']]
y = dataset['price']

In [None]:
# 3. Convert Target to Classes
# ==============================================
y_class = pd.qcut(y, q=3, labels=["Low", "Medium", "High"])

In [None]:
# 4. Encode Categorical Variables
# ==============================================
x_encoded = pd.get_dummies(x, drop_first=True)

In [None]:
# 5. Feature Scaling
# ==============================================
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_encoded)


In [None]:
# 6. Train-Test Split
# ==============================================
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled, y_class, test_size=0.3, random_state=0
)

In [None]:
# 7a. Feature Selection using SelectKBest
# ==============================================
selector_kbest = SelectKBest(score_func=f_classif, k=10)
x_train_kbest = selector_kbest.fit_transform(x_train, y_train)
x_test_kbest = selector_kbest.transform(x_test)
selected_features_kbest = x_encoded.columns[selector_kbest.get_support()]
print("\nSelected Features using SelectKBest:", list(selected_features_kbest))


Selected Features using SelectKBest: ['duration', 'days_left', 'airline_Air_India', 'airline_GO_FIRST', 'airline_Indigo', 'airline_SpiceJet', 'airline_Vistara', 'stops_two_or_more', 'stops_zero', 'class_Economy']


In [None]:
# 7b. Feature Selection using RFE
# ==============================================
rfe_selector = RFE(estimator=LogisticRegression(max_iter=100), n_features_to_select=10)
x_train_rfe = rfe_selector.fit_transform(x_train, y_train)
x_test_rfe = rfe_selector.transform(x_test)
selected_features_rfe = x_encoded.columns[rfe_selector.get_support()]
print("\nSelected Features using RFE:", list(selected_features_rfe))


Selected Features using RFE: ['days_left', 'airline_Air_India', 'airline_GO_FIRST', 'airline_Indigo', 'airline_Vistara', 'source_city_Kolkata', 'stops_two_or_more', 'stops_zero', 'destination_city_Kolkata', 'class_Economy']


In [None]:
# 8a. Logistic Regression (SelectKBest)
# ==============================================
log_reg_kbest = LogisticRegression(max_iter=500)
log_reg_kbest.fit(x_train_kbest, y_train)
y_pred_log_kbest = log_reg_kbest.predict(x_test_kbest)

print("\n Logistic Regression (SelectKBest) Results")
print("Accuracy:", accuracy_score(y_test, y_pred_log_kbest))
print("Precision:", precision_score(y_test, y_pred_log_kbest, average='macro'))
print("Recall:", recall_score(y_test, y_pred_log_kbest, average='macro'))
print("F1 Score:", f1_score(y_test, y_pred_log_kbest, average='macro'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_kbest))


 Logistic Regression (SelectKBest) Results
Accuracy: 0.7867756480021323
Precision: 0.7933603628300103
Recall: 0.7867931409826432
F1 Score: 0.7895568859001875
Confusion Matrix:
 [[28142    88  1778]
 [    0 21466  8701]
 [   53  8580 21238]]


In [None]:
# 8b. Logistic Regression (RFE)
# ==============================================
log_reg_rfe = LogisticRegression(max_iter=500)
log_reg_rfe.fit(x_train_rfe, y_train)
y_pred_log_rfe = log_reg_rfe.predict(x_test_rfe)

print("\n Logistic Regression (RFE) Results")
print("Accuracy:", accuracy_score(y_test, y_pred_log_rfe))
print("Precision:", precision_score(y_test, y_pred_log_rfe, average='macro'))
print("Recall:", recall_score(y_test, y_pred_log_rfe, average='macro'))
print("F1 Score:", f1_score(y_test, y_pred_log_rfe, average='macro'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_rfe))


 Logistic Regression (RFE) Results
Accuracy: 0.8041556537769584
Precision: 0.8103418483972513
Recall: 0.8041852386810359
F1 Score: 0.8066749434970818
Confusion Matrix:
 [[28141    88  1779]
 [    1 22123  8043]
 [   69  7655 22147]]


In [None]:
# 9. KNN Classifier (using RFE)
# ==============================================
knn_params = {'n_neighbors': [3 ]}
knn_rfe = GridSearchCV(KNeighborsClassifier(), knn_params,
                       cv=2, scoring='accuracy', n_jobs=-1, verbose=1)
knn_rfe.fit(x_train_rfe, y_train)
y_pred_knn_rfe = knn_rfe.predict(x_test_rfe)

print("\nKNN (RFE) Results")
print("Best Params:", knn_rfe.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_knn_rfe))
print("F1 Score:", f1_score(y_test, y_pred_knn_rfe, average='macro'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn_rfe))

Fitting 2 folds for each of 1 candidates, totalling 2 fits

KNN (RFE) Results
Best Params: {'n_neighbors': 3}
Accuracy: 0.8139728583168603
F1 Score: 0.8147423214984434
Confusion Matrix:
 [[28570   110  1328]
 [   73 22934  7160]
 [  723  7357 21791]]


In [None]:
# 10. Random Forest Classifier (RFE)
# ==============================================
rf_params = {'n_estimators': [100], 'max_depth': [None, 10]}
rf_rfe = GridSearchCV(RandomForestClassifier(random_state=0),
                      rf_params, cv=2, scoring='accuracy',
                      n_jobs=-1, verbose=1)
rf_rfe.fit(x_train_rfe, y_train)
y_pred_rf_rfe = rf_rfe.predict(x_test_rfe)

print("\n Random Forest (RFE) Results")
print("Best Params:", rf_rfe.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_rf_rfe))
print("F1 Score:", f1_score(y_test, y_pred_rf_rfe, average='macro'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf_rfe))

Fitting 2 folds for each of 2 candidates, totalling 4 fits

 Random Forest (RFE) Results
Best Params: {'max_depth': 10, 'n_estimators': 100}
Accuracy: 0.8432356795415676
F1 Score: 0.8422633192244513
Confusion Matrix:
 [[28348    82  1578]
 [    0 27379  2788]
 [  148  9520 20203]]


In [None]:
# Random Forest Classifier (SelectKBest)
# ==============================================
rf_params = {'n_estimators': [100], 'max_depth': [None, 10]}
rf_kbest = GridSearchCV(RandomForestClassifier(random_state=0),
                        rf_params, cv=2, scoring='accuracy',
                        n_jobs=-1, verbose=1)
rf_kbest.fit(x_train_kbest, y_train)
y_pred_rf_kbest = rf_kbest.predict(x_test_kbest)

print("\n Random Forest (SelectKBest) Results")
print("Best Params:", rf_kbest.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_rf_kbest))
print("F1 Score:", f1_score(y_test, y_pred_rf_kbest, average='macro'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf_kbest))

Fitting 2 folds for each of 2 candidates, totalling 4 fits

 Random Forest (SelectKBest) Results
Best Params: {'max_depth': 10, 'n_estimators': 100}
Accuracy: 0.8300757390666993
F1 Score: 0.8316743844630737
Confusion Matrix:
 [[28381    81  1546]
 [    0 23845  6322]
 [  171  7181 22519]]


In [None]:
# Save the Best SelectKBest Random Forest Model
# ==============================================
best_model_kbest = rf_kbest.best_estimator_
with open("Finalized_FlightPrice_RF_SelectKBest_Classifier.pkl", "wb") as f:
    pickle.dump(best_model_kbest, f)

print("\nFinal Random Forest (SelectKBest) Classifier saved as Finalized_FlightPrice_RF_SelectKBest_Classifier.pkl")


Final Random Forest (SelectKBest) Classifier saved as Finalized_FlightPrice_RF_SelectKBest_Classifier.pkl


In [None]:
#KNN (SelectKBest)
# GridSearch parameters
knn_params = {'n_neighbors': [3 ]}
# Train KNN using SelectKBest features
knn_kbest = GridSearchCV(KNeighborsClassifier(),
                         knn_params,
                         cv=2,
                         scoring='accuracy',
                         n_jobs=-1,
                         verbose=1)
knn_kbest.fit(x_train_kbest, y_train)
# Predictions
y_pred_knn_kbest = knn_kbest.predict(x_test_kbest)
# Results
print("\n KNN Classifier (SelectKBest) Results")
print("Best Params:", knn_kbest.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_knn_kbest))
print("F1 Score:", f1_score(y_test, y_pred_knn_kbest, average='macro'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn_kbest))

Fitting 2 folds for each of 1 candidates, totalling 2 fits

 KNN Classifier (SelectKBest) Results
Best Params: {'n_neighbors': 3}
Accuracy: 0.796703906892033
F1 Score: 0.7974526274261883
Confusion Matrix:
 [[28573    82  1353]
 [   80 21988  8099]
 [  845  7847 21179]]


In [None]:
# Save KNN model
import pickle
best_knn_kbest = knn_kbest.best_estimator_
with open("Finalized_FlightPrice_KNN_SelectKBest_Classifier.pkl", "wb") as f:pickle.dump(best_knn_kbest, f)
print("\n KNN (SelectKBest) Classifier saved as Finalized_FlightPrice_KNN_SelectKBest_Classifier.pkl")


 KNN (SelectKBest) Classifier saved as Finalized_FlightPrice_KNN_SelectKBest_Classifier.pkl


In [None]:
# SVM Classifier
# ==============================================
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [None]:
# GridSearch parameters for SVM
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

In [None]:
# 1. SVM with SelectKBest Features
# ==============================================
# GridSearch parameters for SVM
svm_params = {
    'C': [0.1, 1],
    'kernel': ['linear'],
    'gamma': ['scale']}
# NOTE: Make sure to run the cell defining x_train_kbest and y_train first (cell QYKrEUxcU7m0)
svm_kbest = GridSearchCV(SVC(),svm_params,cv=2,scoring='accuracy',n_jobs=-1,verbose=1)
svm_kbest.fit(x_train_kbest, y_train)
y_pred_svm_kbest = svm_kbest.predict(x_test_kbest)

print("\n SVM Classifier (SelectKBest) Results")
print("Best Params:", svm_kbest.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_svm_kbest))
print("F1 Score:", f1_score(y_test, y_pred_svm_kbest, average='macro'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm_kbest))

NameError: name 'x_train_kbest' is not defined

In [None]:
# Save model
with open("Finalized_FlightPrice_SVM_SelectKBest_Classifier.pkl", "wb") as f:pickle.dump(svm_kbest.best_estimator_, f)
print("SVM (SelectKBest) Classifier saved as Finalized_FlightPrice_SVM_SelectKBest_Classifier.pkl")

In [None]:
# 2. SVM with RFE Features
# ==============================================
svm_rfe = GridSearchCV(SVC(),
                       svm_params,
                       cv=3,
                       scoring='accuracy',
                       n_jobs=-1,
                       verbose=1)
svm_rfe.fit(x_train_rfe, y_train)
y_pred_svm_rfe = svm_rfe.predict(x_test_rfe)

In [None]:
print("\n SVM Classifier (RFE) Results")
print("Best Params:", svm_rfe.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_svm_rfe))
print("F1 Score:", f1_score(y_test, y_pred_svm_rfe, average='macro'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm_rfe))

# Save model
with open("Finalized_FlightPrice_SVM_RFE_Classifier.pkl", "wb") as f:pickle.dump(svm_rfe.best_estimator_, f)
print(" SVM (RFE) Classifier saved as Finalized_FlightPrice_SVM_RFE_Classifier.pkl")