# Importing the libraries:

In [45]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score

# Importing the dataset:

In [46]:
dataset = pd.read_csv('../data/airlines_flights_data.csv')

In [47]:
dataset.head()

Unnamed: 0,index,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


# Check Missing values

In [48]:
dataset.isna().sum()

index               0
airline             0
flight              0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

# Check Duplicates

In [49]:
dataset.duplicated().sum()

np.int64(0)

In [231]:
# dataset = dataset.drop_duplicates().reset_index(drop=True)


# Dropping irrelevant features

In [50]:
dataset_selected = dataset.drop(columns=['index', 'flight'], axis=1)

In [51]:
dataset_selected.head()

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [52]:
dataset_selected.shape

(300153, 10)

In [None]:
# cols_to_encode = ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
# cols_to_scale = ['duration', 'days_left']

# # Create copies to avoid modifying original
# df = dataset_selected.copy()

# # Step 1: Label Encode categorical columns
# le_dict = {}
# for col in cols_to_encode:
#     le = LabelEncoder()
#     df[col] = le.fit_transform(df[col])
#     le_dict[col] = le  # Save for inverse transform later

# # Step 2: Scale numerical columns
# scaler = StandardScaler()
# df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

# # Step 3: Convert to NumPy array for model input
# X_encoded = df.values




In [None]:
# print(X_encoded)

[[ 4.00000000e+00  2.00000000e+00  2.00000000e+00 ... -1.39753079e+00
  -1.84387477e+00  5.95300000e+03]
 [ 4.00000000e+00  2.00000000e+00  1.00000000e+00 ... -1.37528380e+00
  -1.84387477e+00  5.95300000e+03]
 [ 0.00000000e+00  2.00000000e+00  1.00000000e+00 ... -1.39753079e+00
  -1.84387477e+00  5.95600000e+03]
 ...
 [ 5.00000000e+00  1.00000000e+00  1.00000000e+00 ...  2.23718366e-01
   1.69569214e+00  7.90990000e+04]
 [ 5.00000000e+00  1.00000000e+00  1.00000000e+00 ... -3.08818877e-01
   1.69569214e+00  8.15850000e+04]
 [ 5.00000000e+00  1.00000000e+00  4.00000000e+00 ... -2.97695384e-01
   1.69569214e+00  8.15850000e+04]]


In [None]:
# X_encoded.shape

(300153, 10)

In [None]:
# y = dataset.iloc[:, -1].values

In [None]:
# print(y)

[ 5953  5953  5956 ... 79099 81585 81585]


In [None]:
# print(dataset['Gender'].dtype, dataset.columns.get_loc('Gender'))
# print(dataset['Blood Type'].dtype, dataset.columns.get_loc('Blood Type'))
# print(dataset['Medical Condition'].dtype, dataset.columns.get_loc('Medical Condition'))
# print(dataset['Date of Admission'].dtype, dataset.columns.get_loc('Date of Admission'))
# print(dataset['Discharge Date'].dtype, dataset.columns.get_loc('Discharge Date'))
# print(dataset['Admission Type'].dtype, dataset.columns.get_loc('Admission Type'))
# print(dataset['Medication'].dtype, dataset.columns.get_loc('Medication'))

# print(dataset['Room Number'].dtype, dataset.columns.get_loc('Room Number'))



object 2
object 3
object 4
object 5
object 12
object 11
object 13
int64 10


In [57]:
# Creating Feature out of the Dataset:
X = dataset_selected.iloc[:, 0:-1].values
y = dataset_selected.iloc[:, -1].values

In [58]:
print(X)

[['SpiceJet' 'Delhi' 'Evening' ... 'Economy' 2.17 1]
 ['SpiceJet' 'Delhi' 'Early_Morning' ... 'Economy' 2.33 1]
 ['AirAsia' 'Delhi' 'Early_Morning' ... 'Economy' 2.17 1]
 ...
 ['Vistara' 'Chennai' 'Early_Morning' ... 'Business' 13.83 49]
 ['Vistara' 'Chennai' 'Early_Morning' ... 'Business' 10.0 49]
 ['Vistara' 'Chennai' 'Morning' ... 'Business' 10.08 49]]


# Encoding the Variable

In [63]:
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(sparse_output=False), [0, 1, 2, 3, 4, 5, 6]),
    ],
    remainder='passthrough'
)

X = np.array(ct.fit_transform(X))

In [64]:
print(X)

[[1.0 0.0 1.0 ... 0.6725755800138707 2.17 1]
 [1.0 0.0 1.0 ... 0.6725755800138707 2.33 1]
 [0.0 1.0 1.0 ... 0.6725755800138707 2.17 1]
 ...
 [1.0 0.0 1.0 ... -1.486821748683203 13.83 49]
 [1.0 0.0 1.0 ... -1.486821748683203 10.0 49]
 [1.0 0.0 1.0 ... -1.486821748683203 10.08 49]]


In [65]:
sc = StandardScaler()

X[:, -2:] = sc.fit_transform(X[:, -2:])

In [66]:
print(X)

[[1.0 0.0 1.0 ... 0.6725755800138707 -1.3975307863927666
  -1.8438747706865644]
 [1.0 0.0 1.0 ... 0.6725755800138707 -1.3752837997444147
  -1.8438747706865644]
 [0.0 1.0 1.0 ... 0.6725755800138707 -1.3975307863927666
  -1.8438747706865644]
 ...
 [1.0 0.0 1.0 ... -1.486821748683203 0.2237183656058788
  1.6956921424256621]
 [1.0 0.0 1.0 ... -1.486821748683203 -0.30881887728904506
  1.6956921424256621]
 [1.0 0.0 1.0 ... -1.486821748683203 -0.2976953839648691
  1.6956921424256621]]


In [67]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 1)

In [68]:
print(x_train)

[[1.0 0.0 1.0 ... 0.6725755800138707 -1.4100447163824645
  -0.2953142461999653]
 [1.0 0.0 1.0 ... -1.486821748683203 0.6992477052144008
  1.0320233462171196]
 [1.0 0.0 1.0 ... -1.486821748683203 0.722885128528275 1.1057643235736243]
 ...
 [1.0 0.0 1.0 ... 0.6725755800138707 -0.9108779534600686
  -0.2215732688434606]
 [1.0 0.0 0.0 ... -1.486821748683203 0.9884585316429755
  -1.7701337933300598]
 [1.0 0.0 1.0 ... 0.6725755800138707 -0.13501429409879578
  -1.4751698839040408]]


In [69]:
multi_linear_regressor = LinearRegression()

multi_linear_regressor.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [22]:
knn_classification_model = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn_classification_model.fit(x_train, y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [None]:
svm_classifier = SVC(kernel = 'linear', random_state = 0)

svm_classifier.fit(x_train, y_train)

In [175]:
random_forest_classifier = RandomForestClassifier(n_estimators= 10, criterion= 'entropy', random_state= 0)

random_forest_classifier.fit(x_train, y_train)

0,1,2
,n_estimators,10
,criterion,'entropy'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:
y_pred = multi_linear_regressor.predict(x_test)

print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), axis = 1))

[[ 6047.46046075  6048.        ]
 [13420.46222934 13421.        ]
 [ 2835.4596667   2836.        ]
 ...
 [50472.47112678 50473.        ]
 [ 4098.46003867  4099.        ]
 [10966.46166547 10967.        ]]


In [70]:
CROSS_VAL_FOLDS = 5
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 0.28731135409672665
R^2 Score: 0.9999999994392481
