In [1]:
import pandas as pd
import numpy as np
import re
import joblib
import json

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [2]:
flight = pd.read_csv(r"C:\Users\Lenovo\Downloads\scraped_data.csv")

print("Dataset shape:", flight.shape)
flight.head()

Dataset shape: (6492, 11)


Unnamed: 0,Airline Name,Aircraft Type,Departure Time,Departure Airport,Arrival Time,Arrival Airport,Class,Baggage Allowance,Price (NPR),Refundable Status,Date
0,YETI AIRLINES,ATR72,18:50,KATHMANDU,19:15,POKHARA,E1,15KG + 5KG,3895,NonRefundable,29-Aug
1,YETI AIRLINES,ATR72,19:20,KATHMANDU,19:45,POKHARA,E1,15KG + 5KG,3895,NonRefundable,29-Aug
2,YETI AIRLINES,ATR72,18:50,KATHMANDU,19:15,POKHARA,E1,15KG + 5KG,3900,NonRefundable,29-Aug
3,YETI AIRLINES,ATR72,19:20,KATHMANDU,19:45,POKHARA,E1,15KG + 5KG,3900,NonRefundable,29-Aug
4,SHREE AIRLINES,Q400,13:10,KATHMANDU,13:30,POKHARA,T,20 KG,4200,NonRefundable,29-Aug


In [3]:
flight.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6492 entries, 0 to 6491
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Airline Name       6492 non-null   object
 1   Aircraft Type      6492 non-null   object
 2   Departure Time     6492 non-null   object
 3   Departure Airport  6492 non-null   object
 4   Arrival Time       6492 non-null   object
 5   Arrival Airport    6492 non-null   object
 6   Class              6492 non-null   object
 7   Baggage Allowance  6492 non-null   object
 8   Price (NPR)        6492 non-null   int64 
 9   Refundable Status  6492 non-null   object
 10  Date               6492 non-null   object
dtypes: int64(1), object(10)
memory usage: 558.0+ KB


In [4]:
flight

Unnamed: 0,Airline Name,Aircraft Type,Departure Time,Departure Airport,Arrival Time,Arrival Airport,Class,Baggage Allowance,Price (NPR),Refundable Status,Date
0,YETI AIRLINES,ATR72,18:50,KATHMANDU,19:15,POKHARA,E1,15KG + 5KG,3895,NonRefundable,29-Aug
1,YETI AIRLINES,ATR72,19:20,KATHMANDU,19:45,POKHARA,E1,15KG + 5KG,3895,NonRefundable,29-Aug
2,YETI AIRLINES,ATR72,18:50,KATHMANDU,19:15,POKHARA,E1,15KG + 5KG,3900,NonRefundable,29-Aug
3,YETI AIRLINES,ATR72,19:20,KATHMANDU,19:45,POKHARA,E1,15KG + 5KG,3900,NonRefundable,29-Aug
4,SHREE AIRLINES,Q400,13:10,KATHMANDU,13:30,POKHARA,T,20 KG,4200,NonRefundable,29-Aug
...,...,...,...,...,...,...,...,...,...,...,...
6487,BUDDHA AIR,ATR,08:15,BHADRAPUR,09:00,KATHMANDU,Y,25 KG,12010,Refundable,12-Sep
6488,BUDDHA AIR,ATR,11:00,BHADRAPUR,11:45,KATHMANDU,Y,25 KG,12010,Refundable,12-Sep
6489,BUDDHA AIR,ATR,12:40,BHADRAPUR,13:25,KATHMANDU,Y,25 KG,12010,Refundable,12-Sep
6490,BUDDHA AIR,ATR,14:05,BHADRAPUR,14:50,KATHMANDU,Y,25 KG,12010,Refundable,12-Sep


In [5]:
def time_to_minutes(t):
    h, m = map(int, t.split(':'))
    return h * 60 + m

flight['Departure_Minutes'] = flight['Departure Time'].apply(time_to_minutes)
flight['Arrival_Minutes'] = flight['Arrival Time'].apply(time_to_minutes)

In [6]:
def baggage_to_kg(b):
    b = str(b).upper().strip()

    # Case 1: "15KG + 5KG"
    if "+" in b:
        nums = re.findall(r"(\d+\.?\d*)\s*KG", b)
        return sum(float(x) for x in nums)

    # Case 2: "30 KG(Luggage) 7 KG(Hand Carry)"
    if "LUGGAGE" in b or "HAND" in b:
        nums = re.findall(r"(\d+\.?\d*)\s*KG", b)
        return sum(float(x) for x in nums)

    # Case 3: "25 KG" or "25.00 KG"
    if "KG" in b:
        num = re.findall(r"(\d+\.?\d*)", b)
        return float(num[0]) if num else 0

    # Case 4: "1 Piece" / "2 Pieces"
    if "PIECE" in b:
        pcs = re.findall(r"(\d+)", b)
        return int(pcs[0]) * 23 if pcs else 0

    return 0

# Apply on your column
flight["Baggage_KG"] = flight["Baggage Allowance"].apply(baggage_to_kg)

In [7]:
flight['Refundable Status'] = flight['Refundable Status'].map({'Refundable': 1, 'NonRefundable': 0})

In [8]:
flight['Date'] = pd.to_datetime(flight['Date'], format='%d-%b')
flight['Day'] = flight['Date'].dt.day
flight['Month'] = flight['Date'].dt.month
flight.drop('Date', axis=1, inplace=True)


In [9]:
flight.drop(['Departure Time', 'Arrival Time', 'Baggage Allowance'], axis=1, inplace=True)

In [10]:
# Handle missing values first
flight['Refundable Status'].fillna(flight['Refundable Status'].mode()[0], inplace=True)

# Split data FIRST before encoding
X = flight.drop('Price (NPR)', axis=1)
y = flight['Price (NPR)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode categorical variables separately for train and test
categorical_cols = ['Airline Name', 'Aircraft Type', 'Departure Airport', 
                    'Arrival Airport', 'Class']

encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

# Get feature names
feature_names = encoder.get_feature_names_out(categorical_cols)

# Create DataFrames
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=feature_names, index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=feature_names, index=X_test.index)

# Drop original categorical columns and concatenate with encoded ones
X_train = X_train.drop(categorical_cols, axis=1)
X_test = X_test.drop(categorical_cols, axis=1)

X_train_final = pd.concat([X_train, X_train_encoded_df], axis=1)
X_test_final = pd.concat([X_test, X_test_encoded_df], axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  flight['Refundable Status'].fillna(flight['Refundable Status'].mode()[0], inplace=True)


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [None]:
flight.info()