# Load and Preprocess the Data

In [1]:
#Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Read the cleaned airline dataset from its CSV file
file_path = '../data/cleaned/AirlineData.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
# Confirm the load and preview the first five rows
print("Data loaded successfully.", df.head(), sep="\n")

Data loaded successfully.
             Airline                Source     Destination  Number of Stops  \
0     Etihad Airways  Toronto Pearson Intl  Bengaluru Intl                1   
1              Delta  Toronto Pearson Intl  Bengaluru Intl                1   
2  Multiple Airlines  Toronto Pearson Intl  Bengaluru Intl                2   
3  Multiple Airlines  Toronto Pearson Intl  Bengaluru Intl                2   
4  Multiple Airlines  Toronto Pearson Intl  Bengaluru Intl                2   

           Class        Date  Total_Stopover_Time  price in CAD  days_left  \
0  Economy Class  2024-06-02                  130        2340.0          1   
1  Economy Class  2024-06-02                 1335        1347.0          1   
2  Economy Class  2024-06-02                  420        1934.0          1   
3  Economy Class  2024-06-02                  380        2291.0          1   
4  Economy Class  2024-06-02                  175        2661.0          1   

  Departure_24hr Arrival_24hr 

In [3]:
# Handle missing values
# Forward-fill each gap with the value from the previous row.
# DataFrame.ffill() replaces fillna(method='ffill'): the `method` argument is
# deprecated since pandas 2.1. Reassigning (instead of inplace=True) gives the
# same result without mutating the frame in place.
# Note: values missing in the very first row(s) have nothing to fill from and
# stay NaN; for feature columns the imputers in the pipelines below cover them.
df = df.ffill()
# Define features and target: predict the ticket price from every other column
X = df.drop('price in CAD', axis=1)
y = df['price in CAD']
# Identify categorical and numerical columns by dtype.
# Columns that match neither selection are not passed to any transformer and
# are therefore dropped by the ColumnTransformer (its default remainder).
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
# Preprocessing for numerical data: median imputation, then standardisation
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Preprocessing for categorical data: fill gaps with the literal 'missing',
# then one-hot encode; categories unseen during fit are ignored at transform
# time rather than raising an error
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
# Split the data: hold out 10% for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Train the ExtraTreesRegressor Model

In [4]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [5]:
# ExtraTrees regressor: 10 trees, fixed seed so repeated runs match
model = ExtraTreesRegressor(random_state=42, n_estimators=10)
# Chain the preprocessing and the regressor, then fit on the training split.
# Pipeline.fit returns the pipeline itself, so clf is the fitted pipeline.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
]).fit(X_train, y_train)
print("Model training complete.")

Model training complete.


In [6]:
# Predict on the held-out test data
y_pred = clf.predict(X_test)
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE computed above. The former
# mean_squared_error(..., squared=False) form was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it fails on current releases;
# for this single-output target the two are numerically the same.
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)
print("Model evaluation complete.")
print("Mean Absolute Error (MAE): ", mae)
print("Mean Squared Error (MSE): ", mse)
print("Root Mean Squared Error (RMSE): ", rmse)
print("R-squared (R2): ", r2)

Model evaluation complete.
Mean Absolute Error (MAE):  228.3940247364208
Mean Squared Error (MSE):  409006.4073156972
Root Mean Squared Error (RMSE):  639.5360875788771
R-squared (R2):  0.9058946588304023


