In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# NOTE(review): bare `cd` relies on IPython automagic; with no argument it switches to the
# user's home directory (see the path echoed below). The CSV loaded later is read via a
# relative path, so the notebook only works when the data file sits in that directory.
cd

C:\Users\37789


In [3]:
# Load the raw flight-fare table. The path is relative, so it is resolved against the
# kernel's current working directory.
df = pd.read_csv('AirlineData.csv')

In [4]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

Airline                0
Source                 0
Destination            0
Number of Stops        0
Class                  0
Date                   0
Total_Stopover_Time    0
price in CAD           0
days_left              0
Departure_24hr         0
Arrival_24hr           0
Arrival_Day_Offset     0
dtype: int64

In [5]:
# List the string-typed (object) columns that still need to be encoded numerically.
df.select_dtypes(include='object').columns

Index(['Airline', 'Source', 'Destination', 'Class', 'Date', 'Departure_24hr',
       'Arrival_24hr'],
      dtype='object')

In [6]:
# Ordinal encoding of the cabin class: larger numbers are assigned to the
# higher cabin tiers, with 'Other' as the lowest value.
class_mapping = {
    'First Class': 4,
    'Business Class': 3,
    'Premium Economy': 2,
    'Economy Class': 1,
    'Other': 0
}
# Series.map silently turns every label that is missing from the mapping into
# NaN. That also happens when this cell is executed a second time, because the
# column then already holds integers. Fail loudly instead of feeding NaN to the
# model. Genuinely missing values are left alone (they stay NaN, as before).
unmapped_classes = set(df['Class'].dropna().unique()) - set(class_mapping)
if unmapped_classes:
    raise ValueError(
        f'Class values without an ordinal code: {sorted(unmapped_classes, key=str)}'
    )
df['Class'] = df['Class'].map(class_mapping)
print(df.head())

             Airline                Source     Destination  Number of Stops  \
0     Etihad Airways  Toronto Pearson Intl  Bengaluru Intl                1   
1              Delta  Toronto Pearson Intl  Bengaluru Intl                1   
2  Multiple Airlines  Toronto Pearson Intl  Bengaluru Intl                2   
3  Multiple Airlines  Toronto Pearson Intl  Bengaluru Intl                2   
4  Multiple Airlines  Toronto Pearson Intl  Bengaluru Intl                2   

   Class        Date  Total_Stopover_Time  price in CAD  days_left  \
0      1  2024-06-02                  130        2340.0          1   
1      1  2024-06-02                 1335        1347.0          1   
2      1  2024-06-02                  420        1934.0          1   
3      1  2024-06-02                  380        2291.0          1   
4      1  2024-06-02                  175        2661.0          1   

  Departure_24hr Arrival_24hr  Arrival_Day_Offset  
0          22:10        03:05                   2  


In [7]:
# Split the travel date into numeric Year / Month / Day columns and discard
# the original string column.
flight_dates = pd.to_datetime(df['Date'])
for date_part in ('year', 'month', 'day'):
    df[date_part.capitalize()] = getattr(flight_dates.dt, date_part)

df.drop(columns=['Date'], inplace=True)

In [8]:
# One-hot encode the two airport columns. drop='first' omits one category per
# column so the remaining indicators are not perfectly collinear.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_array = encoder.fit_transform(df[['Source', 'Destination']])
feature_names = encoder.get_feature_names_out(['Source', 'Destination'])
# join() aligns on the index. Build the indicator frame on df's own index so
# the rows stay paired even if df no longer has a default 0..n-1 index
# (e.g. after filtering rows); a fresh RangeIndex would misalign or yield NaN.
encoded_df = pd.DataFrame(encoded_array, columns=feature_names, index=df.index)
df = df.join(encoded_df)
df.drop(['Source', 'Destination'], axis=1, inplace=True)

In [9]:
def time_to_minutes(time_str):
    """Convert an 'HH:MM' clock string into minutes since midnight.

    Args:
        time_str: Text with exactly one ':' separating integer hours and
            integer minutes, e.g. '22:10'.

    Returns:
        int: hours * 60 + minutes.

    Raises:
        ValueError: If the string does not split into exactly two integer
            fields.
    """
    hour_text, minute_text = time_str.split(':')
    return 60 * int(hour_text) + int(minute_text)
# Replace both 'HH:MM' clock columns with minutes-since-midnight integers.
for clock_col, minutes_col in (
    ('Departure_24hr', 'Departure_minutes'),
    ('Arrival_24hr', 'Arrival_minutes'),
):
    df[minutes_col] = df[clock_col].apply(time_to_minutes)
df.drop(columns=['Departure_24hr', 'Arrival_24hr'], inplace=True)

In [10]:
# Hand-rolled one-hot encoding of the carrier: one 0/1 indicator column per
# distinct airline (no category is dropped), then remove the text column.
unique_airlines = df['Airline'].unique()
for airline in unique_airlines:
    flown_by_airline = df['Airline'].eq(airline)
    df[f'Airline_{airline}'] = flown_by_airline.astype(int)
df.drop(columns=['Airline'], inplace=True)

In [11]:
# Sanity check after encoding: every remaining column should be numeric and non-null.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384663 entries, 0 to 384662
Data columns (total 80 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   Number of Stops                               384663 non-null  int64  
 1   Class                                         384663 non-null  int64  
 2   Total_Stopover_Time                           384663 non-null  int64  
 3   price in CAD                                  384663 non-null  float64
 4   days_left                                     384663 non-null  int64  
 5   Arrival_Day_Offset                            384663 non-null  int64  
 6   Year                                          384663 non-null  int32  
 7   Month                                         384663 non-null  int32  
 8   Day                                           384663 non-null  int32  
 9   Source_Bengaluru Intl                         38

In [12]:
# Separate the regression target (ticket price) from the predictor columns.
target = df['price in CAD']
features = df.drop('price in CAD', axis=1)

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks in.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Baseline: a small random forest (10 trees, all CPU cores, fixed seed),
# scored on the untouched test split.
model = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
baseline_mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {baseline_mae}')

Mean Absolute Error: 289.887989496201


In [16]:
# Robustness check: perturb every scaled test feature with standard-normal
# noise and re-score the already-fitted model.
# The noise comes from a locally seeded Generator so the reported MAE is
# identical on every run; drawing from the unseeded global NumPy state gives a
# different number each time the cell is executed.
noise_rng = np.random.default_rng(42)
noise = noise_rng.normal(0, 1, X_test_scaled.shape)
X_test_noisy_scaled = X_test_scaled + noise
y_pred_noisy = model.predict(X_test_noisy_scaled)
print(f'MAE with noise: {mean_absolute_error(y_test, y_pred_noisy)}')

MAE with noise: 1649.3808665387817


In [17]:
# Simulate a feature that is completely missing at prediction time by blanking
# out the first column of a copy of the test features.
X_test_missing = X_test.copy()
# Replace the column by label rather than writing NaN through .iloc: the
# label assignment swaps in a new float column, whereas the positional write
# has to upcast the existing column in place.
# NOTE(review): recent pandas releases deprecate that silent upcast for
# integer columns - confirm against the installed version.
first_feature = X_test_missing.columns[0]
X_test_missing[first_feature] = np.nan

In [18]:
# Impute the blanked values with the per-column means of the training split,
# scale with the already-fitted scaler, and score the fitted model.
train_feature_means = X_train.mean()
X_test_missing_filled = X_test_missing.fillna(train_feature_means)
X_test_missing_scaled = scaler.transform(X_test_missing_filled)
y_pred_missing = model.predict(X_test_missing_scaled)
missing_mae = mean_absolute_error(y_test, y_pred_missing)
print(f'MAE with missing data: {missing_mae}')

MAE with missing data: 413.9980636654436


### With the first feature column blanked out and imputed with its training-set mean, the MAE rises from about 289.89 to 413.998 — roughly 43% higher than the baseline. Mean imputation keeps the model usable when this feature is unavailable, but the loss of information has a clear cost in accuracy.

In [36]:
# Boundary test: force the first feature of every test row to the largest
# value that feature takes in the test split.
X_test_boundary = X_test.copy()
first_feature_max = X_test.iloc[:, 0].max()
X_test_boundary.iloc[:, 0] = first_feature_max

In [37]:
# Scale the boundary-valued rows with the fitted scaler and score the fitted model.
X_test_boundary_scaled = scaler.transform(X_test_boundary)
y_pred_boundary = model.predict(X_test_boundary_scaled)
boundary_mae = mean_absolute_error(y_test, y_pred_boundary)
print(f'MAE with boundary values: {boundary_mae}')

MAE with boundary values: 840.956110185973


### With the first feature forced to its maximum value in every test row, the MAE is 840.956110185973 — almost three times the baseline of about 289.89. The model still produces predictions for these extreme inputs, but its accuracy degrades substantially, so it generalizes to boundary cases only to a limited extent.

In [38]:
# Throughput test: predict on the test split repeated 100 times. Because the
# rows are exact copies, the MAE is expected to match the baseline; the
# informative number here is the prediction time.
X_large = pd.concat([X_test]*100, ignore_index=True)
y_large = pd.concat([y_test]*100, ignore_index=True)
X_large_scaled = scaler.transform(X_large)
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# time is adjusted during the measurement.
start_time = time.perf_counter()
y_pred_large = model.predict(X_large_scaled)
end_time = time.perf_counter()
print(f'MAE with large dataset: {mean_absolute_error(y_large, y_pred_large)}')
print(f'Time taken for large dataset: {end_time - start_time} seconds')

MAE with large dataset: 289.8879894962012
Time taken for large dataset: 10.46093225479126 seconds


### The large dataset is the test set repeated 100 times (roughly 7.7 million rows), so the MAE of 289.8879894962012 matches the baseline by construction and says nothing new about accuracy. The informative result is the prediction time: 10.46093225479126 seconds for the whole batch, which suggests that inference scales to larger volumes of data within a reasonable time frame.

In [39]:
def make_new_data(frac=0.1, random_state=42):
    """Draw a reproducible random subset of the training split.

    Reads the notebook-level ``X_train`` and ``y_train``.

    Args:
        frac: Fraction of the training rows to sample. Defaults to 0.1, the
            value that was previously hard-coded.
        random_state: Seed passed to ``DataFrame.sample``. Defaults to 42,
            the value that was previously hard-coded.

    Returns:
        tuple: ``(X_new, y_new)`` - the sampled feature rows and the target
        values selected by the same index labels.
    """
    X_new = X_train.sample(frac=frac, random_state=random_state)
    # Look the targets up by index label so each one stays paired with its row.
    y_new = y_train.loc[X_new.index]
    return X_new, y_new

In [40]:
# Draw the subset that stands in for "new" data in the retraining experiment.
X_new, y_new = make_new_data()

In [41]:
# NOTE(review): fit_transform RE-FITS the shared `scaler` on X_new, replacing the
# statistics learned from X_train. Any earlier cell that is re-run after this point
# (missing-data, boundary and large-dataset tests) will scale with the new statistics.
# Consider a separate scaler object for the retraining experiment.
X_new_scaled = scaler.fit_transform(X_new)

In [42]:
# NOTE(review): this re-fits the shared `model` object on X_new only, so the baseline
# forest is overwritten. Predictions stored earlier (y_pred, y_pred_noisy, ...) still
# come from the old fit, but re-running those cells would now use this retrained model.
model.fit(X_new_scaled, y_new)

In [44]:
# Re-scale the test split with the scaler as it is currently fitted, then score
# the retrained model on it.
X_test_scaled = scaler.transform(X_test)
y_pred_new = model.predict(X_test_scaled)
retrained_mae = mean_absolute_error(y_test, y_pred_new)
print(f'MAE after retraining: {retrained_mae}')

MAE after retraining: 535.899605311391


### The MAE after retraining is 535.899605311391, about 85% higher than the baseline of roughly 289.89. The "new" data here is a 10% random sample of the original training set and the forest is fitted from scratch on it, so the increase mainly reflects training on far fewer rows rather than the model adjusting to different patterns or distributions in the data.

In [45]:
def document_results(**results):
    """Print each named result on its own line as 'name: value'.

    Args:
        **results: Arbitrary keyword arguments; each keyword is the label and
            its value is the figure to report. Printed in the order given.
    """
    report_lines = [f'{label}: {figure}' for label, figure in results.items()]
    for report_line in report_lines:
        print(report_line)

# Collect the headline figure of every experiment above and print them together.
results_summary = {
    'original_mae': mean_absolute_error(y_test, y_pred),
    'noise_mae': mean_absolute_error(y_test, y_pred_noisy),
    'missing_data_mae': mean_absolute_error(y_test, y_pred_missing),
    'boundary_values_mae': mean_absolute_error(y_test, y_pred_boundary),
    'large_dataset_mae': mean_absolute_error(y_large, y_pred_large),
    'large_dataset_time': end_time - start_time,
    'retraining_mae': mean_absolute_error(y_test, y_pred_new),
}
document_results(**results_summary)


original_mae: 289.887989496201
noise_mae: 1648.881344721643
missing_data_mae: 413.99806366544357
boundary_values_mae: 840.956110185973
large_dataset_mae: 289.8879894962012
large_dataset_time: 10.46093225479126
retraining_mae: 535.899605311391
