In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
# Load the bookings table; the path is relative to the notebook's working directory.
data = pd.read_csv('transatBookedMore2.csv')

# 80/20 hold-out: draw a reproducible 80% sample of the rows for training and
# keep every row whose index label was not drawn as the test set.
train = data.sample(frac=0.8, random_state=1)
test = data.drop(index=train.index)

# Split each partition into predictors and the regression target.
x_train, y_train = train.drop(columns='Revenue_Economy'), train['Revenue_Economy']
x_test, y_test = test.drop(columns='Revenue_Economy'), test['Revenue_Economy']

# Shared ordinary-least-squares estimator; it is refitted by the code further down.
mlr = LinearRegression()

def forward_stepwise_selection(x_train, y_train, min_improvement=1e-4, max_predictors=None):
    """Greedily choose predictors for a linear regression by forward selection.

    Starting from an intercept-only model, each round fits one candidate model
    per remaining column (the columns selected so far plus that column) and
    keeps the column whose model has the lowest training RMSE. Selection stops
    as soon as the best candidate no longer lowers the RMSE by a meaningful
    amount. Without such a rule every column is eventually added, because
    training RMSE cannot get worse when a column is added to a least-squares
    fit, and the "selected" set would just be all columns in a different order.

    Args:
        x_train: DataFrame holding the candidate predictor columns.
        y_train: Target values aligned row-for-row with ``x_train``.
        min_improvement: Smallest relative drop in training RMSE
            (``(previous - new) / previous``) that justifies adding another
            column. Should be non-negative. Pass ``None`` to disable the
            stopping rule and return every column ranked by the order in
            which it was added.
        max_predictors: Optional upper bound on the number of columns
            returned. ``None`` means no bound.

    Returns:
        List of column names in the order they were selected. The list can be
        empty when no column improves on the intercept-only model.
    """
    # Local estimator: fitting candidates here must not refit a model object
    # owned by the caller as a hidden side effect.
    estimator = LinearRegression()

    remaining_predictors = list(x_train.columns)
    selected_predictors = []

    # RMSE of the intercept-only model (predict the mean of y for every row);
    # this is the baseline the first selected column has to beat.
    previous_rmse = float(np.std(np.asarray(y_train, dtype=float)))

    while remaining_predictors:
        if max_predictors is not None and len(selected_predictors) >= max_predictors:
            break

        best_rmse = float('inf')
        best_predictor = None

        for predictor in remaining_predictors:
            current_predictors = selected_predictors + [predictor]
            estimator.fit(x_train[current_predictors], y_train)
            y_pred = estimator.predict(x_train[current_predictors])
            # np.sqrt(MSE) rather than mean_squared_error(..., squared=False):
            # the `squared` argument was deprecated in scikit-learn 1.4 and
            # removed in 1.6.
            rmse = float(np.sqrt(mean_squared_error(y_train, y_pred)))

            if rmse < best_rmse:
                best_rmse = rmse
                best_predictor = predictor

        if best_predictor is None:
            # Every candidate RMSE was NaN or infinite, so nothing can be
            # ranked; stop instead of looping forever on an unchanged list.
            break

        if (min_improvement is not None
                and previous_rmse - best_rmse <= min_improvement * previous_rmse):
            # The best remaining column does not reduce the error enough.
            break

        selected_predictors.append(best_predictor)
        remaining_predictors.remove(best_predictor)
        previous_rmse = best_rmse

    return selected_predictors

# Choose predictors using the training split only.
forward_selected_predictors = forward_stepwise_selection(x_train, y_train)

# Feature sets to compare on the held-out test split.
models = {
    'All predictors': list(x_train.columns),
    'Forward stepwise selection RMSE': forward_selected_predictors,
}

# Pick the spot-check rows once, from a seeded generator and without
# replacement: every model is then shown on the same rows, no row is printed
# twice, and the output is identical on every run.
rng = np.random.default_rng(1)
sample_indices = rng.choice(len(y_test), size=min(10, len(y_test)), replace=False)

for name, predictors in models.items():
    mlr.fit(x_train[predictors], y_train)
    y_pred = mlr.predict(x_test[predictors])
    # np.sqrt(MSE) rather than mean_squared_error(..., squared=False): the
    # `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'{name}: {rmse:.4f}')

    print(f'Predictors: {predictors}')

    # NOTE(review): the saved output for 'All predictors' shows coefficients of
    # order 1e10-1e13 for Traffic, FlightNumber, Route and Distance. That
    # pattern suggests collinear or identifier-like columns; confirm against
    # the data before interpreting these slopes.
    print(f'Slope: {mlr.coef_}')

    # Side-by-side predictions and actual values for the sampled test rows.
    for i in sample_indices:
        print(f'Predicted: {y_pred[i]:.2f}, Actual: {y_test.iloc[i]:.2f}')

All predictors: 3744.0154
Predictors: ['Booked_Q', 'Booked_X', 'Booked_O', 'Booked_V', 'Booked_E', 'Booked_P', 'Booked_N', 'Booked_U', 'Booked_W', 'Booked_M', 'Booked_L', 'Booked_I', 'Booked_H', 'Booked_T', 'Booked_Z', 'Booked_B', 'Booked_R', 'Booked_A', 'Booked_K', 'Booked_S', 'Booked_Y', 'Traffic', 'FlightNumber', 'Route', 'NDOWeek', 'Distance', 'Hour', 'DayOfWeek']
Slope: [ 2.02690719e+02  2.37292753e+02  1.85186958e+02  3.75148968e+02
  2.65938963e+02  3.77436939e+02  3.96184343e+02  5.04144856e+02
  3.20613634e+02  3.68994941e+02  4.47594143e+02  5.32924099e+02
  6.06808894e+02  6.32825655e+02  6.45307821e+02  7.91672452e+02
  7.19772900e+02  8.45699847e+02  9.24791504e+02  1.07029764e+03
  1.44811033e+03  3.92434415e+13  1.04036611e+13  9.21805968e+12
 -3.04583757e+01 -3.97629241e+10  7.69629182e+02  5.42608723e-01]
Predicted: 462.84, Actual: 0.00
Predicted: 2854.94, Actual: 2391.65
Predicted: 2437.56, Actual: 1953.75
Predicted: 19423.59, Actual: 18400.26
Predicted: 1003.62, Actu

In [15]:

# Refit the full linear model on an 80/20 split and report its coefficients.
data = pd.read_csv('transatBookedMore2.csv')
# Fixed random_state so the split, and every number printed below, is the same
# on each run.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=1)
x_train = train_data.drop('Revenue_Economy', axis=1)
y_train = train_data['Revenue_Economy']
mlr = LinearRegression()
mlr.fit(x_train, y_train)
print('Intercept:', mlr.intercept_)

# One line per predictor, pairing each column name with its fitted coefficient.
# NOTE(review): the saved output shows an intercept of about -1.5e13 offset by
# coefficients of order 1e9-1e13 on Traffic, FlightNumber, Route and Distance.
# That pattern suggests collinear or identifier-like columns; confirm against
# the data before interpreting these values.
for col, coef in zip(x_train.columns, mlr.coef_):
    print(f'{col}: {coef}')

x_test = test_data.drop('Revenue_Economy', axis=1)
y_test = test_data['Revenue_Economy']

y_pred = mlr.predict(x_test)

# np.sqrt(MSE) rather than mean_squared_error(..., squared=False): the
# `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE:', rmse)

# Spot-check up to ten test rows. The predictions already computed above are
# reused, and the seeded draw keeps the displayed rows stable between runs.
sample = (
    test_data[['Revenue_Economy']]
    .assign(Predicted=y_pred)
    .sample(n=min(10, len(test_data)), random_state=1)
)
print(sample)

Intercept: -14839553365115.406
Booked_Q: 202.21849047280529
Booked_X: 237.6969524741147
Booked_O: 182.2985035060726
Booked_V: 377.5208950068854
Booked_E: 262.18080276655456
Booked_P: 375.515270128172
Booked_N: 397.8434187241796
Booked_U: 518.7506686011156
Booked_W: 315.83007020491135
Booked_M: 374.791269106823
Booked_L: 450.2354061376111
Booked_I: 523.1152645615355
Booked_H: 612.2091864935486
Booked_T: 630.248897651452
Booked_Z: 645.7352312881969
Booked_B: 777.8945762650446
Booked_R: 694.9084444371712
Booked_A: 862.365843156715
Booked_K: 913.9231729912802
Booked_S: 1070.056604956435
Booked_Y: 1466.3936693657472
Traffic: -3889612964902.9097
FlightNumber: -962284336325.1794
Route: -982522150001.1792
NDOWeek: -30.61432781555635
Distance: 3941101483.963692
Hour: 732.4727313502344
DayOfWeek: 7.5912467467357425
RMSE: 3785.7102687556076
        Revenue_Economy     Predicted
129742           188.72    840.066406
116426         90437.15  91859.808594
5819            3733.05   4456.539062
28770 