### 1. Data Preprocessing 

#### 1.1 Import necessary packages

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy.special import inv_boxcox
from statsmodels.stats.diagnostic import normal_ad
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.stattools import durbin_watson
from scipy import stats

#### 1.2 Import dataset

In [2]:
# Read the flight-price CSV into the notebook, using its first column as the row index.
df = pd.read_csv(filepath_or_buffer='./data/Flight_Prices.csv', index_col=0)

#### 1.4.2 Label Encoding

In [3]:
# Label Encoding to convert categorical variables into numerical variables
# A MultiColumnLabelEncoder is defined to perform Label Encoding over multiple columns instead of one.
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()  # Shared LabelEncoder instance, abbreviated as 'le'; used by MultiColumnLabelEncoder below

class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in a single call.

    Every selected column is replaced by the integer codes produced by the
    module-level ``le`` encoder's ``fit_transform``. ``le`` is refitted for
    each column in turn, so after a call it only holds the mapping of the
    last column that was processed.
    """

    def __init__(self, columns=None):
        """Store the selection of columns to encode.

        Parameters
        ----------
        columns : iterable of column labels, optional
            Columns to encode. ``None`` (the default) means every column.
        """
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        The encoder is fitted column by column inside ``transform``; this
        method only exists so that ``fit(...).transform(...)`` can be chained.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame. It is copied first, so the caller's frame is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which each column listed in ``self.columns``
            (or every column when ``self.columns`` is ``None``) holds the
            codes returned by ``le.fit_transform``.
        """
        output = X.copy()
        # Resolve the column selection up front. Iterating over the labels
        # replaces the former DataFrame.iteritems() call, which was removed
        # in pandas 2.0 and made the "encode every column" path fail.
        if self.columns is not None:
            targets = self.columns
        else:
            targets = list(output.columns)
        for col in targets:
            output[col] = le.fit_transform(output[col])
        return output

    def fit_transform(self, X, y=None):
        """Convenience wrapper equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y).transform(X)


In [4]:
# Integer-encode the categorical columns; columns not listed here are passed through unchanged.
df1 = MultiColumnLabelEncoder(
    columns=[
        'airline',
        'flight',
        'source_city',
        'departure_time',
        'stops',
        'arrival_time',
        'destination_city',
        'class',
    ]
).fit_transform(df)

df1.head()


Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,4,1408,2,2,2,5,5,1,2.17,1,5953
1,4,1387,2,1,2,4,5,1,2.33,1,5953
2,0,1213,2,1,2,1,5,1,2.17,1,5956
3,5,1559,2,4,2,0,5,1,2.25,1,5955
4,5,1549,2,4,2,4,5,1,2.33,1,5955


In [5]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of the encoded frame (features and the 'price' target alike)
# with a min-max scaler.
# NOTE(review): the scaler is fitted on the complete dataset before the train/test
# split, so statistics of the future test rows influence the training data.
# Consider fitting it on the training split only.
min_max = MinMaxScaler()

# Take the headers from df1 itself instead of a hand-maintained list, so the labels
# cannot drift out of sync with the columns that were actually scaled.
column_headers = list(df1.columns)

# fit_transform returns a plain array, hence the frame is rebuilt with the headers.
df_scaled = pd.DataFrame(min_max.fit_transform(df1), columns=column_headers)
df_scaled.head(5)


Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0.8,0.902564,0.4,0.4,1.0,1.0,1.0,1.0,0.027347,0.0,0.039749
1,0.8,0.889103,0.4,0.2,1.0,0.8,1.0,1.0,0.030612,0.0,0.039749
2,0.0,0.777564,0.4,0.2,1.0,0.2,1.0,1.0,0.027347,0.0,0.039773
3,1.0,0.999359,0.4,0.8,1.0,0.0,1.0,1.0,0.02898,0.0,0.039765
4,1.0,0.992949,0.4,0.8,1.0,0.8,1.0,1.0,0.030612,0.0,0.039765


## 2. Regression

In [6]:
# Separate predictors from the label: the last column ('price') is the target,
# every column before it is an input variable.
X, y = df_scaled.iloc[:, :-1], df_scaled.iloc[:, -1]

In [7]:
# Display the feature matrix (all scaled columns except the 'price' target).
X

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
0,0.8,0.902564,0.4,0.4,1.0,1.0,1.0,1.0,0.027347,0.0
1,0.8,0.889103,0.4,0.2,1.0,0.8,1.0,1.0,0.030612,0.0
2,0.0,0.777564,0.4,0.2,1.0,0.2,1.0,1.0,0.027347,0.0
3,1.0,0.999359,0.4,0.8,1.0,0.0,1.0,1.0,0.028980,0.0
4,1.0,0.992949,0.4,0.8,1.0,0.8,1.0,1.0,0.030612,0.0
...,...,...,...,...,...,...,...,...,...,...
300148,1.0,0.946795,0.2,0.8,0.0,0.4,0.6,0.0,0.188776,1.0
300149,1.0,0.949359,0.2,0.0,0.0,1.0,0.6,0.0,0.195714,1.0
300150,1.0,0.952564,0.2,0.2,0.0,1.0,0.6,0.0,0.265306,1.0
300151,1.0,0.950641,0.2,0.2,0.0,0.4,0.6,0.0,0.187143,1.0


In [8]:
# Display the target vector (the scaled 'price' column).
y

0         0.039749
1         0.039749
2         0.039773
3         0.039765
4         0.039765
            ...   
300148    0.558844
300149    0.623124
300150    0.639473
300151    0.659856
300152    0.659856
Name: price, Length: 300153, dtype: float64

### Multiple Linear Regression

In [9]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [10]:
# Fit a multiple linear regression (sklearn's LinearRegression) on the training split.

from sklearn.linear_model import LinearRegression
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

LinearRegression()

In [11]:
# Table of the fitted intercept and per-feature coefficients, largest value first.
pd.DataFrame(
    {'Coefficient Value': np.concatenate(([linear_reg.intercept_], linear_reg.coef_))},
    index=['Intercept', *(col + " Coef." for col in X.columns)],
).sort_values(by='Coefficient Value', ascending=False)

Unnamed: 0,Coefficient Value
Intercept,0.404063
airline Coef.,0.037717
duration Coef.,0.036414
arrival_time Coef.,0.00758
destination_city Coef.,0.005023
source_city Coef.,0.004345
flight Coef.,0.003167
departure_time Coef.,0.001452
days_left Coef.,-0.051849
stops Coef.,-0.053876


####  Polynomial Regression

In [12]:
# Start with a degree-2 expansion of the features (no constant bias column).
from sklearn.preprocessing import PolynomialFeatures

poly_features = PolynomialFeatures(degree=2, include_bias=False)
# The transformed array is wrapped in a DataFrame right away; the generated
# feature names become its column labels.
X_poly = pd.DataFrame(
    poly_features.fit_transform(X),
    columns=poly_features.get_feature_names_out(X.columns),
)
X_poly.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,...,destination_city^2,destination_city class,destination_city duration,destination_city days_left,class^2,class duration,class days_left,duration^2,duration days_left,days_left^2
0,0.8,0.902564,0.4,0.4,1.0,1.0,1.0,1.0,0.027347,0.0,...,1.0,1.0,0.027347,0.0,1.0,0.027347,0.0,0.000748,0.0,0.0
1,0.8,0.889103,0.4,0.2,1.0,0.8,1.0,1.0,0.030612,0.0,...,1.0,1.0,0.030612,0.0,1.0,0.030612,0.0,0.000937,0.0,0.0
2,0.0,0.777564,0.4,0.2,1.0,0.2,1.0,1.0,0.027347,0.0,...,1.0,1.0,0.027347,0.0,1.0,0.027347,0.0,0.000748,0.0,0.0
3,1.0,0.999359,0.4,0.8,1.0,0.0,1.0,1.0,0.02898,0.0,...,1.0,1.0,0.02898,0.0,1.0,0.02898,0.0,0.00084,0.0,0.0
4,1.0,0.992949,0.4,0.8,1.0,0.8,1.0,1.0,0.030612,0.0,...,1.0,1.0,0.030612,0.0,1.0,0.030612,0.0,0.000937,0.0,0.0


In [13]:
# The degree-2 expansion grows the feature set to 65 columns
# (10 original features + 10 squares + 45 pairwise products).
poly_features_names = poly_features.get_feature_names_out(X.columns)
len(poly_features_names)

65

In [14]:
# Train/test split (70/30, fixed seed) of the polynomial feature matrix.
X_poly_train, X_poly_test, y_poly_train, y_poly_test = train_test_split(
    X_poly,
    y,
    random_state=0,
    test_size=0.3,
)

In [15]:
# Report the dimensions of each part of the polynomial train/test split.
for caption, split in zip(
    ('X_poly_train shape: ', 'X_poly_test shape: ', 'y_poly_train shape: ', 'y_poly_test shape: '),
    (X_poly_train, X_poly_test, y_poly_train, y_poly_test),
):
    print(caption, split.shape)

X_poly_train shape:  (210107, 65)
X_poly_test shape:  (90046, 65)
y_poly_train shape:  (210107,)
y_poly_test shape:  (90046,)


### Create Polynomial Regression 

In [16]:
# Train the polynomial regression model.
# A plain LinearRegression is used because the model is still linear in its
# coefficients once the inputs have been expanded into polynomial features.

polynomial_reg = LinearRegression()
polynomial_reg.fit(X_poly_train, y_poly_train)

LinearRegression()

In [17]:
from sklearn import metrics

def model_evaluation(model, X_test, y_test, model_name):
    """Score a fitted model on the given data and return the metrics as a table.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        The fitted estimator to evaluate.
    X_test : array-like
        Inputs passed to ``model.predict``.
    y_test : array-like
        True target values compared against the predictions.
    model_name : str
        Label used as the single column name of the result.

    Returns
    -------
    pandas.DataFrame
        One column named ``model_name`` with the rows 'MAE', 'MSE', 'RMSE'
        and 'R2-Score'.
    """
    predictions = model.predict(X_test)

    scores = {
        'MAE': metrics.mean_absolute_error(y_test, predictions),
        'MSE': metrics.mean_squared_error(y_test, predictions),
    }
    # RMSE is derived from the MSE computed above rather than recomputed.
    scores['RMSE'] = np.sqrt(scores['MSE'])
    scores['R2-Score'] = metrics.r2_score(y_test, predictions)

    return pd.DataFrame(list(scores.values()), index=list(scores), columns=[model_name])

In [18]:
# Keep the test-set predictions in a variable of their own: model_evaluation()
# does not return them, and they are needed to print the results further below.
y_pred = polynomial_reg.predict(X_poly_test)

### Model Evaluation 

In [19]:
# Performance of the polynomial model on the held-out test data
model_evaluation(polynomial_reg, X_poly_test, y_poly_test, 'Polynomial Reg. Test')

Unnamed: 0,Polynomial Reg. Test
MAE,0.028851
MSE,0.002153
RMSE,0.046399
R2-Score,0.937869


In [20]:
# Performance of the polynomial model on the training data.
# The values are close to the test-set metrics, so there is no sign of overfitting.
model_evaluation(polynomial_reg, X_poly_train, y_poly_train, 'Polynomial Reg. Train')

Unnamed: 0,Polynomial Reg. Train
MAE,0.028989
MSE,0.002175
RMSE,0.046638
R2-Score,0.937181


In [21]:
# Cross validation

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_validate

# Degree-2 polynomial expansion followed by a linear fit, evaluated with cross_validate.
pipeline = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())

# Six shuffled folds with a fixed seed.
kf = KFold(n_splits=6, shuffle=True, random_state=0)
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error', 'r2']
result2 = cross_validate(pipeline, X, y, cv=kf, return_train_score=True, scoring=scoring)

# The error scorers are reported negated, so the sign is flipped back
# before aggregating over the folds.
MAE_mean, MAE_std = (
    np.mean(-result2['test_neg_mean_absolute_error']),
    np.std(-result2['test_neg_mean_absolute_error']),
)
MSE_mean, MSE_std = (
    np.mean(-result2['test_neg_mean_squared_error']),
    np.std(-result2['test_neg_mean_squared_error']),
)
RMSE_mean, RMSE_std = (
    np.mean(-result2['test_neg_root_mean_squared_error']),
    np.std(-result2['test_neg_root_mean_squared_error']),
)
R2_Score_mean, R2_Score_std = np.mean(result2['test_r2']), np.std(result2['test_r2'])

# One row per metric: mean and standard deviation over the test folds.
pd.DataFrame(
    [
        [MAE_mean, MAE_std],
        [MSE_mean, MSE_std],
        [RMSE_mean, RMSE_std],
        [R2_Score_mean, R2_Score_std],
    ],
    index=['MAE', 'MSE', 'RMSE' ,'R2-Score'],
    columns=['Mean', 'Std'],
)

Unnamed: 0,Mean,Std
MAE,0.028972,0.000208
MSE,0.002169,3.5e-05
RMSE,0.046574,0.000376
R2-Score,0.937363,0.000811


### Check Results 

In [22]:
# True (scaled) prices of the polynomial test split.
y_poly_test

44712     0.078842
233644    0.539683
121467    0.047587
185846    0.009560
163599    0.026712
            ...   
119620    0.029246
239958    0.366496
39577     0.026327
284720    0.415804
214871    0.339045
Name: price, Length: 90046, dtype: float64

In [23]:
# Prices predicted by the polynomial model for the test split.
y_pred

array([0.08001846, 0.45853317, 0.02562749, ..., 0.02719159, 0.46115403,
       0.3732037 ])

In [24]:
# Actual vs. predicted (scaled) price for every test row, plus the residual
# (actual minus predicted).

pd.DataFrame({'Y_Test': y_poly_test, 'Y_Pred': y_pred}).assign(
    Residuals=lambda frame: frame['Y_Test'] - frame['Y_Pred']
)

Unnamed: 0,Y_Test,Y_Pred,Residuals
44712,0.078842,0.080018,-0.001177
233644,0.539683,0.458533,0.081150
121467,0.047587,0.025627,0.021960
185846,0.009560,0.013349,-0.003789
163599,0.026712,0.031419,-0.004706
...,...,...,...
119620,0.029246,0.026561,0.002685
239958,0.366496,0.474771,-0.108276
39577,0.026327,0.027192,-0.000865
284720,0.415804,0.461154,-0.045350
