In [181]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [182]:
df = pd.read_csv("/content/flightPrice.csv")

In [183]:
df

Unnamed: 0,Date_of_journey,Journey_day,Airline,Flight_code,Class,Source,Departure,Total_stops,Arrival,Destination,Duration_in_hours,Days_left,Fare
0,2023-01-16,Monday,SpiceJet,SG-8169,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5335
1,2023-01-16,Monday,Indigo,6E-2519,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.3333,1,5899
2,2023-01-16,Monday,GO FIRST,G8-354,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.1667,1,5801
3,2023-01-16,Monday,SpiceJet,SG-8709,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5794
4,2023-01-16,Monday,Air India,AI-805,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.1667,1,5955
...,...,...,...,...,...,...,...,...,...,...,...,...,...
452083,2023-03-06,Monday,Vistara,UK-926,Business,Ahmedabad,6 AM - 12 PM,1-stop,After 6 PM,Chennai,13.0833,50,65028
452084,2023-03-06,Monday,Vistara,UK-918,Business,Ahmedabad,Before 6 AM,1-stop,12 PM - 6 PM,Chennai,11.2500,50,69254
452085,2023-03-06,Monday,Vistara,UK-918,Business,Ahmedabad,Before 6 AM,1-stop,12 PM - 6 PM,Chennai,11.2500,50,69254
452086,2023-03-06,Monday,Vistara,UK-946,Business,Ahmedabad,6 AM - 12 PM,1-stop,After 6 PM,Chennai,11.1667,50,72980


In [184]:
df.isnull().sum()

Date_of_journey      0
Journey_day          0
Airline              0
Flight_code          0
Class                0
Source               0
Departure            0
Total_stops          0
Arrival              0
Destination          0
Duration_in_hours    0
Days_left            0
Fare                 0
dtype: int64

In [185]:
df['Date_of_journey'] = pd.to_datetime(df['Date_of_journey'], format='%Y-%m-%d')
df["Day"] = df["Date_of_journey"].dt.day
df["Month"] = df["Date_of_journey"].dt.month
df["Year"] = df["Date_of_journey"].dt.year
df.drop(columns=["Date_of_journey"], inplace=True)

In [186]:
encoder = OneHotEncoder(drop='first', sparse_output=False)

In [187]:
columns_to_encode = ['Class', 'Source','Airline','Journey_day','Destination']

In [188]:
one_hot_encoded_data = encoder.fit_transform(df[columns_to_encode])

In [189]:
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_data, columns=encoder.get_feature_names_out(columns_to_encode))

one_hot_encoded_df.index = df.index

df.drop(columns=columns_to_encode, inplace=True)

df = pd.concat([df, one_hot_encoded_df], axis=1)


In [190]:
label_encoder = LabelEncoder()
df["Total_stops_encoded"] = label_encoder.fit_transform(df["Total_stops"])

df["Total_stops"] = df["Total_stops_encoded"]

In [191]:
df.drop(columns=["Flight_code"], inplace=True)

In [192]:
df["Departure"].unique()

array(['After 6 PM', 'Before 6 AM', '12 PM - 6 PM', '6 AM - 12 PM'],
      dtype=object)

In [193]:
df["Arrival"].unique()

array(['After 6 PM', 'Before 6 AM', '6 AM - 12 PM', '12 PM - 6 PM'],
      dtype=object)

In [194]:
Time_Slots = ['After 6 PM', 'Before 6 AM', '12 PM - 6 PM', '6 AM - 12 PM']

In [195]:
encoder = OneHotEncoder(categories=[Time_Slots,Time_Slots], drop='first', sparse_output=False)

In [196]:
one_hot_encoded_data = encoder.fit_transform(df[['Arrival', 'Departure']])

feature_names = encoder.get_feature_names_out(['Arrival', 'Departure'])

In [197]:
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_data, columns=feature_names)
one_hot_encoded_df.index = df.index

In [198]:
df.drop(columns=['Arrival', 'Departure'], inplace=True)

In [199]:
df = pd.concat([df, one_hot_encoded_df], axis=1)

In [200]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452088 entries, 0 to 452087
Data columns (total 43 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Total_stops             452088 non-null  int64  
 1   Duration_in_hours       452088 non-null  float64
 2   Days_left               452088 non-null  int64  
 3   Fare                    452088 non-null  int64  
 4   Day                     452088 non-null  int32  
 5   Month                   452088 non-null  int32  
 6   Year                    452088 non-null  int32  
 7   Class_Economy           452088 non-null  float64
 8   Class_First             452088 non-null  float64
 9   Class_Premium Economy   452088 non-null  float64
 10  Source_Bangalore        452088 non-null  float64
 11  Source_Chennai          452088 non-null  float64
 12  Source_Delhi            452088 non-null  float64
 13  Source_Hyderabad        452088 non-null  float64
 14  Source_Kolkata      

In [201]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [202]:
y = df['Fare']
X = df.drop(columns=['Fare'])

In [203]:
X = sm.add_constant(X)

In [204]:
model = sm.OLS(y, X).fit()

In [205]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Fare   R-squared:                       0.850
Model:                            OLS   Adj. R-squared:                  0.850
Method:                 Least Squares   F-statistic:                 6.385e+04
Date:                Wed, 22 May 2024   Prob (F-statistic):               0.00
Time:                        19:54:37   Log-Likelihood:            -4.6974e+06
No. Observations:              452088   AIC:                         9.395e+06
Df Residuals:                  452047   BIC:                         9.395e+06
Df Model:                          40                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Total_stops            -1661

In [206]:
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)

  vif = 1. / (1. - r_squared_i)


                   Feature          VIF
0              Total_stops          inf
1        Duration_in_hours     1.570360
2                Days_left   426.854725
3                      Day   176.346354
4                    Month   745.900733
5                     Year  4228.517933
6            Class_Economy     1.635916
7              Class_First     1.005208
8    Class_Premium Economy     1.511946
9         Source_Bangalore     2.306218
10          Source_Chennai     2.072468
11            Source_Delhi     2.454650
12        Source_Hyderabad     2.045468
13          Source_Kolkata     2.023243
14           Source_Mumbai     2.409069
15         Airline_AirAsia     1.189139
16        Airline_AkasaAir     1.072930
17     Airline_AllianceAir     1.011099
18        Airline_GO FIRST     1.135770
19          Airline_Indigo     1.835800
20        Airline_SpiceJet     1.060705
21         Airline_StarAir     1.002113
22         Airline_Vistara     1.693526
23      Journey_day_Monday     1.805282


In [207]:
# Extract p-values
p_values = model.pvalues

In [208]:
threshold = 0.05

In [209]:
features_above_threshold = p_values[p_values > threshold]

In [210]:
print("Features with p-value greater than 0.05:")
print(features_above_threshold)

Features with p-value greater than 0.05:
Airline_Indigo           0.079127
Airline_SpiceJet         0.063478
Journey_day_Monday       0.311049
Journey_day_Wednesday    0.592350
Destination_Delhi        0.944088
Arrival_12 PM - 6 PM     0.055597
dtype: float64


In [211]:
# Drop the 'Destination_Delhi' column as it has the highest value of p
X = X.drop(columns=['Destination_Delhi'])

So we have dropped the column with the highest p value and now we will fil this on OLS to see what changes we got.

In [212]:
# Add a constant term to the feature matrix
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

In [213]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Fare   R-squared:                       0.850
Model:                            OLS   Adj. R-squared:                  0.850
Method:                 Least Squares   F-statistic:                 6.549e+04
Date:                Wed, 22 May 2024   Prob (F-statistic):               0.00
Time:                        19:59:17   Log-Likelihood:            -4.6974e+06
No. Observations:              452088   AIC:                         9.395e+06
Df Residuals:                  452048   BIC:                         9.395e+06
Df Model:                          39                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Total_stops            -1661

We didnt see any changes lets try droppping another values with highest p now

In [214]:
def extract_features_above_threshold(model, threshold=0.05):
    """
    Extract features with p-values greater than a specified threshold from the model.

    Args:
    - model: Fitted statsmodels regression model.
    - threshold (float): Threshold for p-values. Default is 0.05.

    Returns:
    - features_above_threshold: Series containing features with p-values greater than the threshold.
    """
    # Extract p-values
    p_values = model.pvalues

    # Filter features with p-values greater than the threshold
    features_above_threshold = p_values[p_values > threshold]

    return features_above_threshold

# Usage example
features_above_threshold = extract_features_above_threshold(model)
print("Features with p-value greater than 0.05:")
print(features_above_threshold)


Features with p-value greater than 0.05:
Airline_Indigo           0.079326
Airline_SpiceJet         0.063627
Journey_day_Monday       0.311009
Journey_day_Wednesday    0.592274
Arrival_12 PM - 6 PM     0.054994
dtype: float64


In [215]:
# Drop the 'Journey_day_Wednesday' column as it has the highest value of p
X = X.drop(columns=['Journey_day_Wednesday'])

In [216]:
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

In [217]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Fare   R-squared:                       0.850
Model:                            OLS   Adj. R-squared:                  0.850
Method:                 Least Squares   F-statistic:                 6.721e+04
Date:                Wed, 22 May 2024   Prob (F-statistic):               0.00
Time:                        19:59:38   Log-Likelihood:            -4.6974e+06
No. Observations:              452088   AIC:                         9.395e+06
Df Residuals:                  452049   BIC:                         9.395e+06
Df Model:                          38                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Total_stops            -1661

Still No Effect, Let's repeat this process till we see any changes or we have all values less than 0.05

In [218]:
 def extract_features_above_threshold(model, threshold=0.05):
    """
    Extract features with p-values greater than a specified threshold from the model.

    Args:
    - model: Fitted statsmodels regression model.
    - threshold (float): Threshold for p-values. Default is 0.05.

    Returns:
    - features_above_threshold: Series containing features with p-values greater than the threshold.
    """
    # Extract p-values
    p_values = model.pvalues

    # Filter features with p-values greater than the threshold
    features_above_threshold = p_values[p_values > threshold]

    return features_above_threshold

# Usage example
features_above_threshold = extract_features_above_threshold(model)
print("Features with p-value greater than 0.05:")
print(features_above_threshold)


Features with p-value greater than 0.05:
Airline_Indigo          0.079512
Airline_SpiceJet        0.063380
Journey_day_Monday      0.389517
Arrival_12 PM - 6 PM    0.055017
dtype: float64


We see this time "Journey_day_Monday" has highest p value

In [219]:
# Drop the 'Journey_day_Monday' column as it has the highest value of p
X = X.drop(columns=['Journey_day_Monday'])

In [220]:
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

In [221]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Fare   R-squared:                       0.850
Model:                            OLS   Adj. R-squared:                  0.850
Method:                 Least Squares   F-statistic:                 6.903e+04
Date:                Wed, 22 May 2024   Prob (F-statistic):               0.00
Time:                        20:01:26   Log-Likelihood:            -4.6974e+06
No. Observations:              452088   AIC:                         9.395e+06
Df Residuals:                  452050   BIC:                         9.395e+06
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Total_stops            -1661

In [222]:
 def extract_features_above_threshold(model, threshold=0.05):
    """
    Extract features with p-values greater than a specified threshold from the model.

    Args:
    - model: Fitted statsmodels regression model.
    - threshold (float): Threshold for p-values. Default is 0.05.

    Returns:
    - features_above_threshold: Series containing features with p-values greater than the threshold.
    """
    # Extract p-values
    p_values = model.pvalues

    # Filter features with p-values greater than the threshold
    features_above_threshold = p_values[p_values > threshold]

    return features_above_threshold

# Usage example
features_above_threshold = extract_features_above_threshold(model)
print("Features with p-value greater than 0.05:")
print(features_above_threshold)


Features with p-value greater than 0.05:
Airline_Indigo          0.079295
Airline_SpiceJet        0.063518
Arrival_12 PM - 6 PM    0.054432
dtype: float64


Now drop "Airline_Indigo" as it has highest p value for the remaining values

In [223]:
# Drop the 'Airline_Indigo' column as it has the highest value of p
X = X.drop(columns=['Airline_Indigo'])

In [224]:
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

In [225]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Fare   R-squared:                       0.850
Model:                            OLS   Adj. R-squared:                  0.850
Method:                 Least Squares   F-statistic:                 7.094e+04
Date:                Wed, 22 May 2024   Prob (F-statistic):               0.00
Time:                        20:01:44   Log-Likelihood:            -4.6974e+06
No. Observations:              452088   AIC:                         9.395e+06
Df Residuals:                  452051   BIC:                         9.395e+06
Df Model:                          36                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Total_stops            -1661

In [229]:
X = X.drop(columns=['Airline_SpiceJet'])
X = X.drop(columns=['Arrival_12 PM - 6 PM'])

In [230]:
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

In [232]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Fare   R-squared:                       0.850
Model:                            OLS   Adj. R-squared:                  0.850
Method:                 Least Squares   F-statistic:                 7.512e+04
Date:                Wed, 22 May 2024   Prob (F-statistic):               0.00
Time:                        20:04:38   Log-Likelihood:            -4.6974e+06
No. Observations:              452088   AIC:                         9.395e+06
Df Residuals:                  452053   BIC:                         9.395e+06
Df Model:                          34                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Total_stops            -1662

In [234]:
# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Print the VIF values
print(vif_data)

# Check how many features have VIF values greater than 10
high_vif_features = vif_data[vif_data["VIF"] > 10]
print(f"Number of features with VIF greater than 10: {len(high_vif_features)}")
print(high_vif_features)

  vif = 1. / (1. - r_squared_i)


                   Feature          VIF
0              Total_stops          inf
1        Duration_in_hours     1.342628
2                Days_left   426.223334
3                      Day   175.843188
4                    Month   744.641102
5                     Year  4210.917336
6            Class_Economy     1.473128
7              Class_First     1.004592
8    Class_Premium Economy     1.468700
9         Source_Bangalore     2.298631
10          Source_Chennai     2.065289
11            Source_Delhi     2.385373
12        Source_Hyderabad     2.033664
13          Source_Kolkata     2.015463
14           Source_Mumbai     2.381759
15         Airline_AirAsia     1.076514
16        Airline_AkasaAir     1.049735
17     Airline_AllianceAir     1.007045
18        Airline_GO FIRST     1.062256
19         Airline_StarAir     1.001454
20         Airline_Vistara     1.407625
21    Journey_day_Saturday     1.135548
22      Journey_day_Sunday     1.142564
23    Journey_day_Thursday     1.137259
