# Airline (feature selection and model evaluation)

## Read Data

In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Load the airline dataset from a CSV file located relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Airline-2.csv')

In [3]:
# Display the loaded frame; the rendered table is truncated to its first and last rows.
df

Unnamed: 0,Airline,month(2019),Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,March,Banglore,New Delhi,BLR → DEL,170,0,No info,3897
1,SpiceJet,June,Kolkata,Banglore,CCU → BLR,145,0,No info,3873
2,Jet Airways,March,Banglore,New Delhi,BLR → BOM → DEL,930,1,In-flight meal not included,11087
3,Jet Airways,March,Banglore,New Delhi,BLR → BOM → DEL,1530,1,In-flight meal not included,11087
4,Multiple carriers,May,Delhi,Cochin,DEL → BOM → COK,470,1,No info,8625
...,...,...,...,...,...,...,...,...,...
9045,SpiceJet,May,Banglore,Delhi,BLR → DEL,160,0,No check-in baggage included,3257
9046,Air Asia,April,Kolkata,Banglore,CCU → BLR,150,0,No info,4107
9047,Air India,April,Kolkata,Banglore,CCU → BLR,155,0,No info,4145
9048,Jet Airways,April,Banglore,Delhi,BLR → DEL,180,0,No info,7229


## Import libraries

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder  , StandardScaler
from category_encoders import BinaryEncoder
from sklearn.impute import KNNImputer ,SimpleImputer 
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from sklearn.metrics import r2_score
from category_encoders import BinaryEncoder
from sklearn.model_selection import cross_validate 
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression ,Ridge ,Lasso,ElasticNet , HuberRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import time as te

## Column Transformer

In [5]:
# Show the column labels of df. Their positional order matters: the ColumnTransformer
# defined further down selects columns by integer position, and 'Price' being last
# means dropping it leaves every other position unchanged.
df.columns

Index(['Airline', 'month(2019)', 'Source', 'Destination', 'Route', 'Duration',
       'Total_Stops', 'Additional_Info', 'Price'],
      dtype='object')

In [6]:
# For each categorical column, print how many distinct values it holds and where the
# column sits in df; the positions are what the ColumnTransformer is configured with.
indexing = df.columns.to_list()
categorical_cols = ('Airline', 'month(2019)', 'Source', 'Destination', 'Route', 'Additional_Info')
for col in categorical_cols:
    # dropna=False keeps a missing value counted as its own entry, like len(unique()).
    n_distinct = df[col].nunique(dropna=False)
    position = indexing.index(col)
    print(f'num of unique values in {col} = {n_distinct} and his index = {position}')

num of unique values in Airline = 8 and his index = 0
num of unique values in month(2019) = 4 and his index = 1
num of unique values in Source = 5 and his index = 2
num of unique values in Destination = 6 and his index = 3
num of unique values in Route = 19 and his index = 4
num of unique values in Additional_Info = 3 and his index = 7


In [7]:
# Column encoder used as the first pipeline step. Columns are addressed by position in
# the feature frame (df without its last column 'Price'):
#   0=Airline, 1=month(2019), 2=Source, 3=Destination, 7=Additional_Info -> one-hot,
#       dropping the first level of each feature
#   4=Route (19 distinct values)                                         -> binary encoding
#   5=Duration, 6=Total_Stops                                            -> passed through
# handle_unknown='ignore': inside cross-validation a category can be absent from a
# training fold yet present in the matching validation fold. Without this option the
# encoder raises at transform time and cross_validate records that fold's score as NaN;
# with it, the unseen category is encoded as all zeros.
Encoding = ColumnTransformer(
    transformers=[
        (
            'Encoding1',
            OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
            [0, 1, 2, 3, 7],
        ),
        ('Encoding2', BinaryEncoder(), [4]),
    ],
    remainder='passthrough',
)

## models list

In [8]:
# First batch of candidate regressors, all with library-default hyperparameters.
# Built from an ordered mapping and materialised as a list of (label, estimator)
# tuples, so each entry can be used directly as the final step of a pipeline.
models = list({
    'LR': LinearRegression(),
    'HR': HuberRegressor(),
    'ER': ElasticNet(),
    'LA': Lasso(),
    'RI': Ridge(),
    'KNN': KNeighborsRegressor(),
}.items())

In [9]:
# Second batch of candidate regressors as (label, estimator) tuples.
# RandomForestRegressor and XGBRegressor are stochastic (bootstrap / feature sampling),
# so they are seeded to make the reported scores reproducible across runs; SVR has no
# random component in its default configuration and is left as-is.
models2 = [
    ('RF', RandomForestRegressor(random_state=42)),
    ('SVM', SVR()),
    ('XG', XGBRegressor(random_state=42)),
]

## Features (x) and target (y)

In [10]:
# Split df into the regression target and the predictors: y is the 'Price' column and
# x is every other column. drop() returns a new frame, so df itself is not modified.
y = df['Price']
x = df.drop(columns=['Price'])

In [11]:
# Preview the first rows of the feature frame; 'Price' should no longer be among the columns.
x.head()

Unnamed: 0,Airline,month(2019),Source,Destination,Route,Duration,Total_Stops,Additional_Info
0,IndiGo,March,Banglore,New Delhi,BLR → DEL,170,0,No info
1,SpiceJet,June,Kolkata,Banglore,CCU → BLR,145,0,No info
2,Jet Airways,March,Banglore,New Delhi,BLR → BOM → DEL,930,1,In-flight meal not included
3,Jet Airways,March,Banglore,New Delhi,BLR → BOM → DEL,1530,1,In-flight meal not included
4,Multiple carriers,May,Delhi,Cochin,DEL → BOM → COK,470,1,No info


## Loops

### Loop 1

In [None]:
# Cross-validate every candidate in `models` and print mean train/test R^2 per model.
# Each pipeline is: column encoding -> standardisation -> forward sequential feature
# selection (keeping 95% of the encoded features, scored by R^2 with an inner 5-fold CV)
# -> the regressor itself. cross_validate clones the pipeline for every outer fold, so
# the estimator instance shared by the selector and the final step is never fitted twice.
print(f'start time is {te.asctime()}')
for model in models:
    name, estimator = model
    print('_'*50)
    print('Model:', name)
    steps = [
        ('encoders', Encoding),
        ('scaling', StandardScaler()),
        (
            'FeaSel',
            SequentialFeatureSelector(
                estimator=estimator,
                n_features_to_select=0.95,
                direction='forward',
                scoring='r2',
                cv=5,
            ),
        ),
        (name, estimator),
    ]
    pipeline = Pipeline(steps=steps)
    # The nested selection makes each outer fold expensive; n_jobs=-1 fits the outer
    # folds in parallel on all cores. The folds and the resulting scores are unchanged.
    res = cross_validate(pipeline, x, y, scoring='r2', return_train_score=True, n_jobs=-1)
    print("Train r2 score:", res['train_score'].mean())
    print("Test r2 scorer:", res['test_score'].mean())
    print(f'end time is for model {name} {te.asctime()}')
print('_'*50)

start time is Mon Jan  8 06:59:16 2024
__________________________________________________
Model: LR
Train r2 score: 0.7576077817446046
Test r2 scorer: 0.7558032940005243
end time is for model LR Mon Jan  8 07:01:34 2024
__________________________________________________
Model: HR


### Loop 2

In [None]:
# Cross-validate every candidate in `models2` and print mean train/test R^2 per model.
# Each pipeline is: column encoding -> standardisation -> forward sequential feature
# selection (keeping 95% of the encoded features, scored by R^2 with an inner 5-fold CV)
# -> the regressor itself. cross_validate clones the pipeline for every outer fold, so
# the estimator instance shared by the selector and the final step is never fitted twice.
print(f'start time is {te.asctime()}')
for model in models2:
    name, estimator = model
    print('_'*50)
    print('Model:', name)
    steps = [
        ('encoders', Encoding),
        ('scaling', StandardScaler()),
        (
            'FeaSel',
            SequentialFeatureSelector(
                estimator=estimator,
                n_features_to_select=0.95,
                direction='forward',
                scoring='r2',
                cv=5,
            ),
        ),
        (name, estimator),
    ]
    pipeline = Pipeline(steps=steps)
    # The nested selection makes each outer fold expensive; n_jobs=-1 fits the outer
    # folds in parallel on all cores. The folds themselves are unchanged.
    res = cross_validate(pipeline, x, y, scoring='r2', return_train_score=True, n_jobs=-1)
    print("Train r2 score:", res['train_score'].mean())
    print("Test r2 scorer:", res['test_score'].mean())
    print(f'end time is for model {name} {te.asctime()}')
print('_'*50)