# Airline (Model Tuning & Deployment)

## Read data

In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Load the airline table; the path is resolved relative to the working directory.
csv_path = 'Airline.csv'
df = pd.read_csv(csv_path)

In [3]:
# Show the loaded frame as the cell output to check its columns and row count.
df

Unnamed: 0,Airline,month(2019),Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,March,Banglore,New Delhi,BLR → DEL,170,0,No info,3897
1,Jet Airways,June,Delhi,Cochin,DEL → LKO → BOM → COK,1140,2,No info,13882
2,SpiceJet,June,Kolkata,Banglore,CCU → BLR,145,0,No info,3873
3,Jet Airways,March,Banglore,New Delhi,BLR → BOM → DEL,930,1,In-flight meal not included,11087
4,Jet Airways,March,Banglore,New Delhi,BLR → BOM → DEL,1265,1,No info,22270
...,...,...,...,...,...,...,...,...,...
9775,Air Asia,April,Kolkata,Banglore,CCU → BLR,150,0,No info,4107
9776,Air India,April,Kolkata,Banglore,CCU → BLR,155,0,No info,4145
9777,Jet Airways,April,Banglore,Delhi,BLR → DEL,180,0,No info,7229
9778,Vistara,March,Banglore,New Delhi,BLR → DEL,160,0,No info,12648


## import libs

In [4]:
import time as te

from category_encoders import BinaryEncoder
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import (
    FunctionTransformer,
    LabelEncoder,
    OneHotEncoder,
    PolynomialFeatures,
    RobustScaler,
    StandardScaler,
)
from xgboost import XGBRegressor

## column Transformer

In [5]:
# Column-wise encoding of the feature frame. Columns are addressed by their position
# in whatever frame is passed to fit/transform:
#   * 'Encoding1' one-hot encodes positions 0, 1, 2, 3 and 7, dropping the first level
#     of each column.
#   * 'Encoding2' binary-encodes position 4.
#   * every remaining column is passed through unchanged.
# NOTE(review): the positions presumably correspond to Airline, month(2019), Source,
# Destination, Additional_Info (one-hot) and Route (binary) -- confirm against x.columns.
# handle_unknown='ignore' stops transform from raising when a validation fold or a
# deployed request contains a category that was absent from the rows used for fitting;
# such a value is encoded as all zeros for that column.
Encoding = ColumnTransformer(
    transformers=[
        (
            'Encoding1',
            OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
            [0, 1, 2, 3, 7],
        ),
        ('Encoding2', BinaryEncoder(), [4]),
    ],
    remainder='passthrough',
)

## x,y

In [6]:
# Separate the regression target ('Price') from the feature columns.
y = df['Price']
x = df.drop(columns=['Price'])

## model tuning (KNN,XG,RF)

### KNN

#### steps list

In [7]:
# KNN pipeline: encode -> standardise -> forward feature selection -> KNN regressor.
# The selector keeps 95% of the encoded features, scored by 5-fold R^2 with a default
# KNeighborsRegressor.
knn_steps = [
    ('encoders', Encoding),
    ('scaling', StandardScaler()),
    (
        "FeaSel",
        SequentialFeatureSelector(
            estimator=KNeighborsRegressor(),
            n_features_to_select=0.95,
            direction='forward',
            scoring='r2',
            cv=5,
        ),
    ),
    ('KNN', KNeighborsRegressor()),
]
# The grid search only varies 'KNN__*' parameters, so the three transformer steps are
# identical for every candidate on a given fold. Caching fitted transformers on disk
# lets the expensive feature selection run once per fold instead of once per candidate.
knn_pipeline = Pipeline(steps=knn_steps, memory='pipeline_cache')

#### Params

In [8]:
# Search space for the final 'KNN' step; keys follow the <step>__<parameter> form.
knn_params = dict(
    KNN__n_neighbors=list(range(5, 11)),
    KNN__weights=['uniform', 'distance'],
    KNN__algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

#### grid

In [9]:
# Exhaustive 5-fold search over knn_params scored by R^2. Train scores are recorded as
# well, and error_score='raise' makes any failing fit abort the search immediately.
knn_search_options = {
    'param_grid': knn_params,
    'scoring': 'r2',
    'cv': 5,
    'return_train_score': True,
    'error_score': 'raise',
}
knn_grid = GridSearchCV(knn_pipeline, **knn_search_options)

In [None]:
# Run the KNN grid search and log a wall-clock timestamp on either side of the fit.
started_at = te.asctime()
print(f'start time is {started_at}')
knn_grid.fit(x, y)
finished_at = te.asctime()
print(f'end time is {finished_at}')

start time is Sun Jan  7 10:47:53 2024


In [None]:
# Best mean cross-validated score (scoring='r2') found by the KNN search.
knn_grid.best_score_

In [None]:
# Parameter combination from knn_params that achieved the best score.
knn_grid.best_params_

## Other models (these searches take a long time to run and have not produced results yet)

### XG

#### steps list

In [14]:
# XGBoost pipeline: encode -> robust scaling -> forward feature selection -> XGBRegressor.
# RobustScaler has to be imported from sklearn.preprocessing in the imports cell;
# without that import this cell raises NameError.
xg_steps = [
    ('encoders', Encoding),
    ('scaling', RobustScaler()),
    (
        "FeaSel",
        SequentialFeatureSelector(
            estimator=XGBRegressor(),
            n_features_to_select=0.95,
            direction='forward',
            scoring='r2',
            cv=5,
        ),
    ),
    ('XG', XGBRegressor()),
]
# Only 'XG__*' parameters are tuned, so the transformer steps are identical for every
# candidate on a given fold. Caching fitted transformers on disk lets the expensive
# feature selection run once per fold instead of once per candidate.
xg_pipeline = Pipeline(steps=xg_steps, memory='pipeline_cache')

#### Params

In [15]:
# Search space for the final 'XG' step; keys follow the <step>__<parameter> form.
xg_param = dict(
    XG__learning_rate=[0.01, 0.1, 0.2],
    XG__n_estimators=[100, 200, 300, 400],
    XG__max_depth=[3, 5, 7],
    XG__min_child_weight=[1, 3, 5],
    XG__subsample=[0.8, 1.0],
    XG__colsample_bytree=[0.8, 1.0],
    XG__gamma=[0, 0.1, 0.2],
)

#### grid

In [16]:
# Exhaustive 5-fold search over xg_param scored by R^2; train scores are kept and any
# failing fit raises instead of being recorded as a missing score.
xg_grid = GridSearchCV(
    estimator=xg_pipeline,
    param_grid=xg_param,
    cv=5,
    scoring='r2',
    error_score='raise',
    return_train_score=True,
)

In [17]:
# Time-stamped run of the XGBoost grid search.
print('start time is {}'.format(te.asctime()))
xg_grid.fit(x, y)
print('end time is {}'.format(te.asctime()))

start time is Thu Jan  4 03:54:42 2024


KeyboardInterrupt: 

In [None]:
# Best mean cross-validated score (scoring='r2') found by the XGBoost search.
# NOTE(review): the recorded fit was interrupted, so this presumably only works after
# a fit that runs to completion.
xg_grid.best_score_

In [None]:
# Parameter combination from xg_param that achieved the best score.
# NOTE(review): the recorded fit was interrupted, so this presumably only works after
# a fit that runs to completion.
xg_grid.best_params_

### RF

#### steps list

In [None]:
# Random-forest pipeline: encode -> robust scaling -> forward feature selection ->
# RandomForestRegressor.
# RobustScaler has to be imported from sklearn.preprocessing in the imports cell;
# without that import this cell raises NameError.
# Both forests get a fixed seed so repeated runs select the same features and produce
# the same scores.
rf_steps = [
    ('encoders', Encoding),
    ('scaling', RobustScaler()),
    (
        "FeaSel",
        SequentialFeatureSelector(
            estimator=RandomForestRegressor(random_state=42),
            n_features_to_select=0.95,
            direction='forward',
            scoring='r2',
            cv=5,
        ),
    ),
    ('RF', RandomForestRegressor(random_state=42)),
]
# Only 'RF__*' parameters are tuned, so the transformer steps are identical for every
# candidate on a given fold. Caching fitted transformers on disk lets the expensive
# feature selection run once per fold instead of once per candidate.
rf_pipeline = Pipeline(steps=rf_steps, memory='pipeline_cache')

#### params

In [None]:
# Search space for the final 'RF' step; keys follow the <step>__<parameter> form.
rf_param = {
    'RF__n_estimators': [100, 200, 300, 400],
    'RF__max_depth': [None, 10, 20, 30],
    'RF__min_samples_split': [2, 5, 10],
    'RF__min_samples_leaf': [1, 2, 4],
    # 'auto' was removed from RandomForestRegressor in scikit-learn 1.3 and would make
    # the search raise under error_score='raise'. For regressors it meant "use all
    # features", which 1.0 expresses on every version.
    'RF__max_features': [1.0, 'sqrt', 'log2'],
    'RF__bootstrap': [True, False]
}

#### grid

In [None]:
# Exhaustive 5-fold search over rf_param scored by R^2; train scores are kept and any
# failing fit raises instead of being recorded as a missing score.
rf_search_options = dict(
    scoring='r2',
    cv=5,
    return_train_score=True,
    error_score='raise',
)
rf_grid = GridSearchCV(rf_pipeline, param_grid=rf_param, **rf_search_options)

In [None]:
# Time-stamped run of the random-forest grid search.
print('start time is', te.asctime())
rf_grid.fit(x, y)
print('end time is', te.asctime())

In [None]:
# Best mean cross-validated score (scoring='r2') found by the random-forest search.
rf_grid.best_score_

In [None]:
# Parameter combination from rf_param that achieved the best score.
rf_grid.best_params_

## save model

In [14]:
# knn_grid is built without a refit argument, so GridSearchCV's default (refit=True)
# has already retrained the winning pipeline on the full (x, y) passed to fit.
# Calling .fit(x, y) again would only repeat the costly feature-selection step on the
# same data, so the fitted estimator is taken as-is.
model = knn_grid.best_estimator_

In [15]:
import joblib

In [16]:
# Persist the fitted pipeline to disk; the call's return value (the list of written
# file names) is shown as the cell output.
joblib.dump(value=model, filename='KNN_model.pkl')

['KNN_model.pkl']

In [17]:
# Display the saved pipeline as the cell output to review its steps and settings.
model