In [242]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

#Reading the data

In [243]:
df = pd.read_csv("/content/drive/MyDrive/sertificat/train_data.csv", index_col=0, skipinitialspace=True)
df=df.drop('flight',axis=1)
df.head(2)

Unnamed: 0_level_0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Vistara,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
2,SpiceJet,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292


In [244]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 1 to 20000
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           20000 non-null  object 
 1   source_city       20000 non-null  object 
 2   departure_time    20000 non-null  object 
 3   stops             20000 non-null  object 
 4   arrival_time      20000 non-null  object 
 5   destination_city  20000 non-null  object 
 6   class             20000 non-null  object 
 7   duration          20000 non-null  float64
 8   days_left         20000 non-null  int64  
 9   price             20000 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 1.7+ MB


###onverting Categorical values into integer

In [245]:
df.replace({"departure_time":{"Early_Morning":0,"Morning":1,"Afternoon":3,"Evening":4,"Night":5,"Late_Night":6}},inplace=True)
df.replace({"arrival_time":{"Early_Morning":1,"Morning":2,"Afternoon":3,"Evening":4,"Night":5,"Late_Night":0}},inplace=True)
df.replace({"airline":{"Vistara":6,"Air_India":5,"Indigo":2,"GO_FIRST":3,"AirAsia":1,"SpiceJet":4}},inplace=True)
df.replace({"class":{"Economy":1,"Business":10}},inplace=True)
df.replace({"stops":{"zero":1,"one":2,"two_or_more":4}},inplace=True)
#df.replace({"source_city":{"Mumbai":1,"Delhi":2,"Bangalore":3,"Kolkata":4,"Hyderabad":5,"Chennai":6}},inplace=True)
#df.replace({"destination_city":{"Mumbai":1,"Delhi":2,"Bangalore":3,"Kolkata":4,"Hyderabad":5,"Chennai":6}},inplace=True)
df.head()

Unnamed: 0_level_0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,6,Bangalore,0,2,5,Mumbai,1,14.25,21,7212
2,4,Hyderabad,4,1,5,Kolkata,1,1.75,7,5292
3,6,Bangalore,1,2,4,Delhi,10,9.58,5,60553
4,6,Kolkata,1,2,4,Hyderabad,1,6.75,28,5760
5,2,Chennai,0,1,2,Mumbai,1,2.0,4,10712


###Splitting the data to the train test

In [246]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.15,random_state=42)

X_train = train_set.drop("price",axis=1)
y = train_set["price"].copy()

X_num = X_train.drop(['destination_city','source_city'], axis=1)

###Pipeline

In [247]:
from sklearn.base import BaseEstimator, TransformerMixin
airline_ix,  class_ix, duration_ix, stops_ix= 0, 4,5,2

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
  def __init__(self):
    pass
  def fit(self, X, y=None):
    return self
  def transform(self,X):
    #stpsperair = X[:,stops_ix] / X[:,airline_ix]
    #durperstps = X[:,duration_ix] / X[:,stops_ix]
    busclperair = X[:,class_ix] / X[:,airline_ix]


    return np.c_[X,busclperair]


In [248]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipeline = Pipeline([
    ('std_acaler',StandardScaler()),
    ('attribs_adder', CombinedAttributesAdder())
])

###Full Pipeline

In [249]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)
cat_attribs = ['destination_city','source_city']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

In [250]:
X_prepared = full_pipeline.fit_transform(X_train)

In [251]:
X_prepared

array([[-0.96153253, -1.28988917,  0.05917813, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.24321784, -1.28988917,  0.05917813, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.84559302, -0.75560439,  0.05917813, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.24321784, -0.75560439,  0.05917813, ...,  0.        ,
         0.        ,  0.        ],
       [-0.96153253, -1.28988917,  0.05917813, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56390771, -0.75560439,  0.05917813, ...,  0.        ,
         0.        ,  1.        ]])

###Machine Learning

###LinearRegression

In [252]:
from sklearn.linear_model import LinearRegression
LR_model = LinearRegression()
LR_model.fit(X_prepared,y)

In [253]:
X_test = test_set.drop('price',axis=1)
X_test.head(3)


Unnamed: 0_level_0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10651,3,Hyderabad,1,2,5,Delhi,1,13.58,49
2042,6,Delhi,4,2,2,Bangalore,10,16.08,9
8669,6,Delhi,4,2,2,Chennai,10,15.08,9


In [254]:
y_test = test_set['price'].copy()
y_test

id
10651     5361
2042     50297
8669     60232
1115     74365
13903     4417
         ...  
3980     11888
12386    49207
8587      5276
11214     6583
13807    64285
Name: price, Length: 3000, dtype: int64

In [255]:
X_test_prepared = full_pipeline.transform(X_test)

In [256]:
y_predicted=LR_model.predict(X_test_prepared)

In [261]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

mse=mean_absolute_error(y_test, y_predicted)

print(f"{mse=}")

mse=2163.3796666666667


###DecisionTreeRegressor

In [258]:
from sklearn.tree import DecisionTreeRegressor
Tree_model = DecisionTreeRegressor()
Tree_model.fit(X_prepared, y)


In [259]:
y_predicted = Tree_model.predict(X_test_prepared)

In [262]:
mse=mean_absolute_error(y_test, y_predicted)

print(f"{mse=}")

mse=2163.3796666666667
