In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline

In [2]:
df = pd.read_csv("Clean_Dataset.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [4]:
df.columns

Index(['Unnamed: 0', 'airline', 'flight', 'source_city', 'departure_time',
       'stops', 'arrival_time', 'destination_city', 'class', 'duration',
       'days_left', 'price'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        300153 non-null  int64  
 1   airline           300153 non-null  object 
 2   flight            300153 non-null  object 
 3   source_city       300153 non-null  object 
 4   departure_time    300153 non-null  object 
 5   stops             300153 non-null  object 
 6   arrival_time      300153 non-null  object 
 7   destination_city  300153 non-null  object 
 8   class             300153 non-null  object 
 9   duration          300153 non-null  float64
 10  days_left         300153 non-null  int64  
 11  price             300153 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 27.5+ MB


In [6]:
df.drop("Unnamed: 0", axis = 1, inplace = True)

In [7]:
df.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [8]:
df.describe()

Unnamed: 0,duration,days_left,price
count,300153.0,300153.0,300153.0
mean,12.221021,26.004751,20889.660523
std,7.191997,13.561004,22697.767366
min,0.83,1.0,1105.0
25%,6.83,15.0,4783.0
50%,11.25,26.0,7425.0
75%,16.17,38.0,42521.0
max,49.83,49.0,123071.0


In [9]:
df[df.duplicated()]

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price


In [10]:
df["airline"].unique()

array(['SpiceJet', 'AirAsia', 'Vistara', 'GO_FIRST', 'Indigo',
       'Air_India'], dtype=object)

In [11]:
df["flight"].unique()

array(['SG-8709', 'SG-8157', 'I5-764', ..., '6E-7127', '6E-7259',
       'AI-433'], dtype=object)

In [12]:
df.nunique()

airline                 6
flight               1561
source_city             6
departure_time          6
stops                   3
arrival_time            6
destination_city        6
class                   2
duration              476
days_left              49
price               12157
dtype: int64

In [13]:
df["stops"].unique()

array(['zero', 'one', 'two_or_more'], dtype=object)

In [14]:
df["stops"] = df["stops"].str.replace("zero", "0")

In [15]:
df["stops"] = df["stops"].str.replace("one", "1")

In [16]:
df["stops"] = df["stops"].str.replace("two_or_more", "2")

In [17]:
df["stops"].unique()

array(['0', '1', '2'], dtype=object)

In [18]:
df["stops"] = df["stops"].astype(int)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   flight            300153 non-null  object 
 2   source_city       300153 non-null  object 
 3   departure_time    300153 non-null  object 
 4   stops             300153 non-null  int64  
 5   arrival_time      300153 non-null  object 
 6   destination_city  300153 non-null  object 
 7   class             300153 non-null  object 
 8   duration          300153 non-null  float64
 9   days_left         300153 non-null  int64  
 10  price             300153 non-null  int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 25.2+ MB


In [20]:
df.drop("flight", axis = 1, inplace=True)

In [21]:
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 
from sklearn.tree import DecisionTreeRegressor 

In [22]:
X = df.drop("price", axis = 1)
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state=15)

In [23]:
categorical_cols = ["airline", "source_city", "departure_time", "arrival_time", "destination_city", "class"]
numeric_cols = ["stops", "duration", "days_left"]

In [24]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop = "first", handle_unknown="ignore"), categorical_cols)
    ], remainder="passthrough"
)

In [25]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [26]:
encoded_cols = preprocessor.get_feature_names_out()

In [27]:
encoded_cols

array(['cat__airline_Air_India', 'cat__airline_GO_FIRST',
       'cat__airline_Indigo', 'cat__airline_SpiceJet',
       'cat__airline_Vistara', 'cat__source_city_Chennai',
       'cat__source_city_Delhi', 'cat__source_city_Hyderabad',
       'cat__source_city_Kolkata', 'cat__source_city_Mumbai',
       'cat__departure_time_Early_Morning', 'cat__departure_time_Evening',
       'cat__departure_time_Late_Night', 'cat__departure_time_Morning',
       'cat__departure_time_Night', 'cat__arrival_time_Early_Morning',
       'cat__arrival_time_Evening', 'cat__arrival_time_Late_Night',
       'cat__arrival_time_Morning', 'cat__arrival_time_Night',
       'cat__destination_city_Chennai', 'cat__destination_city_Delhi',
       'cat__destination_city_Hyderabad', 'cat__destination_city_Kolkata',
       'cat__destination_city_Mumbai', 'cat__class_Economy',
       'remainder__stops', 'remainder__duration', 'remainder__days_left'],
      dtype=object)

In [28]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1656758 stored elements and shape (210107, 29)>

In [29]:
X_train = X_train.toarray()
X_test = X_test.toarray()

In [30]:
X_train = pd.DataFrame(X_train, columns=encoded_cols)
X_test = pd.DataFrame(X_test, columns=encoded_cols)

In [31]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210107 entries, 0 to 210106
Data columns (total 29 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   cat__airline_Air_India             210107 non-null  float64
 1   cat__airline_GO_FIRST              210107 non-null  float64
 2   cat__airline_Indigo                210107 non-null  float64
 3   cat__airline_SpiceJet              210107 non-null  float64
 4   cat__airline_Vistara               210107 non-null  float64
 5   cat__source_city_Chennai           210107 non-null  float64
 6   cat__source_city_Delhi             210107 non-null  float64
 7   cat__source_city_Hyderabad         210107 non-null  float64
 8   cat__source_city_Kolkata           210107 non-null  float64
 9   cat__source_city_Mumbai            210107 non-null  float64
 10  cat__departure_time_Early_Morning  210107 non-null  float64
 11  cat__departure_time_Evening        2101

In [32]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

In [33]:
y_pred = model.predict(X_test)

In [34]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error 
from sklearn.model_selection import RandomizedSearchCV 

In [35]:
print("R2 Score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))

R2 Score:  0.9756907291949359
Mean Absolute Error:  1192.648585167581
Mean Squared Error:  12530260.951821847


In [36]:
# hyperparameter tuning 

In [44]:
params = {
    "criterion": ["squared_error", "friedman_mse", "poisson", "absolute_error"], 
    "max_depth": [10,15,20],  
    "max_features": ["log2", "sqrt"]
} 

In [45]:
rsv = RandomizedSearchCV(estimator=DecisionTreeRegressor(), cv = 5, param_distributions=params, n_jobs=-1)
rsv.fit(X_train, y_train)

Exception ignored in: <function ResourceTracker.__del__ at 0x107299080>
Traceback (most recent call last):
  File "/Users/batuhanbasoda/Desktop/CENG/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/batuhanbasoda/Desktop/CENG/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/batuhanbasoda/Desktop/CENG/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x111115080>
Traceback (most recent call last):
  File "/Users/batuhanbasoda/Desktop/CENG/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/batuhanbasoda/Desktop/CENG/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/batuhanbasoda/Desktop/CENG/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 

In [46]:
y_pred = rsv.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))

R2 Score:  0.971216782995103
Mean Absolute Error:  1778.144005316842
Mean Squared Error:  14836365.228575384


In [47]:
rsv.best_params_

{'max_features': 'sqrt', 'max_depth': 20, 'criterion': 'poisson'}