In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/aviachipta-narxini-bashorat-qilish/train_data.csv
/kaggle/input/aviachipta-narxini-bashorat-qilish/test_data.csv
/kaggle/input/aviachipta-narxini-bashorat-qilish/sample_solution.csv


In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBClassifier

In [5]:
# Competition files live under Kaggle's read-only input mount. The directory is
# named once so both reads stay in sync if the dataset is attached elsewhere.
DATA_DIR = "/kaggle/input/aviachipta-narxini-bashorat-qilish"

train_df = pd.read_csv(f"{DATA_DIR}/train_data.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test_data.csv")

# Preview the first training rows as the cell output.
train_df.head()

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
1,2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
2,3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
3,4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
4,5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.0,4,10712


In [6]:
# Print the schema summary of the training frame, a divider, then the test frame.
divider = '_' * 40
for position, frame in enumerate((train_df, test_df)):
    if position > 0:
        print(divider)
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                20000 non-null  int64  
 1   airline           20000 non-null  object 
 2   flight            20000 non-null  object 
 3   source_city       20000 non-null  object 
 4   departure_time    20000 non-null  object 
 5   stops             20000 non-null  object 
 6   arrival_time      20000 non-null  object 
 7   destination_city  20000 non-null  object 
 8   class             20000 non-null  object 
 9   duration          20000 non-null  float64
 10  days_left         20000 non-null  int64  
 11  price             20000 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 1.8+ MB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column            

In [7]:
# Categorical columns whose distinct values are listed side by side.
columns = ['airline', 'source_city', 'destination_city', 'departure_time', 'arrival_time', 'stops', 'class']
COLUMN_WIDTH = 20

# Compute each column's distinct values once, rather than once per printed cell.
unique_by_col = {col: train_df[col].unique() for col in columns}
max_len = max(len(values) for values in unique_by_col.values())

# Header row: capitalised column names, left-aligned in fixed-width cells.
print("".join(f"{col.capitalize():<{COLUMN_WIDTH}}" for col in columns))

# One row per rank; columns with fewer distinct values are padded with blanks.
for i in range(max_len):
    cells = (
        unique_by_col[col][i] if i < len(unique_by_col[col]) else ""
        for col in columns
    )
    print("".join(f"{cell:<{COLUMN_WIDTH}}" for cell in cells))

Airline             Source_city         Destination_city    Departure_time      Arrival_time        Stops               Class               
Vistara             Bangalore           Mumbai              Early_Morning       Night               one                 Economy             
SpiceJet            Hyderabad           Kolkata             Evening             Evening             zero                Business            
Indigo              Kolkata             Delhi               Morning             Morning             two_or_more                             
Air_India           Chennai             Hyderabad           Afternoon           Afternoon                                                   
GO_FIRST            Delhi               Chennai             Night               Early_Morning                                               
AirAsia             Mumbai              Bangalore           Late_Night          Late_Night                                                  


# Preprocessing for Machine Learning

In [8]:
def flight(data):
    """Replace the raw ``flight`` code with its prefix, modifying ``data`` in place.

    The text before the first hyphen of ``flight`` (e.g. ``"UK"`` from
    ``"UK-810"``) is stored in a new ``flight_country`` column. The ``id`` and
    ``flight`` columns (and any ``flight_number`` column) are then dropped.

    The function is safe to call more than once on the same frame: when
    ``flight`` has already been removed the existing ``flight_country`` column
    is left untouched, so re-running the notebook cell does not raise.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    None
    """
    if "flight" in data.columns:
        # n=1 splits on the first hyphen only, so codes with extra hyphens
        # (or none at all) still yield exactly one prefix per row.
        data["flight_country"] = data["flight"].str.split("-", n=1).str[0]

    # errors="ignore" lets this run on frames that lack some of these columns.
    data.drop(columns=["id", "flight", "flight_number"], errors="ignore", inplace=True)
    
# flight() returns nothing and changes train_df in place: it adds `flight_country`
# and removes `id` and `flight`.
# NOTE(review): the visible cells never apply this to test_df — confirm it is
# done before test_df is preprocessed.
flight(train_df)

In [9]:
def preprocess(data, cat_cols, num_cols, target_col="price"):
    """Build a model-ready frame: one-hot categoricals, min-max scaled numerics, target.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows. Not modified.
    cat_cols : list of str
        Columns to one-hot encode with ``pd.get_dummies`` (``int8`` indicator
        columns named ``<col>_<value>``).
    num_cols : list of str
        Columns rescaled with a ``MinMaxScaler`` fitted on ``data`` itself.
    target_col : str or None, default "price"
        Column copied through unchanged as the last column. Skipped when it is
        ``None`` or absent from ``data``, so frames without a target can be
        prepared too.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``data``; columns are the dummy columns in ``cat_cols``
        order, then ``num_cols``, then the target (if present).

    Notes
    -----
    Both the dummy columns and the scaling range are derived from whatever
    ``data`` is passed in, so two frames prepared separately can end up with
    different columns and scales.
    """
    # One frame of indicator columns per categorical feature, in cat_cols order.
    parts = [pd.get_dummies(data[col], prefix=col, dtype=np.int8) for col in cat_cols]

    if len(num_cols) > 0:
        scaled = MinMaxScaler().fit_transform(data[num_cols])
        parts.append(pd.DataFrame(scaled, columns=num_cols, index=data.index))

    if target_col is not None and target_col in data.columns:
        parts.append(data[target_col])

    if not parts:
        # Nothing was requested: keep the row index so the result still lines up with data.
        return pd.DataFrame(index=data.index)

    # Every part shares data.index, so one column-wise concat replaces the merge chain.
    return pd.concat(parts, axis=1)

# Features to one-hot encode; `flight_country` is the column added by flight().
cat_cols = [
    "airline",
    "source_city",
    "departure_time",
    "stops",
    "arrival_time",
    "destination_city",
    "class",
    "flight_country",
]
# Numeric features to rescale.
num_cols = ["duration", "days_left"]

prepared_data = preprocess(train_df, cat_cols=cat_cols, num_cols=num_cols)
prepared_data.head()

Unnamed: 0,airline_AirAsia,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,source_city_Bangalore,source_city_Chennai,source_city_Delhi,source_city_Hyderabad,...,class_Economy,flight_country_6E,flight_country_AI,flight_country_G8,flight_country_I5,flight_country_SG,flight_country_UK,duration,days_left,price
0,0,0,0,0,0,1,1,0,0,0,...,1,0,0,0,0,0,1,0.355497,0.416667,7212
1,0,0,0,0,1,0,0,0,0,1,...,1,0,0,0,0,1,0,0.024371,0.125,5292
2,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0.231788,0.083333,60553
3,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0.156821,0.5625,5760
4,0,0,0,1,0,0,0,1,0,0,...,1,1,0,0,0,0,0,0.030993,0.0625,10712


# Splitting Data

In [10]:
# Hold out 20% of the labelled rows for scoring. This `test_set` is a validation
# split of the training data, not the competition file loaded into `test_df`.
train_set, test_set = train_test_split(prepared_data, random_state=42, test_size=0.2)

# Separate the features from the target for each split.
X_train, y_train = train_set.drop(columns="price"), train_set["price"]
X_test, y_test = test_set.drop(columns="price"), test_set["price"]

# Model Selection

## Linear Regression

## Decision Tree

In [20]:
# Baseline: an unconstrained decision tree, scored by RMSE on the hold-out split.
# random_state fixes how ties between equally good splits are broken, so the
# score is the same after every kernel restart.
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train, y_train)

y_pred = dt_regressor.predict(X_test)

# The `lin_` prefix is presumably left over from the Linear Regression section;
# here these hold the decision tree's MSE and RMSE.
lin_mse = mean_squared_error(y_test, y_pred)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

4966.6400293294355


In [None]:
# Presumably the best_params_ reported by the (now commented-out) decision-tree
# grid search in the next cell, pasted here for reference; this cell has no
# execution count. NOTE(review): confirm the origin of these values.
{'max_depth': 64, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 50}

In [24]:
# param_grid = {
#     'max_depth': [2, 4, 8, 16, 32, 64, None],
#     'min_samples_split': [2,10, 30, 50, 100, 200, 300, 700],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2']
# }

# # Perform GridSearchCV to find the best parameters
# grid_search = GridSearchCV(estimator=dt_regressor, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)

# # Print the best parameters and best score
# print("Best parameters found: ", grid_search.best_params_)
# print("Best score found: ", grid_search.best_score_)

# # Use the best estimator to make predictions
# best_dt_regressor = grid_search.best_estimator_
# y_pred_best = best_dt_regressor.predict(X_test)

# # Evaluate the best model
# mse_best = mean_squared_error(y_test, y_pred_best)
# print("Mean Squared Error of the best model:", mse_best)

In [28]:
# Decision tree with explicit depth / leaf-size / split-size limits, scored by
# RMSE on the hold-out split. random_state makes the fitted tree, and therefore
# the score, reproducible across runs.
dt_regressor = DecisionTreeRegressor(
    max_depth=64,
    min_samples_leaf=2,
    min_samples_split=50,
    random_state=42,
)
dt_regressor.fit(X_train, y_train)

y_pred = dt_regressor.predict(X_test)

# `lin_` is a historical prefix; these are the tuned tree's MSE and RMSE.
lin_mse = mean_squared_error(y_test, y_pred)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

4532.221949768601


## Random Forest Regression

In [30]:
# Baseline random forest with default hyper-parameters, scored by RMSE on the
# hold-out split. Bootstrap sampling is random, so random_state is fixed to make
# the score reproducible; n_jobs=-1 builds the trees on all available cores.
rf_regressor = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_regressor.fit(X_train, y_train)
y_pred = rf_regressor.predict(X_test)

# Evaluate the model (`lin_` is a historical prefix; these are the forest's MSE and RMSE).
lin_mse = mean_squared_error(y_test, y_pred)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

3796.2107730600237


In [31]:
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 5, 10, 15],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Create the GridSearchCV object
# rf = RandomForestRegressor()
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# # Perform the grid search
# grid_search.fit(X_train, y_train)

# # Print the best parameters found
# print("Best parameters:", grid_search.best_params_)

Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


In [38]:
# Random forest with 200 trees and min_samples_split=5, scored by RMSE on the
# hold-out split. random_state makes the comparison with the other models
# repeatable; n_jobs=-1 builds the trees on all available cores.
rf_regressor = RandomForestRegressor(
    n_estimators=200,
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1,
)
rf_regressor.fit(X_train, y_train)
y_pred = rf_regressor.predict(X_test)

# Evaluate the model (`lin_` is a historical prefix; these are the forest's MSE and RMSE).
lin_mse = mean_squared_error(y_test, y_pred)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

3831.667282951203


## XGBoost

In [39]:
# Gradient-boosted trees, scored by RMSE on the same hold-out split as the other
# models. `xgb` is imported with the other dependencies in the import cell.
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    max_depth=7,
    learning_rate=0.1,  # scikit-learn-API name for the `eta` booster parameter
    subsample=0.7,
    colsample_bytree=0.8,
    random_state=42,  # rows and columns are subsampled at random; make the seed explicit
)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Evaluate the model (`lin_` is a historical prefix; these are the XGBoost MSE and RMSE).
lin_mse = mean_squared_error(y_test, y_pred)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

3638.477322093488
