In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Firstly, We should analyze the given data.

In [3]:
train_data_path = 'https://raw.githubusercontent.com/Burxoniddin-data/project_data_science/main/train_data.csv'
train_data = pd.read_csv(train_data_path)
train_data.head()

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
1,2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
2,3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
3,4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
4,5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.0,4,10712


In [4]:
train_data.shape

(20000, 12)

In [5]:
train_data.drop(['id', 'flight'], axis=1, inplace=True)
train_data

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,Vistara,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
1,SpiceJet,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
2,Vistara,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
3,Vistara,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
4,Indigo,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.00,4,10712
...,...,...,...,...,...,...,...,...,...,...
19995,Indigo,Bangalore,Night,one,Early_Morning,Mumbai,Economy,7.92,45,3153
19996,AirAsia,Kolkata,Morning,one,Afternoon,Delhi,Economy,5.83,24,3911
19997,Vistara,Chennai,Early_Morning,two_or_more,Evening,Bangalore,Economy,35.33,17,14822
19998,Vistara,Mumbai,Evening,one,Morning,Bangalore,Economy,16.33,21,6450


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           20000 non-null  object 
 1   source_city       20000 non-null  object 
 2   departure_time    20000 non-null  object 
 3   stops             20000 non-null  object 
 4   arrival_time      20000 non-null  object 
 5   destination_city  20000 non-null  object 
 6   class             20000 non-null  object 
 7   duration          20000 non-null  float64
 8   days_left         20000 non-null  int64  
 9   price             20000 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 1.5+ MB


In [7]:
missing_values = train_data.isnull().sum()
missing_values

airline             0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

In [8]:
train_data.describe()

Unnamed: 0,duration,days_left,price
count,20000.0,20000.0,20000.0
mean,12.177627,25.92415,20960.2817
std,7.157944,13.624874,22775.459535
min,0.83,1.0,1105.0
25%,6.83,14.0,4783.0
50%,11.25,26.0,7425.0
75%,16.08,38.0,42521.0
max,38.58,49.0,114523.0


In [9]:
train_data['stops'].unique()

array(['one', 'zero', 'two_or_more'], dtype=object)

In [10]:
X = train_data.drop(columns=['price'])
y = train_data['price']

categorical_cols = ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
numerical_cols = ['duration', 'days_left']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OrdinalEncoder(), categorical_cols)
    ])
X = preprocessor.fit_transform(X)
X

array([[ 0.28952803, -0.36141789,  5.        , ...,  5.        ,
         5.        ,  1.        ],
       [-1.45682723, -1.38897606,  4.        , ...,  5.        ,
         4.        ,  1.        ],
       [-0.36291029, -1.53577008,  5.        , ...,  2.        ,
         2.        ,  0.        ],
       ...,
       [ 3.23458153, -0.65500594,  5.        , ...,  2.        ,
         0.        ,  1.        ],
       [ 0.58012154, -0.36141789,  5.        , ...,  4.        ,
         0.        ,  1.        ],
       [ 0.84836171,  0.00556716,  5.        , ...,  4.        ,
         1.        ,  1.        ]])

## Now we should clean the data

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
clr = RandomForestRegressor(random_state=42)

In [14]:
clr.fit(X_train, y_train)

In [15]:
y_pred = clr.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
mae, mse, r2

(2066.4036846904764, 15445652.242307568, 0.9705862046801671)

## Now we should predict the test data

In [17]:
test_data_path = 'https://raw.githubusercontent.com/Burxoniddin-data/project_data_science/main/test_data.csv'
test_data = pd.read_csv(test_data_path)
test_data.head()

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
0,1,Air_India,AI-765,Kolkata,Evening,one,Night,Delhi,Business,28.25,2
1,2,Vistara,UK-747,Delhi,Early_Morning,one,Night,Mumbai,Business,13.83,34
2,3,Air_India,AI-570,Mumbai,Early_Morning,zero,Early_Morning,Chennai,Business,2.0,30
3,4,AirAsia,I5-974,Hyderabad,Night,one,Late_Night,Delhi,Economy,5.17,26
4,5,Air_India,AI-770,Kolkata,Night,one,Afternoon,Mumbai,Economy,16.33,35


In [18]:
test_data.drop(['id', 'flight'], axis=1, inplace=True)
X_final_val = preprocessor.fit_transform(test_data)
X_final_val

array([[ 2.17930407, -1.75411081,  1.        , ...,  5.        ,
         2.        ,  0.        ],
       [ 0.20548051,  0.58318436,  5.        , ...,  5.        ,
         5.        ,  0.        ],
       [-1.41382134,  0.29102246,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [-0.15862287,  0.80230578,  1.        , ...,  5.        ,
         5.        ,  0.        ],
       [ 0.13704002,  0.87534626,  1.        , ...,  5.        ,
         3.        ,  0.        ],
       [ 0.51346269,  1.67879147,  5.        , ...,  0.        ,
         2.        ,  0.        ]])

In [35]:
y_final_pred = clr.predict(X_final_val)
y_final_pred

array([52969.43, 57209.26, 22998.74, ..., 48834.93, 47190.99, 61166.73])

In [39]:
test_data["price"] = y_final_pred
test_data.to_csv("predictions.csv")