In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [6]:
sample = pd.read_csv("sample_solution.csv", index_col='id')
sample

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
1,0
2,0
3,0
4,0
5,0
...,...
4996,0
4997,0
4998,0
4999,0


In [7]:
test = pd.read_csv("test_data.csv", index_col='id')
test

Unnamed: 0_level_0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Air_India,AI-765,Kolkata,Evening,one,Night,Delhi,Business,28.25,2
2,Vistara,UK-747,Delhi,Early_Morning,one,Night,Mumbai,Business,13.83,34
3,Air_India,AI-570,Mumbai,Early_Morning,zero,Early_Morning,Chennai,Business,2.00,30
4,AirAsia,I5-974,Hyderabad,Night,one,Late_Night,Delhi,Economy,5.17,26
5,Air_India,AI-770,Kolkata,Night,one,Afternoon,Mumbai,Economy,16.33,35
...,...,...,...,...,...,...,...,...,...,...
4996,Air_India,AI-768,Kolkata,Afternoon,one,Morning,Bangalore,Business,17.42,15
4997,Indigo,6E-6214,Kolkata,Morning,zero,Afternoon,Mumbai,Economy,3.00,40
4998,Air_India,AI-402,Kolkata,Morning,one,Night,Mumbai,Business,11.17,37
4999,Air_India,AI-673,Mumbai,Early_Morning,one,Night,Hyderabad,Business,13.33,38


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
train = pd.read_csv("train_data.csv", index_col='id')
train

Unnamed: 0_level_0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.00,4,10712
...,...,...,...,...,...,...,...,...,...,...,...
19996,Indigo,6E-6178,Bangalore,Night,one,Early_Morning,Mumbai,Economy,7.92,45,3153
19997,AirAsia,I5-582,Kolkata,Morning,one,Afternoon,Delhi,Economy,5.83,24,3911
19998,Vistara,UK-832,Chennai,Early_Morning,two_or_more,Evening,Bangalore,Economy,35.33,17,14822
19999,Vistara,UK-996,Mumbai,Evening,one,Morning,Bangalore,Economy,16.33,21,6450


In [11]:
train.isnull().sum()

airline             0
flight              0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 1 to 20000
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           20000 non-null  object 
 1   flight            20000 non-null  object 
 2   source_city       20000 non-null  object 
 3   departure_time    20000 non-null  object 
 4   stops             20000 non-null  object 
 5   arrival_time      20000 non-null  object 
 6   destination_city  20000 non-null  object 
 7   class             20000 non-null  object 
 8   duration          20000 non-null  float64
 9   days_left         20000 non-null  int64  
 10  price             20000 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 1.8+ MB


In [13]:
train.describe()

Unnamed: 0,duration,days_left,price
count,20000.0,20000.0,20000.0
mean,12.177627,25.92415,20960.2817
std,7.157944,13.624874,22775.459535
min,0.83,1.0,1105.0
25%,6.83,14.0,4783.0
50%,11.25,26.0,7425.0
75%,16.08,38.0,42521.0
max,38.58,49.0,114523.0


In [14]:
train.corr()

  train.corr()


Unnamed: 0,duration,days_left,price
duration,1.0,-0.020091,0.213158
days_left,-0.020091,1.0,-0.102545
price,0.213158,-0.102545,1.0


In [15]:
# divided train dataset
train_without_price = train.drop("price", axis=1)
train_price = train["price"].copy()

In [16]:
# train and test datasets
train_data, test_data = train_test_split(train, test_size=0.2, random_state=42)
train.head()

Unnamed: 0_level_0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.0,4,10712


In [17]:
# preparation for ML
num_attr = ['duration', 'days_left']
cat_attr = ['airline', 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']

num_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attr),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attr)
])

train_prepared = full_pipeline.fit_transform(train_without_price)
train_prepared.toarray()[0:5,:]

array([[ 0.28952803, -0.36141789,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [-1.45682723, -1.38897606,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.36291029, -1.53577008,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.75828512,  0.15236119,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-1.42190012, -1.60916709,  0.        , ...,  1.        ,
         0.        ,  1.        ]])

In [18]:
# test dataset
test_without_price = test_data.drop('price', axis=1)
test_price = test_data['price'].copy()
airplane_test_prepared = full_pipeline.transform(test_without_price)

In [19]:
# Random Forest
RF_model = RandomForestRegressor()
RF_model.fit(train_prepared, train_price)

In [20]:
# Prediction and finding RMSE
predicted_ver = RF_model.predict(airplane_test_prepared)

lin_mse = mean_squared_error(test_price, predicted_ver)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

1180.4221118898852


In [22]:
#Comparing predicted price with actual price

# 1. Sample
test_data = train.sample(5)

# 2. Finding matches for these samples
test_label = train_price.loc[test_data.index]

# 3. Pipeline
test_data_prepared = full_pipeline.transform(test_data)

# 4. Predicting
predicted_data = RF_model.predict(test_data_prepared)
predicted_data

# 5. Demonstrating result in a beautiful way
pd.DataFrame({'Predict':predicted_data, 'Original price': test_label})

Unnamed: 0_level_0,Predict,Original price
id,Unnamed: 1_level_1,Unnamed: 2_level_1
9589,6361.66,7266
5235,54884.56,57939
9499,1551.0,1551
3417,62668.84,61933
10942,4232.1,3979


In [23]:
test_prep = full_pipeline.transform(test)
predict_price = RF_model.predict(test_prep)

In [25]:
sample['price'] = predict_price
sample

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
1,54066.99
2,62336.36
3,23361.04
4,2312.18
5,5623.68
...,...
4996,60970.82
4997,4465.00
4998,50271.00
4999,50557.50


In [28]:
sample.to_csv('sample_csv', index=False)