In [166]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [167]:
business = pd.read_csv('business.csv')
economy = pd.read_csv('economy.csv')

In [168]:
business['class'] = "business"
economy['class'] = 'economy'

In [169]:
df = pd.concat([economy,business], ignore_index=True)

In [170]:
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,class
0,11-02-2022,SpiceJet,SG,8709,18:55,Delhi,02h 10m,non-stop,21:05,Mumbai,5953,economy
1,11-02-2022,SpiceJet,SG,8157,06:20,Delhi,02h 20m,non-stop,08:40,Mumbai,5953,economy
2,11-02-2022,AirAsia,I5,764,04:25,Delhi,02h 10m,non-stop,06:35,Mumbai,5956,economy
3,11-02-2022,Vistara,UK,995,10:20,Delhi,02h 15m,non-stop,12:35,Mumbai,5955,economy
4,11-02-2022,Vistara,UK,963,08:50,Delhi,02h 20m,non-stop,11:10,Mumbai,5955,economy


In [171]:
df.ch_code.unique()

array(['SG', 'I5', 'UK', 'G8', '6E', 'AI', '2T', 'S5'], dtype=object)

In [172]:
df.num_code.unique()

array([8709, 8157,  764, ..., 7127, 7259,  433], dtype=int64)

In [173]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300261 entries, 0 to 300260
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        300261 non-null  object
 1   airline     300261 non-null  object
 2   ch_code     300261 non-null  object
 3   num_code    300261 non-null  int64 
 4   dep_time    300261 non-null  object
 5   from        300261 non-null  object
 6   time_taken  300261 non-null  object
 7   stop        300261 non-null  object
 8   arr_time    300261 non-null  object
 9   to          300261 non-null  object
 10  price       300261 non-null  object
 11  class       300261 non-null  object
dtypes: int64(1), object(11)
memory usage: 27.5+ MB


In [174]:
def stopsfind(stops):
    """Split a raw ``stop`` label into a stop-count category and a via token.

    The literal text ``'Via'`` is removed and the remainder is split on
    whitespace. The first token selects the category:

    * ``"1-stop"``   -> ``"one"``
    * ``"non-stop"`` -> ``"zero"``
    * anything else  -> ``"two_or_more"``

    The second token, when there is one, is returned as the via value;
    when the label consists of a single token the placeholder string
    ``"NAN"`` is used instead.

    Returns a two-element ``pd.Series`` ``[category, via]`` so that
    ``Series.apply`` expands the result into two columns.
    """
    tokens = stops.replace('Via', '').split()

    known_labels = {"1-stop": "one", "non-stop": "zero"}
    category = known_labels.get(tokens[0], "two_or_more")

    # "NAN" is a plain string marker, not a real missing value.
    via = "NAN" if len(tokens) == 1 else tokens[1]
    return pd.Series([category, via])

In [175]:
df[['stops','stops_via']] = df['stop'].apply(stopsfind)

In [176]:
# Strip thousands separators and store fares as 64-bit integers.
# Casting to str first keeps the cell re-runnable: on a second run the column
# already holds integers, which have no .replace() method.
df['price'] = df['price'].astype(str).str.replace(',', '', regex=False).astype('int64')

In [177]:
def codemake(x, y):
    """Join a carrier code and a flight number as ``"<x>-<y>"``.

    ``x`` must already be a string; ``y`` is converted with ``str()``.
    """
    suffix = "-" + str(y)
    return x + suffix

In [178]:
# Build "<ch_code>-<num_code>" for the whole column at once. This yields the
# same strings as calling codemake(ch_code, num_code) on every row, without a
# Python-level loop over the rows.
df['flight_code'] = df['ch_code'] + "-" + df['num_code'].astype(str)

In [179]:
df.drop(['ch_code','num_code','stop'],axis=1,inplace=True)

In [180]:
df.date = df.date.str.split('-')

In [181]:
df['day_of_journey'] = df.date.str[0]
df['month_of_journey'] = df.date.str[1]

In [182]:
df.drop(['date'],axis=1,inplace=True)

In [183]:
df.dep_time = df.dep_time.str.split(':')

In [184]:
df['dep_hour'] = df.dep_time.str[0]
df['dep_min'] = df.dep_time.str[1]

In [185]:
df.drop(['dep_time'],axis=1,inplace=True)

In [186]:
df.arr_time = df.arr_time.str.split(':')

In [187]:
df['arr_hour'] = df.arr_time.str[0]
df['arr_min'] = df.arr_time.str[1]

In [188]:
df.drop(['arr_time'],axis=1,inplace=True)

In [189]:
def time_min(timing):
    """Convert a duration label such as ``"02h 10m"`` into whole minutes.

    The unit letters ``h`` and ``m`` are stripped before parsing. Accepted
    shapes:

    * ``"<H>h <M>m"`` -> ``H * 60 + M``
    * ``"<H>.<M>h"``  -> ``H * 60 + M``; the digits after the dot are read
      as minutes, not as a decimal fraction of an hour
    * ``"<H>h"`` or a bare number -> ``H * 60``
    * ``"<M>m"``      -> ``M``

    Any fields beyond the first two are ignored.

    Raises:
        ValueError: if ``timing`` holds no value at all, or a field is not
            an integer.
    """
    # A lone value counts as minutes only when the label carries a minute
    # unit and no hour unit; otherwise the first field is always hours.
    minutes_only = 'm' in timing and 'h' not in timing

    fields = timing.replace('h', '').replace('m', '').split()
    if len(fields) == 1:
        # Single token: may be the dotted "H.M" form.
        fields = fields[0].split('.')

    if not fields:
        raise ValueError(f"empty duration: {timing!r}")

    if len(fields) == 1:
        lone_value = int(fields[0])
        return lone_value if minutes_only else lone_value * 60

    return int(fields[0]) * 60 + int(fields[1])

In [190]:
df['duration_in_min']=df['time_taken'].apply(time_min)

In [191]:
df.drop(['time_taken'],axis=1,inplace=True)

In [192]:
df[df.duplicated()].head()

Unnamed: 0,airline,from,to,price,class,stops,stops_via,flight_code,day_of_journey,month_of_journey,dep_hour,dep_min,arr_hour,arr_min,duration_in_min
563,Air India,Delhi,Mumbai,12150,economy,one,NAN,AI-807,14,2,17,20,8,35,915
6181,Air India,Delhi,Mumbai,4780,economy,one,NAN,AI-475,13,3,13,0,13,35,1475


In [193]:
df.drop_duplicates(keep='first',inplace=True)
df.head()

Unnamed: 0,airline,from,to,price,class,stops,stops_via,flight_code,day_of_journey,month_of_journey,dep_hour,dep_min,arr_hour,arr_min,duration_in_min
0,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,SG-8709,11,2,18,55,21,5,130
1,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,SG-8157,11,2,6,20,8,40,140
2,AirAsia,Delhi,Mumbai,5956,economy,zero,NAN,I5-764,11,2,4,25,6,35,130
3,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,UK-995,11,2,10,20,12,35,135
4,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,UK-963,11,2,8,50,11,10,140


In [194]:
df.dep_hour =  df.dep_hour.astype('int64')

In [195]:
df.arr_hour =  df.arr_hour.astype('int64')

In [196]:
def deptime(tmm):
    """Name the part of the day that an hour value falls into.

    Buckets:

    * 4 to 8, both inclusive     -> ``'Early Morning'``
    * above 8 and below 12       -> ``'Morning'``
    * 12 up to, not including 16 -> ``'Afternoon'``
    * 16 up to, not including 20 -> ``'Evening'``
    * 20 up to, not including 24 -> ``'Night'``
    * everything else            -> ``'Late Night'``
    """
    # Note the inclusive upper bound: hour 8 still counts as early morning.
    if 4 <= tmm <= 8:
        return 'Early Morning'
    if 8 < tmm < 12:
        return 'Morning'

    # The remaining named buckets are half-open four-hour windows.
    for start, label in ((12, 'Afternoon'), (16, 'Evening'), (20, 'Night')):
        if start <= tmm < start + 4:
            return label

    return 'Late Night'

In [197]:
df['dep_time'] = df['dep_hour'].apply(deptime)

In [198]:
df['arr_time'] = df['arr_hour'].apply(deptime)

In [199]:
df.head()

Unnamed: 0,airline,from,to,price,class,stops,stops_via,flight_code,day_of_journey,month_of_journey,dep_hour,dep_min,arr_hour,arr_min,duration_in_min,dep_time,arr_time
0,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,SG-8709,11,2,18,55,21,5,130,Evening,Night
1,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,SG-8157,11,2,6,20,8,40,140,Early Morning,Early Morning
2,AirAsia,Delhi,Mumbai,5956,economy,zero,NAN,I5-764,11,2,4,25,6,35,130,Early Morning,Early Morning
3,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,UK-995,11,2,10,20,12,35,135,Morning,Afternoon
4,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,UK-963,11,2,8,50,11,10,140,Early Morning,Morning


In [200]:
df.drop(['dep_hour','dep_min','arr_hour','arr_min'],axis=1,inplace=True)

In [201]:
df.head()

Unnamed: 0,airline,from,to,price,class,stops,stops_via,flight_code,day_of_journey,month_of_journey,duration_in_min,dep_time,arr_time
0,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,SG-8709,11,2,130,Evening,Night
1,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,SG-8157,11,2,140,Early Morning,Early Morning
2,AirAsia,Delhi,Mumbai,5956,economy,zero,NAN,I5-764,11,2,130,Early Morning,Early Morning
3,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,UK-995,11,2,135,Morning,Afternoon
4,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,UK-963,11,2,140,Early Morning,Morning


In [202]:
df.month_of_journey =  df.month_of_journey.astype('int64')

In [203]:
def month(m):
    """Translate a month number into its name.

    Only ``2`` (``'February'``) and ``3`` (``'March'``) are recognised; any
    other value yields the placeholder string ``'Nan'``.
    """
    for number, label in ((2, 'February'), (3, 'March')):
        if m == number:
            return label
    return 'Nan'

In [204]:
df['month'] = df['month_of_journey'].apply(month)

In [205]:
df.head()

Unnamed: 0,airline,from,to,price,class,stops,stops_via,flight_code,day_of_journey,month_of_journey,duration_in_min,dep_time,arr_time,month
0,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,SG-8709,11,2,130,Evening,Night,February
1,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,SG-8157,11,2,140,Early Morning,Early Morning,February
2,AirAsia,Delhi,Mumbai,5956,economy,zero,NAN,I5-764,11,2,130,Early Morning,Early Morning,February
3,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,UK-995,11,2,135,Morning,Afternoon,February
4,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,UK-963,11,2,140,Early Morning,Morning,February


In [206]:
df.day_of_journey =  df.day_of_journey.astype('int64')

In [207]:
def days(d):
    """Map a day-of-month number (1-31) to a weekday name.

    The mapping is fixed: day 1 is ``'Tuesday'`` and the names repeat every
    seven days from there. The month and year are not consulted, so the
    result is only right for months whose 1st falls on a Tuesday
    (presumably the months covered by this notebook's data; verify before
    reusing elsewhere).

    Values that do not equal an integer from 1 to 31 yield the placeholder
    string ``'Nan'``.
    """
    weekday_cycle = (
        'Tuesday', 'Wednesday', 'Thursday', 'Friday',
        'Saturday', 'Sunday', 'Monday',
    )
    for day_number in range(1, 32):
        if d == day_number:
            return weekday_cycle[(day_number - 1) % 7]
    return 'Nan'

In [208]:
df['day'] = df['day_of_journey'].apply(days)

In [209]:
df.head()

Unnamed: 0,airline,from,to,price,class,stops,stops_via,flight_code,day_of_journey,month_of_journey,duration_in_min,dep_time,arr_time,month,day
0,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,SG-8709,11,2,130,Evening,Night,February,Friday
1,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,SG-8157,11,2,140,Early Morning,Early Morning,February,Friday
2,AirAsia,Delhi,Mumbai,5956,economy,zero,NAN,I5-764,11,2,130,Early Morning,Early Morning,February,Friday
3,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,UK-995,11,2,135,Morning,Afternoon,February,Friday
4,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,UK-963,11,2,140,Early Morning,Morning,February,Friday


In [210]:
df.day.unique()

array(['Friday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday',
       'Thursday'], dtype=object)

In [211]:
df.drop(['day_of_journey','month_of_journey'],axis=1,inplace=True)

In [212]:
df.head()

Unnamed: 0,airline,from,to,price,class,stops,stops_via,flight_code,duration_in_min,dep_time,arr_time,month,day
0,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,SG-8709,130,Evening,Night,February,Friday
1,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,SG-8157,140,Early Morning,Early Morning,February,Friday
2,AirAsia,Delhi,Mumbai,5956,economy,zero,NAN,I5-764,130,Early Morning,Early Morning,February,Friday
3,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,UK-995,135,Morning,Afternoon,February,Friday
4,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,UK-963,140,Early Morning,Morning,February,Friday


In [213]:
df.drop(['flight_code'],axis=1,inplace=True)

In [214]:
df.head()

Unnamed: 0,airline,from,to,price,class,stops,stops_via,duration_in_min,dep_time,arr_time,month,day
0,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,130,Evening,Night,February,Friday
1,SpiceJet,Delhi,Mumbai,5953,economy,zero,NAN,140,Early Morning,Early Morning,February,Friday
2,AirAsia,Delhi,Mumbai,5956,economy,zero,NAN,130,Early Morning,Early Morning,February,Friday
3,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,135,Morning,Afternoon,February,Friday
4,Vistara,Delhi,Mumbai,5955,economy,zero,NAN,140,Early Morning,Morning,February,Friday


In [215]:
df.stops.unique()

array(['zero', 'one', 'two_or_more'], dtype=object)

In [216]:
df['stops']=df['stops'].map({'zero':0, 'one':1, 'two_or_more':2}).astype(str)

In [217]:
df.stops =  df.stops.astype('int64')

In [218]:
df['class']=df['class'].map({'economy':0, 'business':1}).astype(int)

In [219]:
df.head()

Unnamed: 0,airline,from,to,price,class,stops,stops_via,duration_in_min,dep_time,arr_time,month,day
0,SpiceJet,Delhi,Mumbai,5953,0,0,NAN,130,Evening,Night,February,Friday
1,SpiceJet,Delhi,Mumbai,5953,0,0,NAN,140,Early Morning,Early Morning,February,Friday
2,AirAsia,Delhi,Mumbai,5956,0,0,NAN,130,Early Morning,Early Morning,February,Friday
3,Vistara,Delhi,Mumbai,5955,0,0,NAN,135,Morning,Afternoon,February,Friday
4,Vistara,Delhi,Mumbai,5955,0,0,NAN,140,Early Morning,Morning,February,Friday


In [220]:
df.isna().sum()

airline            0
from               0
to                 0
price              0
class              0
stops              0
stops_via          0
duration_in_min    0
dep_time           0
arr_time           0
month              0
day                0
dtype: int64

In [221]:
df.drop(['stops_via'],axis=1,inplace=True)

In [222]:
df.head()

Unnamed: 0,airline,from,to,price,class,stops,duration_in_min,dep_time,arr_time,month,day
0,SpiceJet,Delhi,Mumbai,5953,0,0,130,Evening,Night,February,Friday
1,SpiceJet,Delhi,Mumbai,5953,0,0,140,Early Morning,Early Morning,February,Friday
2,AirAsia,Delhi,Mumbai,5956,0,0,130,Early Morning,Early Morning,February,Friday
3,Vistara,Delhi,Mumbai,5955,0,0,135,Morning,Afternoon,February,Friday
4,Vistara,Delhi,Mumbai,5955,0,0,140,Early Morning,Morning,February,Friday


In [223]:
def week(w):
    """Classify a weekday name as ``'Weekend'`` or ``'Weekday'``.

    ``'Saturday'`` and ``'Sunday'`` are weekend days, ``'Monday'`` through
    ``'Friday'`` are weekdays. Matching is exact (case-sensitive); any
    other value yields the placeholder string ``'NaN'``.
    """
    if w in ('Saturday', 'Sunday'):
        return 'Weekend'
    if w in ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'):
        return 'Weekday'
    return 'NaN'

In [224]:
df['weekend/weekdays'] = df['day'].apply(week)

In [225]:
df.head()

Unnamed: 0,airline,from,to,price,class,stops,duration_in_min,dep_time,arr_time,month,day,weekend/weekdays
0,SpiceJet,Delhi,Mumbai,5953,0,0,130,Evening,Night,February,Friday,Weekday
1,SpiceJet,Delhi,Mumbai,5953,0,0,140,Early Morning,Early Morning,February,Friday,Weekday
2,AirAsia,Delhi,Mumbai,5956,0,0,130,Early Morning,Early Morning,February,Friday,Weekday
3,Vistara,Delhi,Mumbai,5955,0,0,135,Morning,Afternoon,February,Friday,Weekday
4,Vistara,Delhi,Mumbai,5955,0,0,140,Early Morning,Morning,February,Friday,Weekday
