In [1]:
# Load Kedro's IPython extension. `catalog`, used in the cells below, is not
# defined anywhere in this notebook — presumably this extension injects it.
%load_ext kedro.ipython

In [2]:
# Load the dataset registered under the name "business" in the data catalog.
business = catalog.load("business")

In [3]:
# Load the dataset registered under the name "economy" in the data catalog.
economy = catalog.load("economy")

In [4]:
# Column names, non-null counts and dtypes of the raw economy table.
economy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206774 entries, 0 to 206773
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        206774 non-null  object
 1   airline     206774 non-null  object
 2   ch_code     206774 non-null  object
 3   num_code    206774 non-null  int64 
 4   dep_time    206774 non-null  object
 5   from        206774 non-null  object
 6   time_taken  206774 non-null  object
 7   stop        206774 non-null  object
 8   arr_time    206774 non-null  object
 9   to          206774 non-null  object
 10  price       206774 non-null  object
dtypes: int64(1), object(10)
memory usage: 17.4+ MB


In [5]:
# Column names, non-null counts and dtypes of the raw business table.
business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93487 entries, 0 to 93486
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        93487 non-null  object
 1   airline     93487 non-null  object
 2   ch_code     93487 non-null  object
 3   num_code    93487 non-null  int64 
 4   dep_time    93487 non-null  object
 5   from        93487 non-null  object
 6   time_taken  93487 non-null  object
 7   stop        93487 non-null  object
 8   arr_time    93487 non-null  object
 9   to          93487 non-null  object
 10  price       93487 non-null  object
dtypes: int64(1), object(10)
memory usage: 7.8+ MB


In [31]:
# Preview the first rows of the business table. Note in the output that the
# raw `stop` values carry embedded newline/tab characters.
business.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price
0,11-02-2022,Air India,AI,868,18:00,Delhi,02h 00m,non-stop,20:00,Mumbai,25612
1,11-02-2022,Air India,AI,624,19:00,Delhi,02h 15m,non-stop,21:15,Mumbai,25612
2,11-02-2022,Air India,AI,531,20:00,Delhi,24h 45m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,20:45,Mumbai,42220
3,11-02-2022,Air India,AI,839,21:25,Delhi,26h 30m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,23:55,Mumbai,44450
4,11-02-2022,Air India,AI,544,17:15,Delhi,06h 40m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,23:55,Mumbai,46690


In [4]:
from datetime import datetime
from typing import Iterable

import pandas as pd

def concat_2_tables(x: Iterable[pd.DataFrame]) -> pd.DataFrame:
    """Stack several DataFrames vertically into one frame with a fresh index.

    Args:
        x: The frames to combine, e.g. ``[business, economy]``. This is a
            collection of DataFrames, not a single DataFrame; it is passed
            to ``pd.concat`` unchanged.

    Returns:
        A DataFrame holding the rows of every input frame in the order
        given, with the index renumbered from 0 (``ignore_index=True``).
    """
    return pd.concat(x, ignore_index=True)

def to_price(x: pd.Series) -> pd.Series:
    """Convert price strings with comma thousands separators to floats.

    Args:
        x: A Series of strings such as ``"25,612"``. The ``.str`` accessor is
            used, so this must be a string Series, not a DataFrame.

    Returns:
        A float Series with every comma removed before conversion.

    Raises:
        ValueError: If a value is still not a valid number once the commas
            have been removed.
    """
    # regex=False: the comma is a literal character, and the default value of
    # `regex` is not the same across pandas versions, so state it explicitly.
    without_commas = x.str.replace(",", "", regex=False)
    return without_commas.astype(float)

def add_class(x: pd.DataFrame, y: str) -> pd.DataFrame:
    """Return a copy of ``x`` with a constant ``class`` column set to ``y``.

    Args:
        x: The table to tag. It is not modified; a new frame is returned.
        y: The cabin class label, either ``"economy"`` or ``"business"``.

    Returns:
        A new DataFrame with the same rows and columns as ``x`` plus a
        ``class`` column (overwritten if it already exists) holding ``y``.

    Raises:
        ValueError: If ``y`` is not one of the two supported labels. Without
            this check a misspelled label would make the function return
            ``None``, which a later ``pd.concat`` would silently discard.
    """
    if y not in ("economy", "business"):
        raise ValueError(
            f"Unknown class label {y!r}; expected 'economy' or 'business'"
        )
    # "class" is a Python keyword, so it cannot be written as a plain keyword
    # argument to assign(); unpack it from a dict instead.
    return x.assign(**{"class": y})

def label_time_of_day(x: pd.Series) -> pd.Series:
    """Bucket clock times into coarse time-of-day labels.

    Buckets (lower bound inclusive, upper bound exclusive):

    * 05:00-08:00 -> ``"Early_morning"``
    * 08:00-12:00 -> ``"Morning"``
    * 12:00-17:00 -> ``"Afternoon"``
    * 17:00-21:00 -> ``"Evening"``
    * anything else (21:00-05:00) -> ``"Night"``

    Args:
        x: A Series of ``"HH:MM"`` strings or ``datetime.time`` objects.

    Returns:
        A Series of label strings, aligned with ``x``.

    Raises:
        ValueError: If a string value does not match ``"%H:%M"``.
        TypeError: If a value is neither a string nor comparable to
            ``datetime.time``.
    """

    def parse_clock(text):
        # Turn an "HH:MM" string into a datetime.time.
        return datetime.strptime(text, "%H:%M").time()

    # Parse the bucket boundaries once instead of once per row.
    early_start = parse_clock("05:00")
    morning_start = parse_clock("08:00")
    afternoon_start = parse_clock("12:00")
    evening_start = parse_clock("17:00")
    night_start = parse_clock("21:00")

    def assign_time_of_day(time):
        if isinstance(time, str):
            time = parse_clock(time)

        if early_start <= time < morning_start:
            return "Early_morning"
        if morning_start <= time < afternoon_start:
            return "Morning"
        if afternoon_start <= time < evening_start:
            return "Afternoon"
        if evening_start <= time < night_start:
            return "Evening"
        return "Night"

    return x.apply(assign_time_of_day)

        
def create_flights_code(x: pd.DataFrame) -> pd.DataFrame:
    """Merge ``ch_code`` and ``num_code`` into a single ``flights`` column.

    Args:
        x: A table containing ``ch_code`` and ``num_code`` columns. It is
            not modified; a new frame is returned.

    Returns:
        A new DataFrame where ``flights`` holds ``"<ch_code>-<num_code>"``
        (both cast to ``str``) as the last column, and the two source
        columns have been removed.

    Raises:
        KeyError: If ``ch_code`` or ``num_code`` is missing, e.g. when the
            function is applied a second time to its own output.
    """
    flight_codes = x["ch_code"].astype(str) + "-" + x["num_code"].astype(str)
    # assign() + drop() build a new frame, so the caller's input stays intact.
    return x.assign(flights=flight_codes).drop(columns=["ch_code", "num_code"])


def days_until_departure(data: pd.DataFrame, purchase_day=None) -> pd.Series:
    """Compute the number of days between the purchase day and each flight.

    Args:
        data: A table with a ``date`` column of day-first date strings
            (e.g. ``"11-02-2022"``). It is not modified.
        purchase_day: The reference ``datetime.date`` (or datetime) the
            tickets were bought on. Defaults to 2022-02-11.

    Returns:
        An integer Series aligned with ``data`` holding
        ``flight date - purchase day`` in whole days. A flight on the
        purchase day itself is reported as 1 rather than 0.
        NOTE(review): this means same-day and next-day flights both yield 1;
        the rule is kept as it was — confirm that it is intended.

    Raises:
        ValueError: If any value in ``date`` cannot be parsed as a date.
    """
    if purchase_day is None:
        purchase_day = datetime(2022, 2, 11).date()
    reference = pd.Timestamp(purchase_day).normalize()

    # Parse into a local Series so the caller's `date` column is left alone.
    flight_dates = pd.to_datetime(data["date"], dayfirst=True, errors="coerce")

    unparsed = int(flight_dates.isna().sum())
    if unparsed:
        raise ValueError(
            f"{unparsed} value(s) in the 'date' column could not be parsed "
            "as day-first dates"
        )

    # normalize() drops any time-of-day so only calendar days are compared.
    days = (flight_dates.dt.normalize() - reference).dt.days
    # Same-day departures count as 1 day, never 0.
    return days.mask(days == 0, 1)

def stops(x: pd.Series) -> pd.Series:
    """Map raw stop descriptions to ``zero`` / ``one`` / ``two_or_more``.

    The raw values carry embedded newlines and tabs (e.g.
    ``"1-stop\\n\\t\\t\\t...Via IXU\\n\\t\\t"``), so every run of whitespace is
    collapsed to a single space before matching. Trimming only the ends
    would leave the inner whitespace in place and such values would be
    labelled ``"unknown"``.

    Args:
        x: A Series of raw stop descriptions. Values are converted with
            ``str()`` first, so missing values end up as ``"unknown"``.

    Returns:
        A Series of labels aligned with ``x``:

        * ``"non-stop"`` -> ``"zero"``
        * ``"1-stop"``, optionally followed by a via-airport note ->
          ``"one"``
        * ``"2+-stop"``, optionally followed by extra text ->
          ``"two_or_more"``
        * anything else -> ``"unknown"``
    """

    def has_prefix(text, prefix):
        # True when `text` is `prefix` alone or `prefix` followed by a space.
        return text == prefix or text.startswith(prefix + " ")

    def process_stop_value(val):
        # split()/join collapses inner newlines and tabs as well as the ends.
        text = " ".join(str(val).split())
        if text == "non-stop":
            return "zero"
        if has_prefix(text, "1-stop"):
            return "one"
        if has_prefix(text, "2+-stop"):
            return "two_or_more"
        return "unknown"

    return x.apply(process_stop_value)


In [5]:
# Tag each table with its cabin class, then stack both into one frame with a
# fresh 0..n-1 index.
business = add_class(business, "business")
economy= add_class(economy, "economy")
data = concat_2_tables([business, economy])

In [7]:
# Check the combined table: row count, the added `class` column, and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300261 entries, 0 to 300260
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        300261 non-null  object
 1   airline     300261 non-null  object
 2   ch_code     300261 non-null  object
 3   num_code    300261 non-null  int64 
 4   dep_time    300261 non-null  object
 5   from        300261 non-null  object
 6   time_taken  300261 non-null  object
 7   stop        300261 non-null  object
 8   arr_time    300261 non-null  object
 9   to          300261 non-null  object
 10  price       300261 non-null  object
 11  class       300261 non-null  object
dtypes: int64(1), object(11)
memory usage: 27.5+ MB


In [163]:
# Replace `ch_code` / `num_code` with one combined `flights` column.
# Re-running this cell on its own output fails: the source columns are gone.
data = create_flights_code(data)

In [164]:
# Overwrite `date` with the day count returned by days_until_departure; after
# this cell the column holds integers, not calendar dates.
data["date"] = days_until_departure(data)

In [119]:
# Replace the raw stop descriptions with zero / one / two_or_more / unknown.
# NOTE(review): execution counts in this part of the notebook are out of
# order — verify the cells with a fresh kernel and "Run All".
data["stop"] = stops(data["stop"])

In [160]:
# Inspect the first 1000 rows of the transformed table.
# NOTE(review): a smaller preview (e.g. the default head()) would keep the
# saved notebook output lighter.
data.head(1000)

In [120]:
# Replace the departure and arrival clock times with time-of-day labels
# (Early_morning / Morning / Afternoon / Evening / Night).
data["dep_time"] = label_time_of_day(data["dep_time"])
data["arr_time"] = label_time_of_day(data["arr_time"])