In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Remove pandas' display limits so frames are rendered with every column and row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [5]:
# Load the raw flight listings; the cells below transform this frame.
df = pd.read_csv(filepath_or_buffer='Final3.csv')

In [7]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Airline,Source,Destination,Departure,Arrival,Number of Stops,Class,Date,Stopover_1_Time,Stopover_1_Airport,Stopover_2_Time,Stopover_2_Airport,Stopover_3_Time,Stopover_3_Airport,Total_Stopover_Time,Operated,price in CAD,days_left
0,Etihad Airways,Toronto Pearson Intl,Bengaluru Intl,10:10 PM,3:05 am+2,1 stop,Economy Comfort,2024-06-02,2h 10m,Abu Dhabi Zayed Intl,,,,,2h 10m,,2340,1
1,Delta,Toronto Pearson Intl,Bengaluru Intl,10:40 PM,11:15 pm+2,1 stop,Basic Economy,2024-06-02,22h 15m,Amsterdam Schiphol,,,,,22h 15m,KLM,1347,1
2,"Lufthansa, Vistara",Toronto Pearson Intl,Bengaluru Intl,9:20 PM,7:45 am+2,2 stops,Economy Basic,2024-06-02,2h 05m,Frankfurt am Main,4h 55m,Mumbai Chhatrapati Shivaji Intl,,,7h 0m,Air Canada,1934,1
3,"Lufthansa, Vistara",Toronto Pearson Intl,Bengaluru Intl,9:20 PM,6:50 am+2,2 stops,Economy Basic,2024-06-02,1h 55m,Frankfurt am Main,4h 25m,Hyderabad Rajiv Gandhi Intl,,,6h 20m,Air Canada,2291,1
4,"Air Canada, Air India",Toronto Pearson Intl,Bengaluru Intl,11:40 PM,6:25 am+2,2 stops,Flex,2024-06-02,1h 25m,London Heathrow,1h 30m,New Delhi Indira Gandhi Intl,,,2h 55m,,2661,1


## Airline, Number of Stops, and Class

In [10]:

# 1. Collapse multi-carrier itineraries (comma-separated airline names) into a
#    single label. na=False leaves missing airline values untouched instead of
#    raising a TypeError on `',' in NaN`.
is_multi_carrier = df['Airline'].str.contains(',', regex=False, na=False)
df['Airline'] = df['Airline'].mask(is_multi_carrier, 'Multiple Airlines')

# 2. Keep only the stop count from strings such as '1 stop' / '2 stops'.
#    - The raw string avoids the invalid '\d' escape warning on recent Python versions.
#    - astype(str) lets the cell be re-run after the column already holds integers.
#    - Values without any digit (and missing values) become 0.
stop_digits = df['Number of Stops'].astype(str).str.extract(r'(\d+)', expand=False)
df['Number of Stops'] = pd.to_numeric(stop_digits).fillna(0).astype(int)

# 3. Categorize the 'Class' column
def categorize_class(class_name):
    """Map a raw fare/cabin label onto one of five cabin categories.

    Matching is case-insensitive and substring based. The specific cabins
    (business, first, premium) are tested before the generic economy fare
    keywords, because words such as 'economy', 'flex' or 'plus' also occur in
    higher-cabin labels (e.g. 'Premium Economy') and would otherwise claim
    them as 'Economy Class'.

    Args:
        class_name: Raw class label. Non-string values (NaN/None) are accepted.

    Returns:
        str: One of 'Business Class', 'First Class', 'Premium Economy',
        'Economy Class' or 'Other'.
    """
    # Missing values have no .lower(); report them as uncategorised.
    if not isinstance(class_name, str):
        return 'Other'

    label = class_name.lower()

    business_keywords = ('business', 'executive', 'upper class')
    # Substring matching makes the longer spellings redundant: 'eco' covers
    # 'economy' and 'eco saver', 'best' covers 'best buy' / 'best offer',
    # and 'saver' covers 'ultrasaver'.
    economy_keywords = (
        'eco', 'classic', 'flex', 'comfort', 'latitude', 'light', 'basic',
        'best', 'discount', 'promotion', 'plus', 'saver', 'standard',
    )

    if any(keyword in label for keyword in business_keywords):
        return 'Business Class'
    if 'first' in label:
        return 'First Class'
    if 'premium' in label:
        return 'Premium Economy'
    if any(keyword in label for keyword in economy_keywords):
        return 'Economy Class'
    return 'Other'

# Replace every raw fare label with the category returned by categorize_class.
df['Class'] = df['Class'].map(categorize_class)

# Display the modified dataframe
#print(df.head())


In [14]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(384663, 19)

In [16]:
import datetime

# Function to convert time to 24-hour format
def convert_to_24hr(time_str):
    """Turn a 12-hour clock string such as '10:10 PM' into 24-hour 'HH:MM'.

    Raises:
        ValueError: if time_str does not match the '%I:%M %p' format.
    """
    parsed_clock = datetime.datetime.strptime(time_str, '%I:%M %p')
    return parsed_clock.strftime('%H:%M')

# Function to extract and convert arrival time
def extract_arrival_info(arrival_str):
    """Split an arrival string like '3:05 am+2' into its time and day offset.

    Returns:
        tuple: (24-hour 'HH:MM' string, integer number of days after the '+';
        0 when the string carries no '+' suffix).

    Raises:
        ValueError: if the time part does not match '%I:%M %p', or the text
            around '+' cannot be unpacked/parsed as a single integer offset.
    """
    # Defaults cover the plain "no day offset" form.
    time_part = arrival_str
    day_increment = 0
    if '+' in arrival_str:
        time_part, offset_text = arrival_str.split('+')
        day_increment = int(offset_text)

    parsed_clock = datetime.datetime.strptime(time_part.strip(), '%I:%M %p')
    return parsed_clock.strftime('%H:%M'), day_increment

# Convert departure time to 24-hour format
df['Departure_24hr'] = df['Departure'].map(convert_to_24hr)

# Each arrival value yields a (time, day offset) pair; zip(*...) transposes the
# per-row pairs into two column-length tuples.
arrival_times, arrival_day_offsets = zip(*df['Arrival'].map(extract_arrival_info))
df['Arrival_24hr'] = arrival_times
df['Arrival_Day_Offset'] = arrival_day_offsets

#print(df)


In [26]:
# Preview the frame with the derived Departure_24hr / Arrival_24hr / Arrival_Day_Offset columns.
df.head()

Unnamed: 0,Airline,Source,Destination,Departure,Arrival,Number of Stops,Class,Date,Stopover_1_Time,Stopover_1_Airport,Stopover_2_Time,Stopover_2_Airport,Stopover_3_Time,Stopover_3_Airport,Total_Stopover_Time,Operated,price in CAD,days_left,Departure_24hr,Arrival_24hr,Arrival_Day_Offset
0,Etihad Airways,Toronto Pearson Intl,Bengaluru Intl,10:10 PM,3:05 am+2,1,Economy Class,2024-06-02,2h 10m,Abu Dhabi Zayed Intl,,,,,2h 10m,,2340,1,22:10,03:05,2
1,Delta,Toronto Pearson Intl,Bengaluru Intl,10:40 PM,11:15 pm+2,1,Economy Class,2024-06-02,22h 15m,Amsterdam Schiphol,,,,,22h 15m,KLM,1347,1,22:40,23:15,2
2,Multiple Airlines,Toronto Pearson Intl,Bengaluru Intl,9:20 PM,7:45 am+2,2,Economy Class,2024-06-02,2h 05m,Frankfurt am Main,4h 55m,Mumbai Chhatrapati Shivaji Intl,,,7h 0m,Air Canada,1934,1,21:20,07:45,2
3,Multiple Airlines,Toronto Pearson Intl,Bengaluru Intl,9:20 PM,6:50 am+2,2,Economy Class,2024-06-02,1h 55m,Frankfurt am Main,4h 25m,Hyderabad Rajiv Gandhi Intl,,,6h 20m,Air Canada,2291,1,21:20,06:50,2
4,Multiple Airlines,Toronto Pearson Intl,Bengaluru Intl,11:40 PM,6:25 am+2,2,Economy Class,2024-06-02,1h 25m,London Heathrow,1h 30m,New Delhi Indira Gandhi Intl,,,2h 55m,,2661,1,23:40,06:25,2


## Stopover Time in Minutes

In [28]:
# Function to convert 'xh ym' to minutes
def convert_to_minutes(time_str):
    """Convert a duration such as '2h 10m' into a whole number of minutes.

    Args:
        time_str: Text in 'Xh Ym', 'Xh' or 'Ym' form; a missing value
            (NaN/None); or a number that is already a minute count.

    Returns:
        int: Total minutes. Missing values give 0.

    Raises:
        ValueError: if the hour or minute part of a string is not an integer.
    """
    if pd.isna(time_str):
        return 0
    # A non-string value means the column was already converted (for example
    # the conversion cell was run twice); pass it through so re-running is safe.
    if not isinstance(time_str, str):
        return int(time_str)

    hours, minutes = 0, 0
    if 'h' in time_str:
        pieces = time_str.split('h')
        hours = int(pieces[0].strip())
        # Whatever follows the hour marker may still hold the minutes part.
        time_str = pieces[1].strip()
    if 'm' in time_str:
        minutes = int(time_str.split('m')[0].strip())
    return hours * 60 + minutes

In [30]:
# Replace the 'xh ym' text with the minute count computed by convert_to_minutes.
df['Total_Stopover_Time'] = df['Total_Stopover_Time'].map(convert_to_minutes)

In [43]:
# NOTE(review): the saved output below lists Date as datetime64[ns], which matches the
# state after the dtype-conversion cell that follows in this file. The cells appear to
# have been executed out of order, so a top-to-bottom run may print different dtypes here.
df.dtypes

Airline                        object
Source                         object
Destination                    object
Departure                      object
Arrival                        object
Number of Stops                 int64
Class                          object
Date                   datetime64[ns]
Stopover_1_Time                object
Stopover_1_Airport             object
Stopover_2_Time                object
Stopover_2_Airport             object
Stopover_3_Time                object
Stopover_3_Airport             object
Total_Stopover_Time             int64
Operated                       object
price in CAD                  float64
days_left                       int64
Departure_24hr                 object
Arrival_24hr                   object
Arrival_Day_Offset              int64
dtype: object

In [41]:
# Convert columns to appropriate datatypes
# Give each column an explicit dtype: Date -> datetime, price -> float,
# and the three count-like columns -> integer.
df['Date'] = pd.to_datetime(df['Date'])
df['price in CAD'] = df['price in CAD'].astype(float)
for integer_column in ('days_left', 'Number of Stops', 'Arrival_Day_Offset'):
    df[integer_column] = df[integer_column].astype(int)


In [45]:
# NOTE(review): the saved output below has no 'Unnamed: 0' column, although the earlier
# df.head() outputs include it and no visible cell removes it. It was presumably dropped
# in a cell that is no longer in the notebook; confirm against a fresh run.
df.columns

Index(['Airline', 'Source', 'Destination', 'Departure', 'Arrival',
       'Number of Stops', 'Class', 'Date', 'Stopover_1_Time',
       'Stopover_1_Airport', 'Stopover_2_Time', 'Stopover_2_Airport',
       'Stopover_3_Time', 'Stopover_3_Airport', 'Total_Stopover_Time',
       'Operated', 'price in CAD', 'days_left', 'Departure_24hr',
       'Arrival_24hr', 'Arrival_Day_Offset'],
      dtype='object')

## Drop Columns

In [48]:
# Columns excluded from the exported dataset.
# 'Unnamed: 0' (it appears to be a saved row index from the CSV) is listed explicitly:
# no visible cell removes it, so on a fresh top-to-bottom run it would otherwise
# end up in the exported file.
columns_to_drop = [
    'Unnamed: 0',
    'Departure', 'Arrival',
    'Stopover_1_Time', 'Stopover_1_Airport',
    'Stopover_2_Time', 'Stopover_2_Airport',
    'Stopover_3_Time', 'Stopover_3_Airport',
    'Operated',
]
# errors='ignore' skips columns that are already gone, so the cell can be re-run.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [52]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='AirlineData.csv', index=False)