In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the scraped flight data from CSV into the working DataFrame.
df = pd.read_csv('scraped_flight_data.csv')

In [3]:
# Preview the first rows to check the raw column layout.
df.head()

Unnamed: 0,Airline,Source,Destination,Departure,Arrival,Number of Stops,Stopover Details,Price,Class,Date
0,Lufthansa • Operated by Air Canada,Toronto Pearson Intl,Bengaluru Intl,9:20 pm,1:20 am+2,1 stop,", 1h 50m layover, <b>Frankfurt am Main</b>","C$ 1,934",Economy,2024-07-08
1,Multiple Airlines • Operated by Turkish Airlines,Toronto Pearson Intl,Bengaluru Intl,11:55 pm,6:30 pm+3,3 stops,", 5h 25m layover, <b>Dublin</b>, , 19h 50m lay...","C$ 1,130",Economy,2024-07-08
2,Air Canada • Operated by Lufthansa,Toronto Pearson Intl,Bengaluru Intl,9:20 pm,1:20 am+2,1 stop,", 1h 50m layover, <b>Frankfurt am Main</b>","C$ 1,937",Standard,2024-07-08
3,Air Canada • Operated by Lufthansa,Toronto Pearson Intl,Bengaluru Intl,8:25 pm,1:20 am+2,2 stops,", 0h 45m layover, <b>Munich Franz Josef Straus...","C$ 1,948",Economy,2024-07-08
4,"Air Canada, Lufthansa",Toronto Pearson Intl,Bengaluru Intl,9:20 pm,1:20 am+2,1 stop,", 1h 50m layover, <b>Frankfurt am Main</b>","C$ 2,673",Economy,2024-07-08


In [4]:
# Define a function to clean and split the data
def clean_split_stopover(details):
    """Parse a raw stopover string into ``(layover_time, airport)`` tuples.

    The raw text holds one segment per stop, segments separated by ``', , '``,
    each shaped like ``', 1h 50m layover, <b>Frankfurt am Main</b>'``.

    Parameters
    ----------
    details : object
        Raw 'Stopover Details' value. Anything that is not a ``str``
        (for example NaN) is treated as "no stopovers".

    Returns
    -------
    list of tuple of (str, str)
        One ``(time, airport)`` pair per well-formed segment, in order.
        A leading comma on the time text is removed, and the ``</b>`` tag is
        removed from the airport text. Segments that do not contain the
        ``' layover, <b>'`` marker (including an empty string) are skipped
        instead of raising ``ValueError``.
    """
    if not isinstance(details, str):
        return []

    marker = ' layover, <b>'
    cleaned_parts = []
    for part in details.split(', , '):
        # partition never raises: `found` is '' when the marker is missing, and any
        # additional marker stays inside the airport text instead of breaking the unpack.
        time, found, airport = part.partition(marker)
        if not found:
            continue
        time = time.strip().lstrip(',').strip()
        airport = airport.replace('</b>', '').strip()
        cleaned_parts.append((time, airport))
    return cleaned_parts

# Apply the function to the 'Stopover Details' column
df['Cleaned Stopover Details'] = df['Stopover Details'].apply(clean_split_stopover)

# Convert the cleaned details into separate columns:
# column k holds the k-th (time, airport) tuple of each row, NaN where a row has fewer stops
stopover_df = df['Cleaned Stopover Details'].apply(pd.Series)

# Extract layover time and airport into separate columns.
# The per-stop frames are collected first and concatenated once, and they keep
# stopover_df's index so the later column-wise concat with df lines up row for row.
stopover_frames = []
for col in stopover_df.columns:
    temp_list = stopover_df[col].apply(lambda x: x if isinstance(x, tuple) else (np.nan, np.nan))
    temp_df = pd.DataFrame(
        temp_list.tolist(),
        columns=[f'Stopover_{col+1}_Time', f'Stopover_{col+1}_Airport'],
        index=stopover_df.index,
    )
    stopover_frames.append(temp_df)

# With no stopover columns at all, fall back to an empty frame that still has df's rows
flat_df = pd.concat(stopover_frames, axis=1) if stopover_frames else pd.DataFrame(index=df.index)

# Remove leading commas in Stopover_1_Time (skipped when that column does not exist)
if 'Stopover_1_Time' in flat_df.columns:
    flat_df['Stopover_1_Time'] = flat_df['Stopover_1_Time'].str.replace('^, ', '', regex=True)

# Function to convert time string to minutes
def time_to_minutes(time_str):
    """Convert a duration such as ``'1h 50m'`` into a whole number of minutes.

    Values that ``pd.isna`` reports as missing count as 0 minutes. The hour
    count is the text before the first ``'h'``; the minute count is the text
    before the first ``'m'`` in what remains. Either part may be absent.
    """
    if pd.isna(time_str):
        return 0

    total = 0
    if 'h' in time_str:
        pieces = time_str.split('h')
        total += 60 * int(pieces[0].strip())
        # Keep parsing with the text that followed the hour marker
        time_str = pieces[1].strip()
    if 'm' in time_str:
        total += int(time_str.partition('m')[0].strip())
    return total

# Calculate total stopover time (in minutes) over the per-stop time columns.
# Columns are matched as 'Stopover_<n>_Time' rather than with "'Time' in col", so an
# existing 'Total_Stopover_Time' column can never be fed back into its own sum.
stopover_time_cols = [
    col for col in flat_df.columns
    if col.startswith('Stopover_') and col.endswith('_Time')
]
# Series.map per column replaces DataFrame.applymap, which newer pandas releases deprecate
flat_df['Total_Stopover_Time'] = (
    flat_df[stopover_time_cols]
    .apply(lambda column: column.map(time_to_minutes))
    .sum(axis=1)
)

# Convert total time back to hours and minutes
def minutes_to_time(minutes):
    """Format a minute count as ``'<hours>h <minutes>m'``, e.g. ``110`` -> ``'1h 50m'``."""
    whole_hours, leftover = divmod(minutes, 60)
    return f'{whole_hours}h {leftover}m'

# Replace the numeric total (minutes) with its '<hours>h <minutes>m' text form.
flat_df['Total_Stopover_Time'] = flat_df['Total_Stopover_Time'].apply(minutes_to_time)

  flat_df['Total_Stopover_Time'] = flat_df[[col for col in flat_df.columns if 'Time' in col]].applymap(time_to_minutes).sum(axis=1)


In [5]:
# Combine flat_df with the original df column-wise (axis=1); rows are matched on the index
df = pd.concat([df, flat_df], axis=1)

In [6]:
# Drop the raw stopover text and the intermediate parsed-list column.
df = df.drop(columns=['Cleaned Stopover Details','Stopover Details'])

In [7]:
# Split the 'Airline' column at the first '• Operated by' into airline name and operator
split_df = df['Airline'].str.split('• Operated by', n=1, expand=True)

# Assign split data to new columns; rows without an operator get the string 'None'
df['Airline'] = split_df[0].str.strip()
if 1 in split_df.columns:
    df['Operated'] = split_df[1].str.strip().fillna('None')
else:
    # No row contained the separator, so the split produced only column 0
    df['Operated'] = 'None'


In [8]:
# Remove the 'C$' prefix and commas, and convert to integer.
# regex=False makes both replacements literal on every pandas version: read as a
# regular expression, 'C$' means "a C at the end of the string" and would not
# remove the currency prefix.
df['price in CAD'] = (
    df['Price']
    .str.replace('C$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype(int)
)

# Drop the old 'Price' column
df = df.drop(columns=['Price'])

In [9]:
from datetime import datetime, timedelta

# Function to calculate days_left
def calculate_days_left(flight_date, start_date=None):
    """Return the number of days from ``start_date`` to ``flight_date``, counted inclusively.

    Parameters
    ----------
    flight_date : str
        Flight date in ``'YYYY-MM-DD'`` format.
    start_date : datetime.datetime, optional
        Reference date to count from. Defaults to 2024-06-22.

    Returns
    -------
    int
        ``(flight_date - start_date).days + 1``, so a flight on the reference
        date itself yields 1.

    Raises
    ------
    ValueError
        If ``flight_date`` does not match ``'%Y-%m-%d'``.
    """
    # Imported locally under a private name: a later cell runs `import datetime`, which
    # rebinds the global name `datetime` to the module and would break a global lookup.
    from datetime import datetime as _datetime

    if start_date is None:
        start_date = _datetime(2024, 6, 22)
    current_date = _datetime.strptime(flight_date, '%Y-%m-%d')
    return (current_date - start_date).days + 1

# Apply the function to create the new column
# ('Date' values are passed to calculate_days_left one by one and must be 'YYYY-MM-DD' strings)
df['days_left'] = df['Date'].apply(calculate_days_left)

In [10]:
# Save the intermediate table to CSV, leaving out the index column.
df.to_csv('Final3.csv', index=False)

In [11]:
# Reload the intermediate file; df is replaced by what read_csv parses from it.
df = pd.read_csv('Final3.csv')

In [12]:
# 1. Modify the 'Airline' column: any name containing a comma becomes 'Multiple Airlines'.
#    na=False treats missing names as "no comma", so they are left as-is instead of raising.
has_multiple = df['Airline'].str.contains(',', regex=False, na=False)
df['Airline'] = df['Airline'].mask(has_multiple, 'Multiple Airlines')

# 2. Modify the 'Number of Stops' column: keep the first run of digits, 0 when there is none.
#    Raw string so '\d' reaches the regex engine without being read as a string escape.
df['Number of Stops'] = (
    df['Number of Stops']
    .str.extract(r'(\d+)', expand=False)
    .fillna(0)
    .astype(int)
)

# 3. Categorize the 'Class' column
def categorize_class(class_name):
    """Map a raw fare/cabin label to one of five categories.

    Matching is case-insensitive and substring-based, checked in this order:

    1. ``'Business Class'``  - contains 'business', 'executive' or 'upper class'
    2. ``'Premium Economy'`` - contains 'premium'
    3. ``'Economy Class'``   - contains 'economy' or any economy fare keyword
    4. ``'First Class'``     - contains 'first'
    5. ``'Other'``           - anything else, including non-string input such as NaN

    'premium' is tested before the economy keywords because a label like
    'Premium Economy' also contains 'economy' and would otherwise be labelled
    'Economy Class'.
    """
    if not isinstance(class_name, str):
        return 'Other'

    class_name = class_name.lower()
    business_keywords = ('business', 'executive', 'upper class')
    economy_keywords = (
        'economy', 'classic', 'flex', 'comfort', 'latitude', 'light', 'basic',
        'best', 'eco', 'discount', 'promotion', 'best buy', 'plus', 'saver',
        'best offer', 'eco saver', 'ultrasaver', 'standard',
    )

    if any(keyword in class_name for keyword in business_keywords):
        return 'Business Class'
    if 'premium' in class_name:
        return 'Premium Economy'
    if any(keyword in class_name for keyword in economy_keywords):
        return 'Economy Class'
    if 'first' in class_name:
        return 'First Class'
    return 'Other'

# Replace each raw class label with the category returned by categorize_class.
df['Class'] = df['Class'].apply(categorize_class)

In [13]:
import datetime

# Function to convert time to 24-hour format
def convert_to_24hr(time_str):
    """Turn a 12-hour clock string such as ``'9:20 pm'`` into ``'HH:MM'`` (24-hour)."""
    parsed = datetime.datetime.strptime(time_str, '%I:%M %p')
    return f'{parsed:%H:%M}'

# Function to extract and convert arrival time
def extract_arrival_info(arrival_str):
    """Split an arrival string into a 24-hour time and a day offset.

    Parameters
    ----------
    arrival_str : str
        A 12-hour clock time, optionally followed by a signed day offset,
        e.g. ``'1:20 am+2'`` or ``'6:30 pm'``. A negative offset such as
        ``'-1'`` is parsed the same way as a positive one.

    Returns
    -------
    tuple of (str, int)
        ``('HH:MM', day_offset)``; the offset is 0 when none is present.

    Raises
    ------
    ValueError
        If the time part does not match ``'%I:%M %p'``.
    """
    import re  # only needed here; imported locally so the function is self-contained

    text = arrival_str.strip()
    # A trailing sign followed by digits is the day offset; everything before it is the time
    offset_match = re.search(r'([+-])\s*(\d+)\s*$', text)
    if offset_match:
        day_increment = int(offset_match.group(1) + offset_match.group(2))
        time_part = text[:offset_match.start()]
    else:
        time_part = text
        day_increment = 0
    arrival_24hr = datetime.datetime.strptime(time_part.strip(), '%I:%M %p').strftime('%H:%M')
    return arrival_24hr, day_increment

# Convert departure time to 24-hour format
df['Departure_24hr'] = df['Departure'].apply(convert_to_24hr)

# Extract and convert arrival time and day offset.
# extract_arrival_info returns one (time, offset) tuple per row; zip(*...) transposes
# those tuples into one sequence of times and one sequence of offsets.
df['Arrival_24hr'], df['Arrival_Day_Offset'] = zip(*df['Arrival'].apply(extract_arrival_info))


In [14]:
# Function to convert 'xh ym' to minutes
def convert_to_minutes(time_str):
    """Convert an ``'xh ym'`` duration into whole minutes.

    Parameters
    ----------
    time_str : str, number or missing value
        Text such as ``'1h 50m'``, ``'45m'`` or ``'2h'``. A value that is
        already numeric is taken to be minutes and returned as ``int``, so
        applying this function to an already-converted column is harmless.
        Values that ``pd.isna`` reports as missing give 0.

    Returns
    -------
    int
        Total minutes.
    """
    if pd.isna(time_str):
        return 0
    if not isinstance(time_str, str):
        # Already a number of minutes (e.g. the column was converted earlier)
        return int(time_str)
    hours, minutes = 0, 0
    if 'h' in time_str:
        hours = int(time_str.split('h')[0].strip())
        time_str = time_str.split('h')[1].strip()
    if 'm' in time_str:
        minutes = int(time_str.split('m')[0].strip())
    return hours * 60 + minutes

In [15]:
# Convert each 'Total_Stopover_Time' value to minutes with convert_to_minutes.
df['Total_Stopover_Time'] = df['Total_Stopover_Time'].apply(convert_to_minutes)

In [16]:
# Convert columns to appropriate datatypes
df['Date'] = pd.to_datetime(df['Date'])  # Parse the date values into pandas datetimes
df['price in CAD'] = df['price in CAD'].astype(float)  # Convert to float
df['days_left'] = df['days_left'].astype(int)  # Convert to integer
df['Number of Stops'] = df['Number of Stops'].astype(int)  # Convert to integer
df['Arrival_Day_Offset'] = df['Arrival_Day_Offset'].astype(int)  # Convert to integer

In [17]:
# Drop the 12-hour time strings, the operator column and every per-stop detail column.
# The per-stop columns are matched by name pattern because one Time/Airport pair is
# created per stopover position found in the data, so their number is not fixed at three.
stopover_cols = df.filter(regex=r'^Stopover_\d+_(?:Time|Airport)$').columns.tolist()
df.drop(columns=['Departure', 'Arrival', 'Operated'] + stopover_cols, inplace=True)

In [18]:
# Write the final table to CSV, leaving out the index column.
df.to_csv('AirlineData.csv',index=False)