# We are going to write a script that web scrapes one day of flight departures and arrivals from the Berlin Brandenburg Airport (BER) website and combines the results into cleaned tables:

---

### Import every library ever created

In [1]:
# this is the library that we will use to create break times in order to mimic human behaviour
import time
from getpass import getpass

# library for interacting with the operating system
import os

# library for directory location:
import pathlib
from os.path import join

import re
from urllib.parse import urljoin

#Ignore warning -- Some methods are going to be deprecated and I didn't change all (mainly in the function scrapper)
import warnings

# you know pandas it's your best buddy
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Juicy stuff- these are the Classes we will use for interaction with a webpage:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

warnings.filterwarnings('ignore')

### Load a driver and the website

In [2]:
# Start Chrome; the value returned by ChromeDriverManager().install() is
# handed to the Service object that the browser session is created with.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)

In [3]:
# Flight list page; the date and flight type are fixed in the query string.
flight_list_url = 'https://ber.berlin-airport.de/en/flying/departures-arrivals.html?date=2024-06-27&flightType=D&search='
driver.get(flight_list_url)

### Click cookies

In [4]:
# Accept the cookie banner if it is present.
try:
    cookie_button = driver.find_element(By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll")
    cookie_button.click()
    time.sleep(2.5)
except NoSuchElementException:
    # No banner on the page: nothing to accept.
    print("Cookies already accepted.")
except WebDriverException as exc:
    # Still best effort, but only Selenium errors are absorbed now, so a
    # KeyboardInterrupt or a typo in this cell is no longer hidden.
    print(f"Cookie banner could not be dismissed ({type(exc).__name__}).")

### Try to scroll to the end of the page

In [142]:
# Click the page body (presumably to give it keyboard focus), then send the
# END key to it.
page = driver.find_element(By.CSS_SELECTOR, "body[class^='page basicpage']")
page.click()
time.sleep(1)

# Same selector as above, so the element already found is reused instead of
# being looked up a second time.
flight_viewer = page
flight_viewer.send_keys(Keys.END)

### Click "load more" repeatedly until all flights are listed

In [143]:
# Keep clicking the "load more" link until it is gone or hidden, so the whole
# list is on the page before its source is captured.
max_clicks = 200  # safety bound: a link that never disappears cannot loop forever
for _ in range(max_clicks):
    try:
        load_more = driver.find_element(By.CLASS_NAME, "cmp-flightlist__action-link")
    except NoSuchElementException:
        break  # link no longer in the page
    if not load_more.is_displayed():
        break  # link still in the DOM but hidden
    load_more.click()
    time.sleep(1)  # pause before looking for the link again

### Get the html page source

In [8]:
# Snapshot the rendered page and parse it. The parser is named explicitly so
# the result does not depend on which optional parsers are installed.
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

### Get all the flight links and save to a list

In [9]:
# Collect the link of every <a href=...> inside the flight-list containers.
base_url = "https://ber.berlin-airport.de"
flights = soup.find_all('div', attrs={'class': 'cmp-flightlist__list__items'})

# The containers are searched directly instead of being serialised to a string
# and parsed a second time. urljoin keeps an already-absolute href intact and
# inserts the missing "/" for relative ones, which plain concatenation did not.
href_list = [
    urljoin(base_url, a_tag['href'])
    for container in flights
    for a_tag in container.find_all('a', href=True)
]

### Save the links to a file for safekeeping

In [10]:
# Write the collected links to "<arrival|departure>_links_<day>.txt", one per line.
date_of_scrape = input("What Day: ").strip()
arr_dep = input("Arrival or Departure: ").strip().lower()

# The cells that read these files look for "arrival_links_..." and
# "departure_links_...", so any other spelling would create a file they never find.
if arr_dep not in ("arrival", "departure"):
    raise ValueError(f"Expected 'arrival' or 'departure', got {arr_dep!r}")
if not date_of_scrape:
    raise ValueError("The day label must not be empty")

file_path = f"/Users/martynas/Desktop/Ironhack/Week 9:Selenium/Berlin Scraper/Href Links/{arr_dep}_links_{date_of_scrape}.txt"

# Create the target folder on first use instead of failing inside open().
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'w') as file:
    file.writelines(f"{href}\n" for href in href_list)

What Day: 171
Arrival or Departure: arrival


### Open the links file into a list

In [8]:
# Read the saved departure links back, one stripped line per list entry.
with open('/Users/martynas/Desktop/Ironhack/Week 9:Selenium/Berlin Scraper/Href Links/departure_links_161.txt', 'r') as file:
    href_list_dep = list(map(str.strip, file))

In [15]:
# Read the saved arrival links back, one stripped line per list entry.
with open('/Users/martynas/Desktop/Ironhack/Week 9:Selenium/Berlin Scraper/Href Links/arrival_links_161.txt', 'r') as file:
    href_list_arr = list(map(str.strip, file))

### Test for elements

In [16]:
# Open one saved arrival link (index 154) in the browser to try selectors on it.
driver.get(href_list_arr[154])

In [60]:
# Display one saved departure link.
href_list_dep[39]

'https://ber.berlin-airport.de/en/flying/departures-arrivals/flugdetails.html?flightId=1278875'

In [25]:
# Display the first saved arrival link.
href_list_arr[0]

'https://ber.berlin-airport.de/en/flying/departures-arrivals/flugdetails.html?flightId=1248662'

In [42]:
# Text of the first <strong> whose data-flight-data attribute starts with 'dep_expected_time'.
new_element = driver.find_element(By.CSS_SELECTOR,"strong[data-flight-data^='dep_expected_time']").text

In [43]:
# Display the text that was read from the page.
new_element

'18/06/2024 13:00'

## Run to create a database

### Departures

In [11]:
# Visit every departure detail page saved for the chosen day, read the flight
# fields from each one and write the result to an Excel sheet.
flight_data_dep = []
date_of_scrape = input("What Day: ")

with open(f"/Users/martynas/Desktop/Ironhack/Week 9:Selenium/Berlin Scraper/Href Links/departure_links_{date_of_scrape}.txt", 'r') as file:
    # Blank lines are skipped; they would otherwise reach driver.get() as empty URLs.
    href_list_dep = [line.strip() for line in file if line.strip()]

# Output column -> CSS selector of the element that holds the value.
# The dict order is the column order of the sheet.
departure_selectors = {
    "Departure": "h2[data-flight-data^='dep_airport_name']",
    "Destination": "h2[data-flight-data^='arr_airport_name']",
    "Date and Time": "strong[data-flight-data^='dep_scheduled_time']",
    "Actual Departure": "strong[data-flight-data^='dep_expected_time']",
    "Terminal": "i[data-flight-data^='terminal']",
    "Check In": "strong[data-flight-data^='checkin_counter']",
    "Gate": "strong[data-flight-data^='gate']",
    "Flight Number": "strong[data-flight-data^='flight_number_details']",
    "Aircraft Type": "strong[data-flight-data^='aircraft_type']",
    "Plane Reg": "strong[data-flight-data^='aircraft_reg']",
    "Flight Status": "u[data-flight-data^='flight_status_label']",
    "Airline": "span[data-flight-data^='airline_name']",
}

for for_loading, link in enumerate(href_list_dep, start=1):

    driver.get(link)
    time.sleep(1)

    # Best effort: the cookie button is only there until it has been clicked.
    # Only Selenium errors are ignored, so Ctrl-C still stops the run.
    try:
        cookie_button = driver.find_element(By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll")
        cookie_button.click()
        time.sleep(1.5)
    except WebDriverException:
        pass

    # NOTE(review): the zoom is set to 100% and immediately to 67%; kept as in
    # the original in case the reset matters -- confirm whether it is needed.
    driver.execute_script("document.body.style.zoom='100%'")
    driver.execute_script("document.body.style.zoom='67%'")
    time.sleep(1.3)

    # A field missing from one page becomes None instead of aborting the whole
    # run and discarding every row scraped so far.
    flight_idx_list_dep = []
    for selector in departure_selectors.values():
        try:
            flight_idx_list_dep.append(driver.find_element(By.CSS_SELECTOR, selector).text)
        except NoSuchElementException:
            flight_idx_list_dep.append(None)

    flight_data_dep.append(flight_idx_list_dep)

    print(f"{for_loading} out of {len(href_list_dep)} scraped")

departure_dataframe = pd.DataFrame(flight_data_dep, columns=list(departure_selectors))

departure_dataframe.to_excel(f"/Users/martynas/Desktop/Ironhack/Week 9:Selenium/Berlin Scraper/Data/departure_info_{date_of_scrape}.xlsx",
                           sheet_name='Flights',
                           index= False)

What Day: 171
1 out of 251 scraped
2 out of 251 scraped
3 out of 251 scraped
4 out of 251 scraped
5 out of 251 scraped
6 out of 251 scraped
7 out of 251 scraped
8 out of 251 scraped
9 out of 251 scraped
10 out of 251 scraped
11 out of 251 scraped
12 out of 251 scraped
13 out of 251 scraped
14 out of 251 scraped
15 out of 251 scraped
16 out of 251 scraped
17 out of 251 scraped
18 out of 251 scraped
19 out of 251 scraped
20 out of 251 scraped
21 out of 251 scraped
22 out of 251 scraped
23 out of 251 scraped
24 out of 251 scraped
25 out of 251 scraped
26 out of 251 scraped
27 out of 251 scraped
28 out of 251 scraped
29 out of 251 scraped
30 out of 251 scraped
31 out of 251 scraped
32 out of 251 scraped
33 out of 251 scraped
34 out of 251 scraped
35 out of 251 scraped
36 out of 251 scraped
37 out of 251 scraped
38 out of 251 scraped
39 out of 251 scraped
40 out of 251 scraped
41 out of 251 scraped
42 out of 251 scraped
43 out of 251 scraped
44 out of 251 scraped
45 out of 251 scraped
46 ou

### Arrivals

In [12]:
# Visit every arrival detail page saved for the chosen day, read the flight
# fields from each one and write the result to an Excel sheet.
flight_data_arr = []
date_of_scrape = input("What Date: ")

with open(f"/Users/martynas/Desktop/Ironhack/Week 9:Selenium/Berlin Scraper/Href Links/arrival_links_{date_of_scrape}.txt", 'r') as file:
    # Blank lines are skipped; they would otherwise reach driver.get() as empty URLs.
    href_list_arr = [line.strip() for line in file if line.strip()]

# Output column -> CSS selector of the element that holds the value.
# The dict order is the column order of the sheet.
arrival_selectors = {
    "Departure": "h2[data-flight-data^='dep_airport_name']",
    "Destination": "h2[data-flight-data^='arr_airport_name']",
    "Date and Time": "strong[data-flight-data^='arr_scheduled_time']",
    "Actual Arrival": "strong[data-flight-data^='arr_expected_time']",
    "Terminal": "i[data-flight-data^='terminal_arr']",
    "Baggage Claim": "strong[data-flight-data^='arr_belt']",
    "Gate": "strong[data-flight-data^='gate_arr']",
    "Flight Number": "strong[data-flight-data^='flight_number_details']",
    "Aircraft Type": "strong[data-flight-data^='aircraft_type']",
    "Plane Reg": "strong[data-flight-data^='aircraft_reg']",
    "Flight Status": "u[data-flight-data^='flight_status_label']",
    "Airline": "span[data-flight-data^='airline_name']",
}

for for_loading, link in enumerate(href_list_arr, start=1):

    driver.get(link)
    time.sleep(1)

    # Best effort: the cookie button is only there until it has been clicked.
    # Only Selenium errors are ignored, so Ctrl-C still stops the run.
    try:
        cookie_button = driver.find_element(By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll")
        cookie_button.click()
        time.sleep(1.5)
    except WebDriverException:
        pass

    # NOTE(review): the zoom is set to 100% and immediately to 67%; kept as in
    # the original in case the reset matters -- confirm whether it is needed.
    driver.execute_script("document.body.style.zoom='100%'")
    driver.execute_script("document.body.style.zoom='67%'")
    time.sleep(1.3)

    # A field missing from one page becomes None instead of aborting the whole
    # run and discarding every row scraped so far.
    flight_idx_list_arr = []
    for selector in arrival_selectors.values():
        try:
            flight_idx_list_arr.append(driver.find_element(By.CSS_SELECTOR, selector).text)
        except NoSuchElementException:
            flight_idx_list_arr.append(None)

    flight_data_arr.append(flight_idx_list_arr)

    print(f"{for_loading} out of {len(href_list_arr)} scraped")

arrival_dataframe = pd.DataFrame(flight_data_arr, columns=list(arrival_selectors))

arrival_dataframe.to_excel(f"/Users/martynas/Desktop/Ironhack/Week 9:Selenium/Berlin Scraper/Data/arrival_info_{date_of_scrape}.xlsx",
                           sheet_name='Flights',
                           index= False)

What Date: 171
1 out of 249 scraped
2 out of 249 scraped
3 out of 249 scraped
4 out of 249 scraped
5 out of 249 scraped
6 out of 249 scraped
7 out of 249 scraped
8 out of 249 scraped
9 out of 249 scraped
10 out of 249 scraped
11 out of 249 scraped
12 out of 249 scraped
13 out of 249 scraped
14 out of 249 scraped
15 out of 249 scraped
16 out of 249 scraped
17 out of 249 scraped
18 out of 249 scraped
19 out of 249 scraped
20 out of 249 scraped
21 out of 249 scraped
22 out of 249 scraped
23 out of 249 scraped
24 out of 249 scraped
25 out of 249 scraped
26 out of 249 scraped
27 out of 249 scraped
28 out of 249 scraped
29 out of 249 scraped
30 out of 249 scraped
31 out of 249 scraped
32 out of 249 scraped
33 out of 249 scraped
34 out of 249 scraped
35 out of 249 scraped
36 out of 249 scraped
37 out of 249 scraped
38 out of 249 scraped
39 out of 249 scraped
40 out of 249 scraped
41 out of 249 scraped
42 out of 249 scraped
43 out of 249 scraped
44 out of 249 scraped
45 out of 249 scraped
46 o

### Close Driver

In [11]:
# quit() ends the whole WebDriver session (all windows plus the driver
# process); close() only closes the current window and leaves the driver running.
driver.quit()

---

### Combining and Cleaning the dataframes

#### Arrivals

In [15]:
# One Excel file per scraped day. The suffix is the label that was typed at
# the prompt when the scraper wrote arrival_info_<label>.xlsx.
arrival_day_suffixes = ['16', '17', '24', '25', '26', '27', '28', '29', '01', '02',
                        '03', '06', '07', '08', '09', '13', '14', '15', '161', '171']

(df_arr1, df_arr2, df_arr3, df_arr4, df_arr5,
 df_arr6, df_arr7, df_arr8, df_arr9, df_arr10,
 df_arr11, df_arr12, df_arr13, df_arr14, df_arr15,
 df_arr16, df_arr17, df_arr18, df_arr19, df_arr20) = [
    pd.read_excel(f'/Users/martynas/Desktop/Ironhack/Week 9:Selenium/Berlin Scraper/Data/arrival_info_{suffix}.xlsx')
    for suffix in arrival_day_suffixes
]

In [16]:
# Stack the daily arrival tables into one frame with a fresh 0..n-1 index.
arrival_frames = [
    df_arr1, df_arr2, df_arr3, df_arr4, df_arr5,
    df_arr6, df_arr7, df_arr8, df_arr9, df_arr10,
    df_arr11, df_arr12, df_arr13, df_arr14, df_arr15,
    df_arr16, df_arr17, df_arr18, df_arr19, df_arr20,
]
combined_df_arr = pd.concat(arrival_frames, ignore_index=True)

In [18]:
# Clean the combined arrivals table: shorten the Berlin airport name, split
# the timestamps into date / time columns, type the terminal and date columns
# and label each flight On-Time / Delayed / Early.
combined_df_arr['Destination'] = combined_df_arr['Destination'].replace('Berlin Brandenburg Airport', 'Berlin')

# Rows whose 'Departure' is the literal '#####' are removed (presumably pages
# whose data never loaded -- confirm). The copy makes the assignments below
# write to an independent frame instead of a slice of the old one.
combined_df_arr = combined_df_arr[combined_df_arr['Departure'] != '#####'].copy()

# Parse the full timestamps before the raw columns are dropped. Comparing only
# the clock-time strings labels a flight scheduled for 23:50 that lands at
# 00:10 the next day as 'Early'; comparing date and time together does not.
scheduled_dt = pd.to_datetime(combined_df_arr['Date and Time'], dayfirst=True, errors='coerce')
actual_dt = pd.to_datetime(combined_df_arr['Actual Arrival'], dayfirst=True, errors='coerce')

combined_df_arr[['Date', 'Scheduled Time']] = combined_df_arr['Date and Time'].str.split(' ', expand=True)
# Only the clock part of the actual timestamp is kept as a column.
combined_df_arr['Actual Arrival Time'] = combined_df_arr['Actual Arrival'].str.split(' ').str[1]
combined_df_arr = combined_df_arr.drop(columns=['Date and Time', 'Actual Arrival'])

# 'T1' -> 1; the nullable Int64 dtype keeps missing terminals as <NA>.
combined_df_arr['Terminal'] = combined_df_arr['Terminal'].str.replace('T', '', regex=False).astype('Int64')
combined_df_arr['Date'] = pd.to_datetime(combined_df_arr['Date'], dayfirst=True)

conditions = [
    actual_dt.isna() | (actual_dt == scheduled_dt),  # no actual time counts as on time
    scheduled_dt < actual_dt,
    scheduled_dt > actual_dt,
]
choices = ['On-Time', 'Delayed', 'Early']
combined_df_arr['Status'] = np.select(conditions, choices, default='Unknown')

In [19]:
# Display (rows, columns) of the combined arrivals table.
combined_df_arr.shape

(4949, 14)

In [33]:
# Write the arrivals table to an Excel file in the working directory, without the index column.
combined_file = 'arrivals.xlsx'
combined_df_arr.to_excel(excel_writer=combined_file, index=False)

#### Departures

In [20]:
# One Excel file per scraped day. The suffix is the label that was typed at
# the prompt when the scraper wrote departure_info_<label>.xlsx.
departure_day_suffixes = ['16', '17', '24', '25', '26', '27', '28', '29', '01', '02',
                          '03', '06', '07', '08', '09', '13', '14', '15', '161', '171']

(df_dep1, df_dep2, df_dep3, df_dep4, df_dep5,
 df_dep6, df_dep7, df_dep8, df_dep9, df_dep10,
 df_dep11, df_dep12, df_dep13, df_dep14, df_dep15,
 df_dep16, df_dep17, df_dep18, df_dep19, df_dep20) = [
    pd.read_excel(f'/Users/martynas/Desktop/Ironhack/Week 9:Selenium/Berlin Scraper/Data/departure_info_{suffix}.xlsx')
    for suffix in departure_day_suffixes
]

In [21]:
# Stack the daily departure tables into one frame with a fresh 0..n-1 index.
departure_frames = [
    df_dep1, df_dep2, df_dep3, df_dep4, df_dep5,
    df_dep6, df_dep7, df_dep8, df_dep9, df_dep10,
    df_dep11, df_dep12, df_dep13, df_dep14, df_dep15,
    df_dep16, df_dep17, df_dep18, df_dep19, df_dep20,
]
combined_df_dep = pd.concat(departure_frames, ignore_index=True)

In [23]:
# Clean the combined departures table: shorten the Berlin airport name, split
# the timestamps into date / time columns, type the terminal and date columns
# and label each flight On-Time / Delayed / Early.
combined_df_dep['Departure'] = combined_df_dep['Departure'].replace('Berlin Brandenburg Airport', 'Berlin')

# Rows whose 'Departure' is the literal '#####' are removed (presumably pages
# whose data never loaded -- confirm). The copy makes the assignments below
# write to an independent frame instead of a slice of the old one.
combined_df_dep = combined_df_dep[combined_df_dep['Departure'] != '#####'].copy()

# Parse the full timestamps before the raw columns are dropped. Comparing only
# the clock-time strings labels a flight scheduled for 23:50 that leaves at
# 00:10 the next day as 'Early'; comparing date and time together does not.
scheduled_dt = pd.to_datetime(combined_df_dep['Date and Time'], dayfirst=True, errors='coerce')
actual_dt = pd.to_datetime(combined_df_dep['Actual Departure'], dayfirst=True, errors='coerce')

combined_df_dep[['Date', 'Scheduled Time']] = combined_df_dep['Date and Time'].str.split(' ', expand=True)
# Only the clock part of the actual timestamp is kept as a column.
combined_df_dep['Actual Departure Time'] = combined_df_dep['Actual Departure'].str.split(' ').str[1]
combined_df_dep = combined_df_dep.drop(columns=['Date and Time', 'Actual Departure'])

# 'T1' -> 1; the nullable Int64 dtype keeps missing terminals as <NA>.
combined_df_dep['Terminal'] = combined_df_dep['Terminal'].str.replace('T', '', regex=False).astype('Int64')
combined_df_dep['Date'] = pd.to_datetime(combined_df_dep['Date'], dayfirst=True)

conditions = [
    actual_dt.isna() | (actual_dt == scheduled_dt),  # no actual time counts as on time
    scheduled_dt < actual_dt,
    scheduled_dt > actual_dt,
]
choices = ['On-Time', 'Delayed', 'Early']
combined_df_dep['Status'] = np.select(conditions, choices, default='Unknown')

In [24]:
# Display (rows, columns) of the combined departures table.
combined_df_dep.shape

(5070, 14)

In [26]:
# Build full timestamps from the flight date plus the scheduled / actual clock
# times and store their difference in 'Time Difference'.

# 'Date' is a datetime column; it is turned into a string so the clock time
# can be appended before parsing the combined text again.
combined_df_arr['Date'] = combined_df_arr['Date'].astype(str)

combined_df_arr['Scheduled DateTime'] = pd.to_datetime(combined_df_arr['Date'] + ' ' + combined_df_arr['Scheduled Time'])
combined_df_arr['Actual Arrival DateTime'] = pd.to_datetime(combined_df_arr['Date'] + ' ' + combined_df_arr['Actual Arrival Time'])

# Both timestamps are built on the scheduled date, so an actual time on the
# other side of midnight lands on the wrong day (23:50 -> 00:10 would read as
# -1420 minutes). A gap of more than half a day is therefore shifted by one day.
time_difference = combined_df_arr['Actual Arrival DateTime'] - combined_df_arr['Scheduled DateTime']
half_day = pd.Timedelta(hours=12)
one_day = pd.Timedelta(days=1)
time_difference = time_difference.mask(time_difference < -half_day, time_difference + one_day)
time_difference = time_difference.mask(time_difference > half_day, time_difference - one_day)

combined_df_arr['Time Difference'] = time_difference

In [27]:
# Express the difference in whole minutes; rows with no difference value get 0.
arrival_minutes = combined_df_arr['Time Difference'].dt.total_seconds().div(60)
combined_df_arr['Time Difference (Minutes)'] = arrival_minutes.fillna(0).round().astype(int)

In [28]:
# The helper columns are only needed to compute the minutes; remove them again.
combined_df_arr.drop(
    columns=['Time Difference', 'Actual Arrival DateTime', 'Scheduled DateTime'],
    inplace=True,
)

In [30]:
# Build full timestamps from the flight date plus the scheduled / actual clock
# times and store their difference in 'Time Difference'.

# 'Date' is a datetime column; it is turned into a string so the clock time
# can be appended before parsing the combined text again.
combined_df_dep['Date'] = combined_df_dep['Date'].astype(str)

combined_df_dep['Scheduled DateTime'] = pd.to_datetime(combined_df_dep['Date'] + ' ' + combined_df_dep['Scheduled Time'])
combined_df_dep['Actual Departure DateTime'] = pd.to_datetime(combined_df_dep['Date'] + ' ' + combined_df_dep['Actual Departure Time'])

# Both timestamps are built on the scheduled date, so an actual time on the
# other side of midnight lands on the wrong day (23:50 -> 00:10 would read as
# -1420 minutes). A gap of more than half a day is therefore shifted by one day.
time_difference = combined_df_dep['Actual Departure DateTime'] - combined_df_dep['Scheduled DateTime']
half_day = pd.Timedelta(hours=12)
one_day = pd.Timedelta(days=1)
time_difference = time_difference.mask(time_difference < -half_day, time_difference + one_day)
time_difference = time_difference.mask(time_difference > half_day, time_difference - one_day)

combined_df_dep['Time Difference'] = time_difference

In [31]:
# Express the difference in whole minutes; rows with no difference value get 0.
departure_minutes = combined_df_dep['Time Difference'].dt.total_seconds().div(60)
combined_df_dep['Time Difference (Minutes)'] = departure_minutes.fillna(0).round().astype(int)

In [32]:
# The helper columns are only needed to compute the minutes; remove them again.
combined_df_dep.drop(
    columns=['Time Difference', 'Actual Departure DateTime', 'Scheduled DateTime'],
    inplace=True,
)

In [None]:
# Where no actual time was recorded, fall back to the scheduled time.
# The result is assigned back: an inplace fillna on a selected column acts on
# a temporary object and does not update the frame under copy-on-write.
combined_df_arr['Actual Arrival Time'] = combined_df_arr['Actual Arrival Time'].fillna(combined_df_arr['Scheduled Time'])

In [None]:
# Where no actual time was recorded, fall back to the scheduled time.
# The result is assigned back: an inplace fillna on a selected column acts on
# a temporary object and does not update the frame under copy-on-write.
combined_df_dep['Actual Departure Time'] = combined_df_dep['Actual Departure Time'].fillna(combined_df_dep['Scheduled Time'])

---

### Adding daily temperature and wind speed

In [None]:
# Daily weather, one (date, temperature, wind speed) record per scraped day.
weather_records = [
    ('2024-06-16', 23.8, 10.5),
    ('2024-06-17', 22.2, 9.8),
    ('2024-06-24', 22.2, 5.6),
    ('2024-06-25', 25, 9.4),
    ('2024-06-26', 28.8, 9.2),
    ('2024-06-27', 27.2, 7.4),
    ('2024-06-28', 23.8, 9.8),
    ('2024-06-29', 27.2, 6.5),
    ('2024-07-01', 17.7, 6.5),
    ('2024-07-02', 15, 11),
    ('2024-07-03', 16, 8.9),
    ('2024-07-06', 27.7, 12.5),
    ('2024-07-07', 21.1, 10.1),
    ('2024-07-08', 25, 8.7),
    ('2024-07-09', 28.8, 8.1),
    ('2024-07-13', 18.8, 10.3),
    ('2024-07-14', 22.7, 7.6),
    ('2024-07-15', 27, 7.8),
    ('2024-07-16', 25, 8.7),
    ('2024-07-17', 22.2, 11),
]

# Turn the records back into the column-oriented dict the frame is built from.
weather_dates, weather_temperatures, weather_wind_speeds = zip(*weather_records)
data = {
    'Date': list(weather_dates),
    'Temperature': list(weather_temperatures),
    'Wind Speed': list(weather_wind_speeds),
}

weather_df = pd.DataFrame(data)

weather_df

In [None]:
# Attach the weather columns by date; the left join keeps every flight row.
combined_df_arr = combined_df_arr.merge(weather_df, on='Date', how='left')

In [None]:
# Attach the weather columns by date; the left join keeps every flight row.
combined_df_dep = combined_df_dep.merge(weather_df, on='Date', how='left')

---

### Adding airline category: budget vs premium

In [53]:
# Load the airline list; it is joined to both flight tables on 'Airline' below.
# Presumably it carries the budget/premium category per airline -- confirm against the CSV.
airline_filter = pd.read_csv('/Users/martynas/Desktop/Ironhack/Week 9:Selenium/Berlin Scraper/airlines_list.csv')

In [None]:
# Attach the airline list columns; the left join keeps flights of unlisted airlines.
combined_df_dep = combined_df_dep.merge(airline_filter, on='Airline', how='left')

In [None]:
# Attach the airline list columns; the left join keeps flights of unlisted airlines.
combined_df_arr = combined_df_arr.merge(airline_filter, on='Airline', how='left')

---

### Adding country list

#### Departure airports → country (lookup joined to the arrivals table)

In [None]:
# Lookup table: airport name as it appears in the 'Departure' column -> country.
# The two lists are parallel: entry i of one belongs to entry i of the other.
origin_airports = [
    "Izmir ADB", "Beijing", "New York EWR", "Chisinau", "Doha", "Trieste", "Geneva", "New York JFK",
    "Marseille", "Frankfurt", "Munich", "Rome FCO", "Belgrade", "Vienna", "Amsterdam", "Naples",
    "Antalya", "Paris CDG", "Palma de Mallorca", "Zurich", "Oslo", "Milan LIN", "Istanbul IST",
    "Copenhagen", "Tel Aviv", "Venice", "Tallinn", "Dublin", "Zagreb", "Barcelona", "Dalaman",
    "Bolzano", "London STN", "Manchester", "Madrid", "Luxembourg", "Stockholm ARN", "London LHR",
    "Porto", "Warsaw", "Baku", "Singapore", "Athens", "Riga", "Brussels", "Dubrovnik", "Toulouse",
    "Istanbul SAW", "London LGW", "Reykjavik", "Thessaloniki", "Paris Orly", "Pristina", "Catania",
    "Sofia", "Zadar", "Lisbon", "Helsinki", "Valencia", "Ibiza", "Vilnius", "Monastir", "Malaga",
    "Tirana", "Birmingham", "Castellón de la Plana", "Bergen BGO", "Cairo", "Budapest", "Stuttgart",
    "Milan BGY", "Dusseldorf", "London LCY", "Cologne/Bonn", "Basel", "Gothenburg GOT", "Chania",
    "Nice", "Funchal", "Graz", "Milan MXP", "Glasgow", "Rhodos", "Corfu", "Olbia", "Ankara ESB",
    "Kos", "Saarbrücken", "Bordeaux", "Kaunas", "Skopje", "Salzburg", "Reggio Calabria", "Malta",
    "Edinburgh", "Bologna", "Bristol", "Lyon", "Podgorica", "Split", "Heraklion", "London LTN",
    "Krakow", "Hurghada", "Tbilisi", "Strasbourg", "Marsa Alam", "Kutaisi", "Paphos", "Tunis",
    "Erbil", "Marrakech", "Bari", "Nantes", "Alicante", "Trondheim", "Pisa", "Nottingham EMA",
    "Beirut", "Gaziantep", "Tivat", "Palermo", "Faro", "Bucharest", "Varna", "Tenerife", "Larnaca",
    "Gran Canaria", "Pula", "Verona", "Iasi", "Bourgas", "Rijeka", "Samsun Carsamba", "Bastia",
    "Adana", "Preveza", "Fuerteventura", "Bodrum", "Yerevan", "Djerba", "Zakinthos", "Diyarbakir",
    "Rotterdam", "Kayseri", "Salerno",
]
origin_countries = [
    "Turkey", "China", "USA", "Moldova", "Qatar", "Italy", "Switzerland", "USA", "France",
    "Germany", "Germany", "Italy", "Serbia", "Austria", "Netherlands", "Italy", "Turkey",
    "France", "Spain", "Switzerland", "Norway", "Italy", "Turkey", "Denmark", "Israel",
    "Italy", "Estonia", "Ireland", "Croatia", "Spain", "Turkey", "Italy", "UK", "UK", "Spain",
    "Luxembourg", "Sweden", "UK", "Portugal", "Poland", "Azerbaijan", "Singapore", "Greece",
    "Latvia", "Belgium", "Croatia", "France", "Turkey", "UK", "Iceland", "Greece", "France",
    "Kosovo", "Italy", "Bulgaria", "Croatia", "Portugal", "Finland", "Spain", "Spain", "Lithuania",
    "Tunisia", "Spain", "Albania", "UK", "Spain", "Norway", "Egypt", "Hungary", "Germany",
    "Italy", "Germany", "UK", "Germany", "Switzerland", "Sweden", "Greece", "France", "Portugal",
    "Austria", "Italy", "UK", "Greece", "Greece", "Italy", "Turkey", "Greece", "Germany", "France",
    "Lithuania", "North Macedonia", "Austria", "Italy", "Malta", "UK", "Italy", "UK", "France",
    "Montenegro", "Croatia", "Greece", "UK", "Poland", "Egypt", "Georgia", "France", "Egypt",
    "Georgia", "Cyprus", "Tunisia", "Iraq", "Morocco", "Italy", "France", "Spain", "Norway",
    "Italy", "UK", "Lebanon", "Turkey", "Montenegro", "Italy", "Portugal", "Romania", "Bulgaria",
    "Spain", "Cyprus", "Spain", "Croatia", "Italy", "Romania", "Bulgaria", "Croatia", "Turkey",
    "France", "Turkey", "Greece", "Spain", "Turkey", "Armenia", "Tunisia", "Greece", "Turkey",
    "Netherlands", "Turkey", "Italy",
]

data = {"Departure": origin_airports, "Country": origin_countries}

# Create the DataFrame
arr_df = pd.DataFrame(data)
arr_df

#### Destination airports → country (lookup joined to the departures table)

In [None]:
# Lookup table: airport name as it appears in the 'Destination' column -> country.
# The two lists are parallel: entry i of one belongs to entry i of the other.
destination_airports = [
    'Porto', 'Malaga', 'Ibiza', 'Palma de Mallorca', 'Venice',
    'Catania', 'Tel Aviv', 'Amsterdam', 'Monastir', 'Lisbon',
    'Thessaloniki', 'Valencia', 'Paris CDG', 'Pristina', 'Dubrovnik',
    'Munich', 'Paris Orly', 'Riga', 'Copenhagen', 'Brussels',
    'Istanbul IST', 'Luxembourg', 'Vienna', 'Zurich', 'Madrid',
    'London LHR', 'Funchal', 'Sofia', 'Frankfurt', 'Izmir ADB',
    'Trieste', 'Castellón de la Plana', 'Chisinau', 'Geneva',
    'Marseille', 'Rome FCO', 'Belgrade', 'Naples', 'Antalya', 'Oslo',
    'Doha', 'New York EWR', 'London LGW', 'Chania', 'New York JFK',
    'Dublin', 'Tallinn', 'Barcelona', 'Manchester', 'Rhodos', 'Zagreb',
    'Bolzano', 'London STN', 'Stuttgart', 'Dalaman', 'Warsaw',
    'Athens', 'Baku', 'Singapore', 'Toulouse', 'Corfu', 'Kos',
    'Reykjavik', 'Istanbul SAW', 'Olbia', 'Nice', 'Hurghada',
    'Helsinki', 'Ankara ESB', 'Vilnius', 'Milan MXP', 'Gothenburg GOT',
    'Beijing', 'Milan LIN', 'Tirana', 'Heraklion', 'Birmingham',
    'Bordeaux', 'Cologne/Bonn', 'Bergen BGO', 'Stockholm ARN',
    'Budapest', 'Cairo', 'Dusseldorf', 'Milan BGY', 'London LCY',
    'Bologna', 'Podgorica', 'Basel', 'Split', 'Salzburg', 'Glasgow',
    'Saarbrücken', 'Krakow', 'Tbilisi', 'Kaunas', 'Reggio Calabria',
    'Skopje', 'Graz', 'Edinburgh', 'Malta', 'Lyon', 'Bristol',
    'London LTN', 'Zadar', 'Strasbourg', 'Bari', 'Paphos',
    'Marsa Alam', 'Kutaisi', 'Tunis', 'Marrakech', 'Erbil', 'Beirut',
    'Nantes', 'Alicante', 'Trondheim', 'Pisa', 'Nottingham EMA',
    'Tivat', 'Diyarbakir', 'Bucharest', 'Varna', 'Gran Canaria',
    'Larnaca', 'Palermo', 'Faro', 'Tenerife', 'Pula', 'Bourgas',
    'Verona', 'Rijeka', 'Iasi', 'Samsun Carsamba', 'Bastia', 'Adana',
    'Fuerteventura', 'Preveza', 'Gaziantep', 'Bodrum', 'Yerevan',
    'Zakinthos', 'Djerba', 'Kayseri', 'Rotterdam', 'Berlin', 'Salerno',
]
destination_countries = [
    'Portugal', 'Spain', 'Spain', 'Spain', 'Italy',
    'Italy', 'Israel', 'Netherlands', 'Tunisia', 'Portugal',
    'Greece', 'Spain', 'France', 'Kosovo', 'Croatia',
    'Germany', 'France', 'Latvia', 'Denmark', 'Belgium',
    'Turkey', 'Luxembourg', 'Austria', 'Switzerland', 'Spain',
    'UK', 'Portugal', 'Bulgaria', 'Germany', 'Turkey',
    'Italy', 'Spain', 'Moldova', 'Switzerland',
    'France', 'Italy', 'Serbia', 'Italy', 'Turkey', 'Norway',
    'Qatar', 'USA', 'UK', 'Greece', 'USA',
    'Ireland', 'Estonia', 'Spain', 'UK', 'Greece', 'Croatia',
    'Italy', 'UK', 'Germany', 'Turkey', 'Poland',
    'Greece', 'Azerbaijan', 'Singapore', 'France', 'Greece',
    'Greece', 'Iceland', 'Turkey', 'Italy', 'France', 'Egypt',
    'Finland', 'Turkey', 'Lithuania', 'Italy', 'Sweden',
    'China', 'Italy', 'Albania', 'Greece', 'UK',
    'France', 'Germany', 'Norway', 'Sweden',
    'Hungary', 'Egypt', 'Germany', 'Italy', 'UK',
    'Italy', 'Montenegro', 'Switzerland', 'Croatia', 'Austria', 'UK',
    'Germany', 'Poland', 'Georgia', 'Lithuania', 'Italy',
    'North Macedonia', 'Austria', 'UK', 'Malta', 'France', 'UK',
    'UK', 'Croatia', 'France', 'Italy', 'Cyprus',
    'Egypt', 'Georgia', 'Tunisia', 'Morocco', 'Iraq', 'Lebanon',
    'France', 'Spain', 'Norway', 'Italy', 'UK',
    'Montenegro', 'Turkey', 'Romania', 'Bulgaria', 'Spain',
    'Cyprus', 'Italy', 'Portugal', 'Spain', 'Croatia',
    'Bulgaria', 'Italy', 'Croatia', 'Romania', 'Turkey', 'France',
    'Turkey', 'Spain', 'Greece', 'Turkey', 'Turkey',
    'Armenia', 'Greece', 'Tunisia', 'Turkey', 'Netherlands',
    'Germany', 'Italy',
]

data = {"Destination": destination_airports, "Country": destination_countries}

# Create DataFrame
dep_df = pd.DataFrame(data)
dep_df

In [None]:
# Add the country of the airport each arriving flight departed from.
combined_df_arr = combined_df_arr.merge(arr_df, on='Departure', how='left')

In [None]:
# Add the country of the airport each departing flight is headed to.
combined_df_dep = combined_df_dep.merge(dep_df, on='Destination', how='left')

---

## Export and save

In [17]:
# Write the arrivals table to an Excel file in the working directory, without the index column.
combined_file = 'arrivals.xlsx'
combined_df_arr.to_excel(excel_writer=combined_file, index=False)

In [5]:
# Write the departures table to an Excel file in the working directory, without the index column.
combined_file = 'departures.xlsx'
combined_df_dep.to_excel(excel_writer=combined_file, index=False)

---

In [2]:
# Load the arrivals table stored in the GOLDEN TABLES folder.
golden_arr = pd.read_excel('/Users/martynas/Desktop/Ironhack/Week 9:Selenium/Berlin Scraper/GOLDEN TABLES/arrivals.xlsx')

In [2]:
# Load the departures table stored in the GOLDEN TABLES folder.
golden_dep = pd.read_excel('/Users/martynas/Desktop/Ironhack/Week 9:Selenium/Berlin Scraper/GOLDEN TABLES/departures.xlsx')