# Flight Finder

In [1]:
import smtplib
import pandas as pd
from time import sleep
from datetime import datetime, timedelta
from random import randint
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

In [2]:
# Path of the ChromeDriver executable handed to Selenium's Service.
chrome_driver_path = 'chromedriver.exe'

# URL template of the Kayak "explore" page; the placeholder receives the
# origin airport code.
kayak = "https://www.kayak.com/explore/{}-anywhere/"

# Origin airports to search from, given as IATA airport codes.
airports = ['RIX', 'TBS']

# Search window: earliest departure day and latest return day.
from_date = datetime(2022, 10, 1)
to_date = datetime(2022, 10, 24)

# Allowed trip length in days; both bounds are inclusive.
duration_min = 4
duration_max = 8

In [3]:
class FlightLoader:
    """Scrape the destination cards of one Kayak "explore" page with Selenium.

    A loader is bound to a single URL. ``get_flights`` opens the page in a new
    Chrome session, repeatedly loads more destinations and zooms the map out,
    and returns everything it collected as a pandas DataFrame.

    Parameters
    ----------
    url : str
        Page to open.
    airport : str or None
        Origin airport code. When given, the price column of the result is
        named ``"Price from <airport>"`` instead of ``"Price"``.
    load_n : int
        How many times the "show more" button is pressed per collection pass.
    range : int
        How many times the map is zoomed out (one extra collection pass each).
        The name shadows the builtin only inside ``__init__``; it is kept for
        backward compatibility with existing callers.
    """

    # XPath 1.0 has no ends-with(), so "id ends with <suffix>" is spelled out
    # with substring()/string-length().
    SHOW_MORE_XPATH = '//*[substring(@id, string-length(@id) - string-length("-showMoreButton") +1) = "-showMoreButton"]'
    ZOOM_OUT_XPATH = '//*[substring(@id, string-length(@id) - string-length("-zoomControl-minusButton") +1) = "-zoomControl-minusButton"]'
    # Elements whose text is read as one destination card.
    CARD_XPATH = '//*[@class = "_iae _lc _ss"]'

    def __init__(self, url, airport=None, load_n=3, range=3):
        self.TIME = 0.5  # seconds to wait after loading more destinations
        self.url = url
        self.airport = airport
        self.load_n = load_n
        self.range = range
        self.driver = None  # set by get_flights()

    def increase_time(self):
        """Lengthen the post-load wait by 0.25 s (used by callers before a retry)."""
        self.TIME += 0.25

    def press_button(self, button, n_presses=1, max_retries=20):
        """Click the element matching XPath ``button`` ``n_presses`` times.

        A failed lookup/click sends Ctrl+R to the page body, waits 0.5 s and
        tries again. After ``max_retries`` consecutive failures a
        ``RuntimeError`` is raised instead of looping forever.
        """
        presses = 0
        failures = 0
        while presses < n_presses:
            try:
                self.driver.find_element(By.XPATH, button).click()
            except Exception as exc:
                failures += 1
                if failures > max_retries:
                    raise RuntimeError(
                        f"Could not click {button!r} after {max_retries} retries"
                    ) from exc
                self.driver.find_element(By.TAG_NAME, "Body").send_keys(Keys.CONTROL + 'r')
                sleep(0.5)
            else:
                presses += 1
                failures = 0  # only consecutive failures count
                sleep(0.1)

    def load_more_destinations(self):
        """Press the "show more" button ``load_n`` times."""
        self.press_button(self.SHOW_MORE_XPATH, n_presses=self.load_n)

    def zoom_out(self):
        """Send Ctrl+Home to the page body, then press the map's minus button once."""
        self.driver.find_element(By.TAG_NAME, "Body").send_keys(Keys.CONTROL + Keys.HOME)
        self.press_button(self.ZOOM_OUT_XPATH)

    def add_flights(self):
        """Load more destinations, wait ``TIME`` seconds, return each card's text."""
        self.load_more_destinations()
        sleep(self.TIME)
        return [card.text for card in self.driver.find_elements(By.XPATH, self.CARD_XPATH)]

    def to_pandas(self, flight_list):
        """Turn ``[city, price, country, date]`` rows into a de-duplicated DataFrame.

        The price text is reduced to its first run of digits (thousands commas
        removed first), so ``"from $49"`` and ``"from $1,234"`` both parse.
        The date text is split on ``'-'`` into departure and arrival columns.

        Raises if a row does not have exactly four fields or a price contains
        no digits; callers rely on that to detect a page that was not fully
        loaded.
        """
        flights = pd.DataFrame(flight_list, columns=['City', 'Price', 'Country', 'Date'])
        flights = flights[['Country', 'City', 'Price', 'Date']].copy()
        flights['Price'] = (
            flights['Price']
            .str.replace(',', '', regex=False)
            .str.extract(r'(\d+)', expand=False)
            .astype('int64')
        )
        flights[['Departure date', 'Arrival date']] = flights['Date'].str.split('-', expand=True)
        flights = flights.drop(columns='Date')
        if self.airport is not None:
            flights = flights.rename(columns={'Price': f"Price from {self.airport}"})
        return flights.drop_duplicates()

    def get_flights(self):
        """Open ``url`` in a new Chrome session and scrape it into a DataFrame.

        Uses the module-level ``chrome_driver_path``. The browser is always
        quit, even when scraping fails, so retries do not pile up Chrome
        processes.
        """
        self.driver = webdriver.Chrome(service=Service(chrome_driver_path))
        try:
            self.driver.get(self.url)

            flights = self.add_flights()
            for _ in range(self.range):
                self.zoom_out()
                flights += self.add_flights()
        finally:
            self.driver.quit()

        return self.to_pandas([x.split('\n') for x in flights if x != ''])

In [4]:
# Every (departure, return) pair inside the search window whose trip length
# is between duration_min and duration_max days (inclusive). Departures are
# enumerated day by day from from_date; a pair is kept only if the return
# day does not fall after to_date.
dates = [
    (departure, departure + timedelta(days=trip_days))
    for departure in (
        from_date + timedelta(days=offset)
        for offset in range((to_date - from_date).days)
    )
    for trip_days in range(duration_min, duration_max + 1)
    if departure + timedelta(days=trip_days) <= to_date
]

In [5]:
# Scrape every (departure, return) pair for every origin airport.
# A page that fails to scrape is retried with a longer wait, but only up to
# max_attempts times; after that the date pair is skipped and the last error
# is reported, instead of retrying forever with the cause hidden.
max_attempts = 5

all_flights = []
for i, airport in enumerate(airports):
    print(f"Step {i+1}/{len(airports)}")
    flights = []
    for s_date, e_date in tqdm(dates):
        url = kayak.format(airport) + s_date.strftime("%Y%m%d") + ',' + e_date.strftime("%Y%m%d")
        loader = FlightLoader(url, airport)
        last_error = None
        for _attempt in range(max_attempts):
            try:
                flights.append(loader.get_flights())
            except Exception as exc:
                # Keep only the text: holding the exception object would also
                # keep its traceback (and through it the loader) alive.
                last_error = repr(exc)
                loader.increase_time()
            else:
                break
        else:
            # tqdm.write prints without breaking the progress bar.
            tqdm.write(f"Skipping {url} after {max_attempts} failed attempts: {last_error}")
        del loader
        sleep(randint(1, 3))  # random pause between page loads
    if not flights:
        raise RuntimeError(f"No flights could be collected for {airport}")
    all_flights.append(pd.concat(flights).drop_duplicates())

Step 1/2


100%|████████████████████████████████████████████████████████████████████████████████| 90/90 [1:16:41<00:00, 51.13s/it]


Step 2/2


100%|██████████████████████████████████████████████████████████████████████████████| 90/90 [15:15:11<00:00, 610.13s/it]


In [6]:
# Join the results of the two origin airports on their shared columns, add
# the summed price, sort cheapest-first and move the price columns ahead of
# the date columns.
flight_list = (
    all_flights[0]
    .rename(columns={'Price': f"Price from {airports[0]}"})
    .merge(all_flights[1].rename(columns={'Price': f"Price from {airports[1]}"}))
    .assign(**{
        'Combined price': lambda merged: (
            merged[f"Price from {airports[0]}"] + merged[f"Price from {airports[1]}"]
        )
    })
    .sort_values('Combined price')
    .iloc[:, [0, 1, 2, 5, 6, 3, 4]]
)

In [7]:
# Display the merged table; it was sorted by 'Combined price' when it was built.
flight_list

Unnamed: 0,Country,City,Price from RIX,Price from TBS,Combined price,Departure date,Arrival date
927,Estonia,Tallinn,49,228,277,"Fri, Oct 14","Fri, Oct 21"
863,Poland,Warsaw,23,258,281,"Thu, Oct 13","Thu, Oct 20"
781,Austria,Vienna,44,243,287,"Wed, Oct 12","Tue, Oct 18"
1128,Denmark,Copenhagen,76,232,308,"Mon, Oct 17","Sat, Oct 22"
890,Belgium,Brussels,57,254,311,"Fri, Oct 14","Tue, Oct 18"
...,...,...,...,...,...,...,...
105,Hungary,Budapest,243,375,618,"Sun, Oct 2","Sat, Oct 8"
178,Portugal,Lisbon,167,458,625,"Mon, Oct 3","Mon, Oct 10"
86,Cyprus,Larnaca,198,525,723,"Sun, Oct 2","Fri, Oct 7"
107,Turkey,Diyarbakır,350,418,768,"Sun, Oct 2","Sat, Oct 8"
