In [29]:
import requests
from math import sin, cos, sqrt, atan2, radians
import pandas as pd
import numpy as np

In [30]:
def get_distance(lat1, lon1, lat2, lon2):
    
    # approximate radius of earth in km
    R = 6373.0
    Miles_per_Km = 0.621371

    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c * Miles_per_Km

    return distance

In [45]:
def find_airports(lat, long, airports, r):
    ret = []
    best = r
    for index, row in airports.iterrows():
        distance = get_distance(lat, long, row['Latitude'], row['Longitude'])
        if distance <= r:
#             print(row)
            data = {'distance':distance,'airport':row['IATA']}
            if distance < best:
                ret.insert(0,data)
                best = distance
            else:
                ret.append(data)
    return ret

In [46]:

def get_events(keyword, my_location):
    remaining_pages = True
    page_number = 5
    page_count = 0
    ret = []
    while remaining_pages:
        URL = f"https://app.ticketmaster.com/discovery/v2/events.json"
        PARAMS = {'size':1,'apikey':'kNkNPU0S8LOzOUrjDWpLto6LEv91lAFM','keyword':keyword, 'page':page_number, 'countryCode':'US'} 
        print(PARAMS)
        r = requests.get(url = URL, params = PARAMS) 
        data = r.json()
        event = data['_embedded']['events'][0]
        event_id = event['id']
        sales = event['sales']['public']
        dates = event['dates']['start']
        prices = event['priceRanges']
        venue = event['_embedded']['venues'][0]
        location = {'address':venue['address']['line1'],'city':venue['city']['name'],'state':venue['state']['stateCode'],
                    'latlong':[float(venue['location']['latitude']),float(venue['location']['longitude'])]}
        page = data['page']
        page_number = page['number']
        total_pages = page['totalPages']
    #     print(f"event_id: {event_id}\nsales:{sales}\ndates:{dates}\nprices:{prices}\nlocation:{location}")
    #     print(f"page:{page}\npage_number:{page_number}\ntotal_pages:{total_pages}")

        event_date = dates['localDate']
        event_time = dates['localTime']
        price_range = [prices[0]['min'],prices[0]['max']]
        distance = get_distance(my_location[0],my_location[1],location['latlong'][0],location['latlong'][1])
        event_line ={'event_date': event_date, 'event_time':event_time, 'price_range':price_range, 'location':location, 'distance':distance}

#         print(event_line)
        ret.append(event_line)

        remaining_pages = (page_number < total_pages) and page_count < 5
        page_number += 1
        page_count += 1

    return ret

    

In [41]:
airports = pd.read_csv("airports.csv").rename(columns=lambda x: x.strip())

In [42]:
us_airports = airports.loc[airports['Country'] == 'United States']

In [43]:
slc_lat_long = [40.790152, -111.979038]
events = get_events('eagles', slc_lat_long)

{'size': 1, 'apikey': 'kNkNPU0S8LOzOUrjDWpLto6LEv91lAFM', 'keyword': 'eagles', 'page': 5, 'countryCode': 'US'}
{'size': 1, 'apikey': 'kNkNPU0S8LOzOUrjDWpLto6LEv91lAFM', 'keyword': 'eagles', 'page': 6, 'countryCode': 'US'}
{'size': 1, 'apikey': 'kNkNPU0S8LOzOUrjDWpLto6LEv91lAFM', 'keyword': 'eagles', 'page': 7, 'countryCode': 'US'}
{'size': 1, 'apikey': 'kNkNPU0S8LOzOUrjDWpLto6LEv91lAFM', 'keyword': 'eagles', 'page': 8, 'countryCode': 'US'}
{'size': 1, 'apikey': 'kNkNPU0S8LOzOUrjDWpLto6LEv91lAFM', 'keyword': 'eagles', 'page': 9, 'countryCode': 'US'}
{'size': 1, 'apikey': 'kNkNPU0S8LOzOUrjDWpLto6LEv91lAFM', 'keyword': 'eagles', 'page': 10, 'countryCode': 'US'}


In [47]:
for event in events:
    latlong = event['location']['latlong']
    airports = find_airports(latlong[0], latlong[1], us_airports, 20)
    event['airports']=airports
    print(event)

{'event_date': '2021-09-18', 'event_time': '20:00:00', 'price_range': [129.0, 750.0], 'location': {'address': '1000 Chopper Circle', 'city': 'Denver', 'state': 'CO', 'latlong': [39.74724, -105.010166]}, 'distance': 374.4053756889388, 'airports': [{'distance': 12.523377317459463, 'airport': 'BJC'}, {'distance': 14.08002503099181, 'airport': 'BFK'}, {'distance': 19.572412819137064, 'airport': 'DEN'}, {'distance': 14.947673242343107, 'airport': 'APA'}]}
{'event_date': '2021-09-16', 'event_time': '20:00:00', 'price_range': [129.0, 750.0], 'location': {'address': '1000 Chopper Circle', 'city': 'Denver', 'state': 'CO', 'latlong': [39.74724, -105.010166]}, 'distance': 374.4053756889388, 'airports': [{'distance': 12.523377317459463, 'airport': 'BJC'}, {'distance': 14.08002503099181, 'airport': 'BFK'}, {'distance': 19.572412819137064, 'airport': 'DEN'}, {'distance': 14.947673242343107, 'airport': 'APA'}]}
{'event_date': '2021-09-21', 'event_time': '20:00:00', 'price_range': [129.0, 750.0], 'loc

In [87]:
import datetime
event = events[0]

origin = "SLC"
destination = event['airports'][0]['airport']
startdate = event['event_date']

date_1 = datetime.datetime.strptime(startdate, "%Y-%m-%d")

enddate = (date_1 + datetime.timedelta(days=1)).strftime("%Y-%m-%d")

url = f"https://www.kayak.com/flights/{origin}-{destination}/{startdate}/{enddate}?sort=bestflight_a&fs=stops=0"

print(url)
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np

chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome("./chromedriver")
driver.implicitly_wait(20)
driver.get(url)





https://www.kayak.com/flights/SLC-BOS/2020-03-25/2020-03-26?sort=bestflight_a&fs=stops=0


In [82]:
soup=BeautifulSoup(driver.page_source, 'lxml')

In [86]:
deptimes = soup.find_all('span', attrs={'class': 'depart-time base-time'})
arrtimes = soup.find_all('span', attrs={'class': 'arrival-time base-time'})
meridies = soup.find_all('span', attrs={'class': 'time-meridiem meridiem'})

deptime = []
for div in deptimes:
    deptime.append(div.getText()[:-1])    

arrtime = []
for div in arrtimes:
    arrtime.append(div.getText()[:-1])   

meridiem = []
for div in meridies:
    meridiem.append(div.getText())  

deptime = np.asarray(deptime)
deptime = deptime.reshape(int(len(deptime)/2), 2)

arrtime = np.asarray(arrtime)
arrtime = arrtime.reshape(int(len(arrtime)/2), 2)      

meridiem = np.asarray(meridiem)
meridiem = meridiem.reshape(int(len(meridiem)/4), 4)

#Get the price
regex = re.compile('Common-Booking-MultiBookProvider (.*)multi-row Theme-featured-large(.*)')
price_list = soup.find_all('div', attrs={'class': regex})

price = []
for div in price_list:
    price.append(int(div.getText().split('\n')[3][1:-1]))

df = pd.DataFrame({"origin" : origin,
                   "destination" : destination,
                   "startdate" : startdate,
                   "enddate" : enddate,
                   "price": price,
                   "currency": "USD",
                   "deptime_o": [m+str(n) for m,n in zip(deptime[:,0],meridiem[:,0])],
                   "arrtime_d": [m+str(n) for m,n in zip(arrtime[:,0],meridiem[:,1])],
                   "deptime_d": [m+str(n) for m,n in zip(deptime[:,1],meridiem[:,2])],
                   "arrtime_o": [m+str(n) for m,n in zip(arrtime[:,1],meridiem[:,3])]
                   })
results = pd.concat([results, df], sort=False)

driver.close() #close the browser

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=80.0.3987.122)


In [84]:
df

Unnamed: 0,origin,destination,startdate,enddate,price,currency,deptime_o,arrtime_d,deptime_d,arrtime_o
0,SLC,BOS,2020-03-25,2020-03-26,36,USD,11:58pm,6:40am,7:31pm,11:08pm
1,SLC,BOS,2020-03-25,2020-03-26,37,USD,11:59pm,6:28am,7:31pm,11:08pm
2,SLC,BOS,2020-03-25,2020-03-26,37,USD,11:59pm,6:28am,5:25pm,9:03pm
3,SLC,BOS,2020-03-25,2020-03-26,39,USD,11:58pm,6:40am,5:25pm,9:03pm
4,SLC,BOS,2020-03-25,2020-03-26,41,USD,11:59pm,6:28am,7:00pm,10:40pm
5,SLC,BOS,2020-03-25,2020-03-26,45,USD,5:24pm,11:59pm,6:36am,10:11am
6,SLC,BOS,2020-03-25,2020-03-26,43,USD,11:58pm,6:40am,7:00pm,10:40pm
7,SLC,BOS,2020-03-25,2020-03-26,44,USD,5:24pm,11:59pm,7:31pm,11:08pm
8,SLC,BOS,2020-03-25,2020-03-26,47,USD,9:55am,4:26pm,5:25pm,9:03pm
9,SLC,BOS,2020-03-25,2020-03-26,47,USD,5:24pm,11:59pm,5:25pm,9:03pm


In [26]:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
from datetime import date, timedelta, datetime
import time

def scrape(origin, destination, startdate, days, requests):
    
    global results
    
    enddate = datetime.strptime(startdate, '%Y-%m-%d').date() + timedelta(days)
    enddate = enddate.strftime('%Y-%m-%d')

    url = "https://www.kayak.com/flights/" + origin + "-" + destination + "/" + startdate + "/" + enddate + "?sort=bestflight_a"
    print("\n" + url)

    chrome_options = webdriver.ChromeOptions()
    agents = ["Chrome/80.0.3987.106"]
    print("User agent: " + agents[(requests%len(agents))])
    chrome_options.add_argument('--user-agent=' + agents[(requests%len(agents))] + '"')    
    chrome_options.add_experimental_option('useAutomationExtension', False)
    
    driver = webdriver.Chrome("./chromedriver", options=chrome_options, desired_capabilities=chrome_options.to_capabilities())
    driver.implicitly_wait(20)
    driver.get(url)

    #Check if Kayak thinks that we're a bot
    time.sleep(5) 
    soup=BeautifulSoup(driver.page_source, 'lxml')

    if soup.find_all('p')[0].getText() == "Please confirm that you are a real KAYAK user.":
        print("Kayak thinks I'm a bot, which I am ... so let's wait a bit and try again")
        driver.close()
        time.sleep(20)
        return "failure"

    time.sleep(20) #wait 20sec for the page to load
    
    soup=BeautifulSoup(driver.page_source, 'lxml')
    
    #get the arrival and departure times
    deptimes = soup.find_all('span', attrs={'class': 'depart-time base-time'})
    arrtimes = soup.find_all('span', attrs={'class': 'arrival-time base-time'})
    meridies = soup.find_all('span', attrs={'class': 'time-meridiem meridiem'})
    
    deptime = []
    for div in deptimes:
        deptime.append(div.getText()[:-1])    
        
    arrtime = []
    for div in arrtimes:
        arrtime.append(div.getText()[:-1])   

    meridiem = []
    for div in meridies:
        meridiem.append(div.getText())  
        
    deptime = np.asarray(deptime)
    deptime = deptime.reshape(int(len(deptime)/2), 2)
    
    arrtime = np.asarray(arrtime)
    arrtime = arrtime.reshape(int(len(arrtime)/2), 2)      
    
    meridiem = np.asarray(meridiem)
    meridiem = meridiem.reshape(int(len(meridiem)/4), 4)
        
    #Get the price
    regex = re.compile('Common-Booking-MultiBookProvider (.*)multi-row Theme-featured-large(.*)')
    price_list = soup.find_all('div', attrs={'class': regex})
    
    price = []
    for div in price_list:
        price.append(int(div.getText().split('\n')[3][1:-1]))

    df = pd.DataFrame({"origin" : origin,
                       "destination" : destination,
                       "startdate" : startdate,
                       "enddate" : enddate,
                       "price": price,
                       "currency": "USD",
                       "deptime_o": [m+str(n) for m,n in zip(deptime[:,0],meridiem[:,0])],
                       "arrtime_d": [m+str(n) for m,n in zip(arrtime[:,0],meridiem[:,1])],
                       "deptime_d": [m+str(n) for m,n in zip(deptime[:,1],meridiem[:,2])],
                       "arrtime_o": [m+str(n) for m,n in zip(arrtime[:,1],meridiem[:,3])]
                       })

    results = pd.concat([results, df], sort=False)

    driver.close() #close the browser

    time.sleep(15) #wait 15sec until the next request
    
    return "success"

#Create an empty dataframe 
results = pd.DataFrame(columns=['origin','destination','startdate','enddate','deptime_o','arrtime_d','deptime_d','arrtime_o','currency','price'])

requests = 0 

destinations = ['SDF']
startdates = ['2020-03-26','2020-03-27','2020-03-28']

for destination in destinations:
    for startdate in startdates:   
        requests = requests + 1
        while scrape('SLC', destination, startdate, 3, requests) != "success":
            requests = requests + 1
            
#Find the minimum price for each destination-startdate-combination
results_agg = results.groupby(['destination','startdate'])['price'].min().reset_index().rename(columns={'min':'price'})     


https://www.kayak.com/flights/SLC-SDF/2020-03-26/2020-03-29?sort=bestflight_a
User agent: Chrome/80.0.3987.106

https://www.kayak.com/flights/SLC-SDF/2020-03-27/2020-03-30?sort=bestflight_a
User agent: Chrome/80.0.3987.106


KeyboardInterrupt: 

In [None]:
https://www.kayak.com/flights/SLC-SDF/2020-03-27/2020-04-03?sort=bestflight_a
https://www.kayak.com/flights/SLC-SDF/2020-03-26/2020-03-29?sort=bestflight_a&fs=stops=0