In [16]:
import json
import os
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [17]:
# Airport codes of the destinations to enrich with airport and weather data.
destinations = {
    'LWO',
    'GVA',
    'LAX',
    'ZAD',
    'SOF',
    'PEK',
    'DEL',
    'ARN',
    'MAD',
    'KUL',
}
# A set of strings has no stable iteration order (string hashing is randomised
# per interpreter session), so sort the codes to get the same row order -- and
# therefore the same CSV -- on every Restart & Run All.
dst = pd.DataFrame({'airport_code': sorted(destinations)})


In [18]:
# Add airport info

# Column name -> positional index of that field in the header-less airports file.
column_mapping = dict(
    city=2,
    country=3,
    airport_code=4,
    lat=6,
    lng=7,
)
# Positional index -> column name, which is the direction rename() needs.
revers_mapping = dict(zip(column_mapping.values(), column_mapping.keys()))

# Read the raw file (no header row), label the fields of interest and keep
# only those, in the order they are listed in column_mapping.
airports_raw = pd.read_csv('./content/airports.dat', header=None)
airports = airports_raw.rename(columns=revers_mapping)[list(column_mapping)]
       
        

In [19]:
# Index both frames by airport code, then pull the airport details onto the
# destination list (join defaults to a left join, so every destination is kept).
dst_by_code = dst.set_index('airport_code')
airports_by_code = airports.set_index('airport_code')
dest = dst_by_code.join(airports_by_code, on='airport_code')

# Snapshot of the destinations with airport info attached.
dest.to_csv('out.csv')

In [21]:
# scraper
def get_weather(code, city, country, timeout=30):
    """Fetch the weather-averages.co.uk search result page for one airport.

    The search term defaults to ``"<city>, <country>"``; a few airport codes
    use a hand-picked search term instead.

    Args:
        code: Airport code, used to look up a hand-picked search term.
        city: City name used to build the default search term.
        country: Country name used to build the default search term.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The body of the HTTP response as text.

    Raises:
        requests.exceptions.Timeout: The server did not answer within
            ``timeout`` seconds.
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status.
    """
    url = "https://weather-averages.co.uk/parse_search_input"
    # Search terms that replace the default "<city>, <country>" for some codes.
    # NOTE(review): presumably the site has no match for the default term of
    # these airports -- confirm. The spelling 'UKraine' is kept as-is because
    # it is part of the request that is sent.
    mapping = {
        'LAX': 'Los Angeles, CA, USA',
        'ZAD': 'Split, Croatia',
        'LWO': 'Kyiv, UKraine'
    }
    search = mapping.get(code, f'{city}, {country}')

    payload = urllib.parse.urlencode({'city_search': search})
    # Browser-like headers sent along with the form post.
    headers = {
        'Connection': "keep-alive",
        'Pragma': "no-cache",
        'Cache-Control': "no-cache",
        'Origin': "https://weather-averages.co.uk",
        'Upgrade-Insecure-Requests': "1",
        'Content-Type': "application/x-www-form-urlencoded",
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
        'Sec-Fetch-Mode': "navigate",
        'Sec-Fetch-User': "?1",
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        'Sec-Fetch-Site': "same-origin",
        'Referer': "https://weather-averages.co.uk/compare-climate/warsaw%2C-poland/warsaw%2C-poland",
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "pl-PL,pl;q=0.9,en-US;q=0.8,en;q=0.7",
        'Cookie': "_ga=GA1.3.1670207595.1568462657; _gid=GA1.3.1235958444.1568462657; _gat=1",
        'cache-control': "no-cache",
        }

    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.post(url, data=payload, headers=headers, timeout=timeout)
    # Fail here on an HTTP error instead of handing an error page to the parser.
    response.raise_for_status()

    return response.text

In [22]:
# Airport code -> list of temperatures parsed from the scraped table.
temps = {}

def get_valid_weather(code, city, country, parser=None):
    """Scrape one airport's weather page and locate its table body.

    Args:
        code: Airport code, forwarded to ``get_weather``.
        city: City name, forwarded to ``get_weather``.
        country: Country name, forwarded to ``get_weather``.
        parser: Optional BeautifulSoup parser name (e.g. ``'html.parser'``).
            ``None`` keeps BeautifulSoup's own parser selection.

    Returns:
        A ``(html, t_body)`` tuple: the raw page text and the first
        ``<tbody>`` element found in it.

    Raises:
        ValueError: The page has no ``<tbody>``, or the ``<tbody>`` has no
            ``<th>`` naming the location that was matched.
    """
    print(f'scraping: {city}, {country}')
    html = get_weather(code, city, country)

    soup = BeautifulSoup(html, parser)

    # Look the table body up once and check it before use, so an unexpected
    # page produces a clear error rather than an AttributeError on None.
    t_body = soup.select_one('tbody')
    heading = t_body.select_one('th') if t_body is not None else None
    if heading is None:
        raise ValueError(
            f'no weather table found for {code} ({city}, {country})'
        )
    # The first header cell shows which location the site actually matched.
    print(f"found {heading.text}")

    return html, t_body

# scrape weather
# Directory for the raw page dumps; create it so the loop also works on a
# fresh checkout where it does not exist yet.
searches_dir = './content_enriched/weather/searches'
os.makedirs(searches_dir, exist_ok=True)

# dest is indexed by airport code, so iterrows() yields (code, row) pairs.
for code, row in dest.iterrows():
    city, country = row['city'], row['country']

    html, t_body = get_valid_weather(code, city, country)
    # Keep the raw page (as a JSON-encoded string) next to the parsed values.
    with open(f'{searches_dir}/{code}.json', 'w') as f:
        f.write(json.dumps(html))

    # Every <td> in the table body holds one temperature such as "12 °C";
    # strip the unit and convert to float.
    airport_temps = [
        float(value.text.replace('°C', '').strip()) for value in t_body.find_all('td')
    ]
    temps[code] = airport_temps

scraping: Madrid, Spain
found Madrid, Spain
scraping: Zadar, Croatia
found Split, Croatia
scraping: Geneva, Switzerland
found Geneva, Switzerland
scraping: Stockholm, Sweden
found Stockholm, Sweden
scraping: Sofia, Bulgaria
found Sofia, Bulgaria
scraping: Beijing, China
found Beijing, China
scraping: Kuala Lumpur, Malaysia
found Kuala Lumpur, Malaysia
scraping: Lvov, Ukraine
found Kyiv, UKraine
scraping: Delhi, India
found New Delhi, India
scraping: Los Angeles, United States
found Los Angeles, CA, USA


In [30]:
# Positional column index -> output column name (temp_month_0 .. temp_month_11).
month_columns = {month: f'temp_month_{month}' for month in range(12)}

# temps maps airport code -> list of values; after transposing there is one
# row per airport code and one column per list position.
pd_temps = pd.DataFrame(temps).T.rename(columns=month_columns)

# Attach the temperatures to the destinations and overwrite the CSV.
dest.join(pd_temps).to_csv('out.csv')