This script uses Selenium and Python to scrape historical weather data from the Freemeteo website (freemeteo.gr).

In [None]:
import time
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import pandas as pd
import numpy as np
import glob 
from selenium.webdriver.common.action_chains import ActionChains
from joblib import Parallel, delayed

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# First calendar day to scrape and the number of consecutive days to fetch.
start_date = datetime.date(2018, 6, 1)
number_of_days = 1125

# ISO-formatted dates (YYYY-MM-DD); each one is placed in the `date` query
# parameter of the page URL below.
date_list = [
    (start_date + datetime.timedelta(days=day)).isoformat()
    for day in range(number_of_days)
]

# Column-wise accumulators: entries at the same position in `datetimes`,
# `temperature`, `wind` and `humidity` are meant to form one observation row.
datetimes = []
temperature = []
wind = []
pressure = []      # only used by the disabled pressure scraping below
humidity = []
description = []   # only used by the disabled description scraping below
errors = []        # URLs of pages after which the column lengths disagreed

browser = webdriver.Chrome()
try:
    # we need to provide a link for the city we want
    for x in date_list:
        browser.get('https://freemeteo.gr/kairos/athina/istoriko/imerisio-istoriko/?gid=264371&date='+x+'&station=5267&language=greek&country=greece')
        time.sleep(1)
        # Wait up to 5 seconds for the `a.cal` element; its text is used as
        # the date label of the page.
        date = WebDriverWait(browser, 5).until(
            EC.presence_of_element_located((By.XPATH, "//a[@class='cal']"))).text

        # `find_elements_by_xpath` was removed in Selenium 4.3; the
        # `find_elements(By.XPATH, ...)` form works on Selenium 3 and 4.
        time_list = browser.find_elements(By.XPATH, "//td[@class='white no-top']")
        if not time_list:
            # No observation rows were found: add one placeholder per column
            # so the lists stay the same length.
            datetimes.append(date)
            temperature.append("")
            wind.append("")
#             pressure.append("")
            humidity.append("")
#             description.append("")
        else:
            # Keep the part of the date label after the first comma and
            # append each row's time text to it.
            day_label = date.split(',')[1]
            datetimes.extend(day_label + " " + cell.text for cell in time_list)

            temperature_list = browser.find_elements(By.XPATH, "//td/b")
            temperature.extend(cell.text for cell in temperature_list)

            # Every third `td.nw` cell (offset 0) is stored as the wind value.
            wind_pressure_list = browser.find_elements(By.XPATH, "//td[@class='nw']")
            wind.extend(cell.text for cell in wind_pressure_list[::3])
#             pressure.extend(cell.text for cell in wind_pressure_list[2::3])

            # Of the cells following a `td.nw` cell, every sixth one
            # (offset 1) is stored as the humidity value.
            humidity_desc_list = browser.find_elements(
                By.XPATH, "//td[@class='nw']/following-sibling::td")
            humidity.extend(cell.text for cell in humidity_desc_list[1::6])

#             desc_list = browser.find_elements(By.XPATH, "//td[@class='tl']")
#             description.extend(cell.text for cell in desc_list)

            # Sanity check: all collected columns must have the same length.
            if not (len(datetimes) == len(temperature) == len(wind) == len(humidity)):
                errors.append(browser.current_url)
                print('error')
        print(x)
finally:
    # Always close the Chrome session, even if a page load or wait fails.
    browser.quit()

In [None]:
# Combine the four scraped lists row-wise into one table; zip() stops at the
# shortest list, so any surplus entries in longer lists are dropped.
df = pd.DataFrame(
    data=list(zip(datetimes, temperature, wind, humidity)),
    columns=['Date', 'Temperature', 'Wind', 'Humidity'],
)

In [None]:
# df.to_excel('weather_data_Athens.xlsx',header=True,index=False)

# Weather Data to Timeseries

After scraping the weather data for Athens and Thessaloniki, we can transform it into a time-series DataFrame with numeric values.

In [None]:
# Load the scraped observations and show how many rows were read.
data = pd.read_excel(io='weather_data_Athens.xlsx')
data.shape[0]

In [None]:
# Keep only the first row for every distinct value in the 'Date' column.
data = data.drop_duplicates(subset='Date', keep='first')
data

In [None]:
def fix_hum(df_row): # keep only the number from hum
    """Return the integer that precedes the first '%' in a humidity value.

    The value is converted to ``str`` first; if it contains no '%', the
    whole text is parsed. Raises ``ValueError`` when that text is not an
    integer literal.
    """
    number_part, _, _ = str(df_row).partition('%')
    return int(number_part)

In [None]:
def fix_date_first(df_row): # fixes date column format
    """Turn the first field of a row into a 'day-month-year-hour' string.

    The first field is converted to ``str``, stripped, runs of spaces are
    collapsed, everything from the first ':' onward is dropped, the
    remaining spaces become '-', and Greek genitive month names are
    replaced by their two-digit month numbers.

    Parameters
    ----------
    df_row : pandas.Series or sequence
        A row whose first element holds the scraped date text.

    Returns
    -------
    str
        For example ' 1 Ιουνίου 2018 00:50' becomes '1-06-2018-00'.
    """
    greek_months = (
        ('Ιανουαρίου', '01'),
        ('Φεβρουαρίου', '02'),
        ('Μαρτίου', '03'),
        ('Απριλίου', '04'),
        ('Μαΐου', '05'),
        ('Ιουνίου', '06'),
        ('Ιουλίου', '07'),
        ('Αυγούστου', '08'),
        ('Σεπτεμβρίου', '09'),
        ('Οκτωβρίου', '10'),
        ('Νοεμβρίου', '11'),
        ('Δεκεμβρίου', '12'),
    )
    # `row[0]` on a label-indexed Series is deprecated positional access in
    # pandas; use `.iloc` when it exists and plain indexing for lists/tuples.
    first_field = df_row.iloc[0] if hasattr(df_row, 'iloc') else df_row[0]
    text = str(first_field).strip()
    # Collapse repeated spaces so each gap becomes exactly one separator.
    text = ' '.join(part for part in text.split(' ') if part)
    # Drop the minutes (everything after the first ':') and join with dashes.
    text = text.split(':')[0].replace(' ', '-')
    for greek_name, month_number in greek_months:
        text = text.replace(greek_name, month_number)
    return text

# Rewrite the 'Date' column row by row into the 'day-month-year-hour' text form.
data['Date'] = data.apply(fix_date_first, axis='columns')


def fix_temp(df_row): # convert temperature column to numeric
    """Return the number that precedes the first '°C' as a float.

    The value is converted to ``str`` first; if it contains no '°C', the
    whole text is parsed. Raises ``ValueError`` when that text is not a
    valid float literal.
    """
    number_part, _, _ = str(df_row).partition('°C')
    return float(number_part)

def fix_wind(df_row): # same for wind
    """Convert a wind description to a number.

    Returns 0 for the label 'Νηνεμία' (Greek for "calm"). Otherwise takes
    the text before the first 'Bf', splits it on single spaces and returns
    the second-to-last piece as a float.
    """
    if df_row != 'Νηνεμία':
        before_unit = df_row.split('Bf')[0]
        pieces = before_unit.split(' ')
        return float(pieces[-2])
    return 0


def fix_date(df_row):
    """Parse the row's 'Date' entry ('day-month-year-hour') into a datetime.

    Raises ``ValueError`` if the text does not match '%d-%m-%Y-%H'.
    """
    return datetime.datetime.strptime(df_row['Date'], '%d-%m-%Y-%H')

# Parse every row's date text into a datetime, then order the rows by it.
data['Date'] = data.apply(fix_date, axis='columns')
data = data.sort_values('Date', ascending=True)
data

In [None]:
# Convert each scraped text column to numbers with its dedicated parser.
for column, converter in (
    ('Temperature', fix_temp),
    ('Wind', fix_wind),
    ('Humidity', fix_hum),
):
    data[column] = data[column].apply(converter)
data

In [None]:
# group data to hourly intervals: average all rows that share a timestamp,
# then move the timestamps from the index back into a trailing 'Date' column.
hourly_columns = ['Date', 'Temperature', 'Wind', 'Humidity']
numeric = (
    data[hourly_columns]
    .groupby('Date')
    .mean()
    .assign(Date=lambda frame: frame.index)
    .reset_index(drop=True)
)
numeric

In [None]:
# Work on an independent copy so the hourly averages in `numeric` stay untouched.
data = numeric.copy(deep=True)

In [None]:
# Index by timestamp so the frame can be resampled onto a regular hourly grid.
data = data.set_index(pd.DatetimeIndex(data.Date))
# Resampler.pad() was removed in pandas 2.0; ffill() is the same forward
# fill: each missing hour takes the values of the last earlier observation.
data = data.resample('H').ffill()
# The forward fill also copied old timestamps into 'Date'; restore the real
# hourly timestamps from the new index.
data.Date = data.index.values
data = data.ffill(axis=0)
data = data.reset_index(drop=True)
data

In [None]:
# Save the hourly time series with a header row and without the row index.
data.to_excel(excel_writer='weather_timeseries_Athens.xlsx', index=False, header=True)

# Find Missing Dates

This part identifies any dates that are missing from the weather data we scraped.

In [None]:
# Reload the raw scraped observations for the missing-date check.
data = pd.read_excel(io='weather_data_Athens.xlsx')

In [None]:
def fix_date(df_row):
    """Turn the first field of a row into a 'day-month-year-hour' string.

    The first field is converted to ``str``, stripped, runs of spaces are
    collapsed, everything from the first ':' onward is dropped, the
    remaining spaces become '-', and Greek genitive month names are
    replaced by their two-digit month numbers.

    Parameters
    ----------
    df_row : pandas.Series or sequence
        A row whose first element holds the scraped date text.

    Returns
    -------
    str
        For example ' 1 Ιουνίου 2018 00:50' becomes '1-06-2018-00'.
    """
    greek_months = (
        ('Ιανουαρίου', '01'),
        ('Φεβρουαρίου', '02'),
        ('Μαρτίου', '03'),
        ('Απριλίου', '04'),
        ('Μαΐου', '05'),
        ('Ιουνίου', '06'),
        ('Ιουλίου', '07'),
        ('Αυγούστου', '08'),
        ('Σεπτεμβρίου', '09'),
        ('Οκτωβρίου', '10'),
        ('Νοεμβρίου', '11'),
        ('Δεκεμβρίου', '12'),
    )
    # `row[0]` on a label-indexed Series is deprecated positional access in
    # pandas; use `.iloc` when it exists and plain indexing for lists/tuples.
    first_field = df_row.iloc[0] if hasattr(df_row, 'iloc') else df_row[0]
    text = str(first_field).strip()
    # Collapse repeated spaces so each gap becomes exactly one separator.
    text = ' '.join(part for part in text.split(' ') if part)
    # Drop the minutes (everything after the first ':') and join with dashes.
    text = text.split(':')[0].replace(' ', '-')
    for greek_name, month_number in greek_months:
        text = text.replace(greek_name, month_number)
    return text

# Rewrite the 'Date' column row by row into the 'day-month-year-hour' text form.
data['Date'] = data.apply(fix_date, axis='columns')

In [None]:
import datetime

def fix_date(df_row):
    """Parse a 'day-month-year-hour' string into a ``datetime.datetime``.

    Parameters
    ----------
    df_row : str
        A single 'Date' cell such as '1-06-2018-00'.

    Raises
    ------
    ValueError
        If the text does not match the format '%d-%m-%Y-%H'.
    """
    return datetime.datetime.strptime(df_row, '%d-%m-%Y-%H')

# Parse each date string into a datetime, then order the rows chronologically.
data['Date'] = data.Date.apply(fix_date)
data = data.sort_values('Date', ascending=True)
data

In [None]:
# Every hourly timestamp expected between the first and last scraped day.
expected_hours = pd.date_range(start='2018-06-01', end='2021-06-30', freq='H')
# Compare against the timestamps parsed in this section (`data`), so the
# check does not depend on `numeric` from the time-series section existing.
test = expected_hours.difference(data.Date).tolist()
len(test)