# Crawl with API

In [None]:
# Example request against the historical-observations endpoint for location
# VVNB:9:VN (kept as a comment: a bare URL is not valid Python and made this
# cell fail with a SyntaxError):
# https://api.weather.com/v1/location/VVNB:9:VN/observations/historical.json?apiKey=e1f10a1e78da46f5b10a1e78da96f525&units=m&startDate=20210219
APIKey = "e1f10a1e78da46f5b10a1e78da96f525"

In [None]:
%%writefile DateTimeHandlerInterface.py
class DateTimeHandlerInterface:
    """Interface for date/time helper objects.

    Every method here is a placeholder that returns ``None``; concrete
    handlers are expected to override all three.
    """

    def convert_timestampe(self, timestamp)->str:
        """Placeholder: turn ``timestamp`` into a date-time string."""
        return None

    def get_num_days_of_year(self,year)->int:
        """Placeholder: number of days contained in ``year``."""
        return None

    def get_date_from_day_num(self,year,day_num,str_format)->str:
        """Placeholder: format day number ``day_num`` of ``year`` with ``str_format``."""
        return None
    

Overwriting DateTimeHandlerInterface.py


In [None]:
%%writefile VietNamDateTimeHandler.py
from DateTimeHandlerInterface import DateTimeHandlerInterface
import datetime
from datetime import timezone, timedelta
class VietNamDateTimeHandler(DateTimeHandlerInterface):
    """Date/time helper that renders timestamps at a fixed UTC+7 offset."""

    def convert_timestampe(self, timestamp)->str:
        """Format a POSIX ``timestamp`` as ``YYYY-MM-DD HH:MM:SS`` in UTC+7."""
        utc_plus_seven = timezone(timedelta(hours=7))
        moment = datetime.datetime.fromtimestamp(timestamp, tz=utc_plus_seven)
        return moment.strftime("%Y-%m-%d %H:%M:%S")

    def get_num_days_of_year(self,year)->int:
        """Return how many days ``year`` has (Jan 1 through Dec 31, inclusive)."""
        last_day = datetime.date(year, 12, 31)
        first_day = datetime.date(year, 1, 1)
        # The difference excludes the first day itself, hence the +1.
        return (last_day - first_day).days + 1

    def get_date_from_day_num(self,year,day_num,str_format)->str:
        """Format the ``day_num``-th day of ``year`` (parsed via ``%j``) with ``str_format``."""
        year_and_day = "-".join([str(year), str(day_num)])
        parsed = datetime.datetime.strptime(year_and_day, "%Y-%j")
        return parsed.strftime(str_format)
    

Overwriting VietNamDateTimeHandler.py


In [None]:
%%writefile AbstractHourlyWeatherCrawler.py
from abc import ABC, abstractmethod
import datetime, calendar
from VietNamDateTimeHandler import VietNamDateTimeHandler
class AbstractHourlyWeatherCrawler(ABC):
    """Template for crawling one calendar year of weather.com observations.

    Subclasses provide the location-specific URL (``get_location_url``) and
    the per-day fetch (``get_hourly_data``); this base class iterates over
    every day of ``year`` and concatenates the per-day lists.
    """

    # Fallback key, used when neither the ``api_key`` argument nor the
    # WEATHER_API_KEY environment variable is set, so existing callers keep
    # working unchanged.
    DEFAULT_API_KEY = "e1f10a1e78da46f5b10a1e78da96f525"

    def __init__(self,year,api_key=None):
        """Set up the crawler for ``year``.

        Args:
            year: calendar year to crawl.
            api_key: optional API key. When omitted, the WEATHER_API_KEY
                environment variable is used, then ``DEFAULT_API_KEY``.
        """
        import os  # local import: only needed for the environment lookup

        self.year=year
        self.APIKey = api_key or os.environ.get("WEATHER_API_KEY") or self.DEFAULT_API_KEY
        # Only apiKey is substituted here; {location} and {date} are re-inserted
        # as literal placeholders so later .format() calls can fill them in.
        self.base_url = "https://api.weather.com/v1/location/{location}/observations/historical.json?apiKey={apiKey}&units=m&startDate={date}"\
        .format(location="{location}",apiKey=self.APIKey,date="{date}")
        # Cache of the crawled observations; a list, matching get_year_data().
        self.year_data=[]
        self.date_time_handler = VietNamDateTimeHandler()

    def get_year_data(self)->list:
        """Return the observations of the whole year, crawling on first use.

        The result is cached in ``self.year_data``. The cache is only filled
        once every day has been fetched, so an exception part-way through
        leaves it empty instead of holding a partial year.

        Returns:
            The concatenation of ``get_hourly_data`` results for day numbers
            1 through the number of days in ``self.year``.
        """
        if not self.year_data:
            location_url = self.get_location_url()
            n_days = self.date_time_handler.get_num_days_of_year(self.year)
            collected = []
            for day_num in range(1,n_days+1):
                collected.extend(self.get_hourly_data(location_url,day_num))
            self.year_data = collected
        return self.year_data

    @abstractmethod
    def get_location_url(self)->str:
        """Return the request URL for the subclass's location."""
        pass

    @abstractmethod
    def get_hourly_data(self,location_url,day)->list:
        """Return the list of observations for day number ``day`` of the year."""
        pass

Overwriting AbstractHourlyWeatherCrawler.py


In [None]:
%%writefile NoiBaiHourlyWeatherCrawler.py
from AbstractHourlyWeatherCrawler import AbstractHourlyWeatherCrawler
from bs4 import BeautifulSoup
import requests
import json
class NoiBaiHourlyWeatherCrawler(AbstractHourlyWeatherCrawler):
    """Crawler for location ``VVNB:9:VN`` that maps readings onto a half-hourly grid."""

    # Seconds to wait on the API before a request is abandoned; without a
    # timeout a stalled connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 60

    def __init__(self,year):
        """Set up the crawler for ``year`` and precompute the daily time slots."""
        super().__init__(year)
        self.time_range=self.get_time_observation_range()

    def get_time_observation_range(self)->list:
        """Return the 48 ``HH:MM:00`` slots of a day (minutes 00 and 30 of each hour)."""
        return [
            f"{hour:02d}:{minute:s}:00"
            for hour in range(24)
            for minute in ("00", "30")
        ]

    def index_time(self,day,time_range)->dict:
        """Map ``"<day> <slot>"`` to the slot's position in ``time_range``."""
        return {f"{day:s} {_time:s}": idx for idx, _time in enumerate(time_range)}

    def get_location_url(self)->str:
        """Return the base URL with the location filled in and ``{date}`` left open."""
        return self.base_url.format(location="VVNB:9:VN",date="{date}")

    def init_observation(self,time_idx):
        """Build one empty record per slot, with only its ``time`` filled in."""
        observation = [{"time":None, "temp":None,"humidity":None} for _ in range(len(time_idx))]
        for _time, idx in time_idx.items():
            observation[idx]["time"] = _time
        return observation

    def get_hourly_data(self,location_url,day_num)->list:
        """Fetch one day and return its records on the half-hourly grid.

        Args:
            location_url: URL containing a ``{date}`` placeholder.
            day_num: day number within ``self.year``.

        Returns:
            One dict per slot of ``self.time_range`` with keys ``time``,
            ``temp`` and ``humidity``. Slots without a matching reading keep
            ``None`` for ``temp`` and ``humidity``.

        Request failures and a non-JSON response body propagate as exceptions.
        A malformed payload is reported with a printed message and the records
        gathered so far are returned.
        """
        str_key_date = self.date_time_handler.get_date_from_day_num(self.year,day_num,"%Y-%m-%d")

        api_day = self.date_time_handler.get_date_from_day_num(self.year,day_num,"%Y%m%d")
        api_url = location_url.format(date=api_day)
        web = requests.get(api_url, timeout=self.REQUEST_TIMEOUT)
        # The endpoint returns JSON, so decode it directly instead of passing
        # the body through an HTML parser first.
        json_soup = web.json()

        time_idx = self.index_time(str_key_date,self.time_range)
        observations=self.init_observation(time_idx)
        try :
            for ele in json_soup["observations"]:
                date_time = self.date_time_handler.convert_timestampe(ele["valid_time_gmt"])
                try :
                    observations[time_idx[date_time]]["temp"] = ele["temp"]
                    observations[time_idx[date_time]]["humidity"] = ele["rh"]
                except KeyError:
                    # Reading is off the half-hourly grid or lacks a field.
                    print(f"DateTime {date_time:s} is out of scope")
        except (KeyError, TypeError, ValueError, OverflowError, OSError):
            # Payload without a usable "observations" list or with an
            # unusable timestamp. Named exceptions (not a bare except) so that
            # KeyboardInterrupt and SystemExit are no longer swallowed.
            print(f"Observations Error in day: {str_key_date:s}")
        return observations

Overwriting NoiBaiHourlyWeatherCrawler.py


In [None]:
%%writefile main.py
from NoiBaiHourlyWeatherCrawler import NoiBaiHourlyWeatherCrawler
import pandas as pd
import argparse

def save_to_file(weather_data:list,filename:str):
    """Append ``weather_data`` as CSV rows to ``data/<filename>``.

    Args:
        weather_data: list of record dicts, turned into a DataFrame.
        filename: name of the CSV file inside the ``data`` directory.

    Rows are appended without a header row and with the DataFrame index as
    the first column. The target directory is created when missing, so a
    finished crawl is not lost to a missing ``data`` folder.
    """
    import os  # local import: only needed to create the output directory

    _filename = f"data/{filename:s}"
    os.makedirs(os.path.dirname(_filename), exist_ok=True)
    df = pd.DataFrame.from_dict(weather_data)
    df.to_csv(_filename,mode="a",header=False)


if __name__=="__main__":
    # Usage: python3 main.py <year> <filename>
    print("Parsing Args")
    cli = argparse.ArgumentParser()
    for positional in ("year", "filename"):
        cli.add_argument(positional)
    args = cli.parse_args()

    # Crawl the requested year, then append the result to data/<filename>.
    print("Start crawling weather data in {year}".format(year=args.year))
    crawler = NoiBaiHourlyWeatherCrawler(int(args.year))
    weather_data = crawler.get_year_data()
    print("Done Crawling.\nStart Saving ",args.filename)
    save_to_file(weather_data,args.filename)
    print("Done Saving")
    

Overwriting main.py


In [None]:
# !mkdir data
# !python3 main.py 2005 weather2005.csv
# with open("data/weather.csv","w") as f:
#     f.writelines("time,temp,humidity\n")

In [None]:
# Start data/weather.csv afresh with nothing but the column header row.
header_row = "time,temperature,humidity\n"
with open("data/weather.csv","w") as csv_file:
    csv_file.write(header_row)

In [None]:
%%bash
# Run the crawler once per year from 2007 through 2021, all into the same CSV name.
for ((year = 2007; year <= 2021; year++)); do
   python3 main.py "$year" weather0721.csv
done

Parsing Args
Start crawling weather data in 2007
DateTime 2007-01-08 15:10:00 is out of scope
DateTime 2007-01-24 16:29:00 is out of scope
DateTime 2007-02-14 09:21:00 is out of scope
DateTime 2007-02-21 14:01:00 is out of scope
DateTime 2007-02-21 15:40:00 is out of scope
DateTime 2007-02-21 16:09:00 is out of scope
DateTime 2007-02-28 13:12:00 is out of scope
DateTime 2007-06-10 23:59:00 is out of scope
DateTime 2007-07-14 15:31:00 is out of scope
DateTime 2007-08-05 03:14:00 is out of scope
DateTime 2007-08-07 14:00:00 is out of scope
DateTime 2007-10-01 17:59:00 is out of scope
Done Crawling.
Start Saving  weather0721.csv
Done Saving
Parsing Args
Start crawling weather data in 2008
DateTime 2008-02-04 14:02:00 is out of scope
DateTime 2008-03-30 08:05:00 is out of scope
DateTime 2008-07-17 20:50:00 is out of scope
Done Crawling.
Start Saving  weather0721.csv
Done Saving
Parsing Args
Start crawling weather data in 2009
DateTime 2009-03-05 13:17:00 is out of scope
Done Crawling.
Star

In [None]:
# Switch to the project folder and list the contents of its data directory.
# NOTE(review): absolute path — presumably a mounted Google Drive in Colab; confirm before running elsewhere.
%cd /content/drive/MyDrive/AI/TemperatureForecast/
!ls data
import pandas as pd
# Load the CSV into `df` (used by the following cells) and preview the first ten rows.
df = pd.read_csv("data/temp_weather0721.csv")
df.head(10)

/content/drive/MyDrive/AI/TemperatureForecast
temp_weather1721.csv  weather0721.csv  weather.csv


Unnamed: 0.1,Unnamed: 0,time,temp,humidity
0,0,2007-01-01 00:00:00,18.0,88.0
1,1,2007-01-01 00:30:00,18.0,83.0
2,2,2007-01-01 01:00:00,18.0,88.0
3,3,2007-01-01 01:30:00,18.0,88.0
4,4,2007-01-01 02:00:00,18.0,88.0
5,5,2007-01-01 02:30:00,18.0,88.0
6,6,2007-01-01 03:00:00,18.0,88.0
7,7,2007-01-01 03:30:00,18.0,88.0
8,8,2007-01-01 04:00:00,18.0,83.0
9,9,2007-01-01 04:30:00,18.0,83.0


In [None]:
import numpy as np
# Copy every half-hourly reading of 2015-10-29 onto the same time slot of the
# four following days listed in `days` (presumably days without usable
# readings — confirm against the raw data).
copied_day = "2015-10-29 "
days= ["2015-10-30 ","2015-10-31 ","2015-11-01 ","2015-11-02 "]
time_range=[]
mins = ["00","30"]
for i in range(24):
    # Named `minute` so the builtin `min` is not shadowed in the notebook
    # namespace for every cell that runs afterwards.
    for minute in mins :
        time_range.append(f"{i:02d}:{minute:s}:00")

for tr in time_range:
    for day in days:
        time_to_fill = day+tr
        time_for_fill = copied_day+tr
        print(f"Copying {time_for_fill:s} to {time_to_fill:s}")
        # Indexing [0] raises IndexError when a timestamp is absent from df,
        # so a missing source or target row stops the cell instead of being
        # skipped silently.
        idx = np.flatnonzero(df["time"]==time_to_fill)[0]
        idx2= np.flatnonzero(df["time"]==time_for_fill)[0]
        temp = df.iloc[idx2]["temp"]
        humidity = df.iloc[idx2]["humidity"]
        print(f"At {time_for_fill:s}: temp = {temp:f}, hum = {humidity:f}")
        df.loc[df.time==time_to_fill,"temp"]=temp
        df.loc[df.time==time_to_fill,"humidity"]=humidity
        print(f"Check after copy. Time {time_to_fill:s}: ",end="")
        print(df.loc[df.time==time_to_fill,"temp"])

# Spot-check one slot on the source day and on three of the filled days.
print("===============================================")
print("After filled")
print(df.loc[df["time"]=="2015-10-29 17:30:00"]["temp"])
print(df.loc[df["time"]=="2015-10-30 17:30:00"]["temp"])
print(df.loc[df["time"]=="2015-11-01 17:30:00"]["temp"])
print(df.loc[df["time"]=="2015-11-02 17:30:00"]["temp"])

Copying 2015-10-29 00:00:00 to 2015-10-30 00:00:00
At 2015-10-29 00:00:00: temp = 26.000000, hum = 83.000000
Check after copy. Time 2015-10-30 00:00:00: 14496    26.0
Name: temp, dtype: float64
Copying 2015-10-29 00:00:00 to 2015-10-31 00:00:00
At 2015-10-29 00:00:00: temp = 26.000000, hum = 83.000000
Check after copy. Time 2015-10-31 00:00:00: 14544    26.0
Name: temp, dtype: float64
Copying 2015-10-29 00:00:00 to 2015-11-01 00:00:00
At 2015-10-29 00:00:00: temp = 26.000000, hum = 83.000000
Check after copy. Time 2015-11-01 00:00:00: 14592    26.0
Name: temp, dtype: float64
Copying 2015-10-29 00:00:00 to 2015-11-02 00:00:00
At 2015-10-29 00:00:00: temp = 26.000000, hum = 83.000000
Check after copy. Time 2015-11-02 00:00:00: 14640    26.0
Name: temp, dtype: float64
Copying 2015-10-29 00:30:00 to 2015-10-30 00:30:00
At 2015-10-29 00:30:00: temp = 26.000000, hum = 83.000000
Check after copy. Time 2015-10-30 00:30:00: 14497    26.0
Name: temp, dtype: float64
Copying 2015-10-29 00:30:00 to

In [None]:
# Non-null value count per column; a count below that of `time` means the column has missing values.
df.count()

Unnamed: 0    262992
time          262992
temp          259881
humidity      258941
dtype: int64

In [None]:
# Forward-fill: each missing value takes the last valid value above it in the
# same column. DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df = df.ffill()

# The temperature at "2008-02-10 10:00:00" was unusually high, so it is
# replaced with the temperature of the previous hour (11).
df.loc[df.time=="2008-02-10 10:00:00","temp"]=11

In [None]:
# Is any temperature still missing? The column in this frame is named "temp";
# indexing "temperature" raised KeyError. Printed because a bare expression in
# the middle of a cell is discarded.
print(df["temp"].isnull().values.any())
# Rows that still contain a NaN in any column, with their per-column counts.
why_nan = df[df.isna().any(axis=1)]
print(why_nan.count())
why_nan.head(10)

In [None]:
# Write the filled frame to disk.
# NOTE(review): to_csv writes the index as an extra leading column by default — confirm that is wanted.
df.to_csv("data/filled_weather0721.csv")