# Collect Hengchun Weather Data

Data Sourece: https://www.cwb.gov.tw/V8/E/W/OBS_Station.html?ID=46759

In [1]:
# importing modules
from bs4 import BeautifulSoup
from urllib.request import urlopen
import numpy as np
import pandas as pd
from datetime import datetime

# Collect Data

In [2]:
# Analysis CSS
url = "https://www.cwb.gov.tw/V8/E/W/Observe/MOD/24hr/46759.html?T=698"
html = urlopen(url).read()
soup = BeautifulSoup(html, "lxml")

# Add Year

In [3]:
#check time 
time_log = soup.find_all('th')
data_time = [i.get_text() for i in time_log]
print(data_time[:5])#show head 5 data  

['10/04 14:10', '10/04 14:00', '10/04 13:50', '10/04 13:40', '10/04 13:30']


Lack of year.

In [4]:
# Add year
now_year = str(datetime.now().year) + '/'
data_time_year = [now_year + i for i in data_time]
data = pd.DataFrame({'time':data_time_year})
data['time'] = pd.to_datetime(data['time'])
print(data.head())#show head 5 data  

                 time
0 2021-10-04 14:10:00
1 2021-10-04 14:00:00
2 2021-10-04 13:50:00
3 2021-10-04 13:40:00
4 2021-10-04 13:30:00


In [5]:
soup_td = soup.find_all('td')
headers_set = set([i['headers'][0] for i in soup_td])
headers_list = list(headers_set)
headers_list.sort()
print(headers_list)#show headers


['hum', 'pre', 'rain', 'sunlight', 'temp', 'visible-1', 'w-1', 'w-2', 'w-3', 'weather']


In [6]:
for i in headers_list:
    data_value = soup.find_all('td', headers=i)
    data[i] = [j.get_text() for j in data_value]

print(data.head())#show data head

                 time hum     pre rain sunlight    temp visible-1  w-1   w-2  \
0 2021-10-04 14:10:00  65  1011.7  0.0      6.4  30.587       >30   NE  4.83   
1 2021-10-04 14:00:00  65  1012.0  0.0      6.3  30.787     21-30   NE  4.33   
2 2021-10-04 13:50:00  65  1012.2  0.0      6.3  30.687       >30   NE  3.53   
3 2021-10-04 13:40:00  66  1012.5  0.0      6.3  30.387       >30   NE  4.13   
4 2021-10-04 13:30:00  63  1012.8  0.0      6.3  30.487       >30  ENE  4.73   

     w-3 weather  
0  10.15          
1  12.06          
2  12.06          
3  11.56          
4  11.56          


# Change Weather Image to Text

In [7]:
# look weather image data
weather_data = soup.find_all('td', headers='weather')
print(weather_data[0])#analysis weather

<td class="is_show" headers="weather"><img alt="CLOUDY" src="/V8/assets/img/weather_icons/weathers/svg_icon/day/07.svg" title="CLOUDY"/></td>


In [8]:
# The target is img alt=?.
data['weather'] = [i.img['alt'] for i in weather_data]
print(data.tail())

                   time hum     pre rain sunlight    temp visible-1  w-1  \
145 2021-10-03 14:00:00  64  1012.2  0.0      7.7  30.988     21-30    E   
146 2021-10-03 13:50:00  61  1012.3  0.0      7.6  31.388       >30    E   
147 2021-10-03 13:40:00  60  1012.4  0.0      7.4  31.088       >30    E   
148 2021-10-03 13:30:00  61  1012.5  0.0      7.2  31.489       >30  ENE   
149 2021-10-03 13:20:00  56  1012.5  0.0      7.1  31.388       >30  ENE   

      w-2    w-3 weather  
145  4.83  11.86   CLEAR  
146  4.23  10.65   CLEAR  
147  4.53   9.75   CLEAR  
148  3.93   9.75   CLEAR  
149  3.93   9.75   CLEAR  


# Save Data

In [9]:
df = pd.read_csv('data\CWB_Hengchun.csv', encoding='utf-8-sig', index_col=0 ,parse_dates=['time'])
new_df = df.append(data, ignore_index=True)
new_df.sort_values(by=['time'], inplace=True, ascending=True, key=pd.to_datetime)
new_df.drop_duplicates(inplace=True)
print(new_df.tail())
new_df.to_csv('data\CWB_Hengchun.csv', encoding='utf-8-sig')

                    time hum     pre rain sunlight    temp visible-1  w-1  \
1323 2021-10-04 13:30:00  63  1012.8  0.0      6.3  30.487       >30  ENE   
1353 2021-10-04 13:40:00  66  1012.5  0.0      6.3  30.387       >30   NE   
1352 2021-10-04 13:50:00  65  1012.2  0.0      6.3  30.687       >30   NE   
1346 2021-10-04 14:00:00  65  1012.0  0.0      6.3  30.787     21-30   NE   
1348 2021-10-04 14:10:00  65  1011.7  0.0      6.4  30.587       >30   NE   

       w-2    w-3        weather  
1323  4.73  11.56  PARTLY CLOUDY  
1353  4.13  11.56         CLOUDY  
1352  3.53  12.06         CLOUDY  
1346  4.33  12.06  PARTLY CLOUDY  
1348  4.83  10.15         CLOUDY  
