# Collect Data About Heng Chun Weather

A web crawler to collect data from CWB 

collect Heng Chun weather

https://www.cwb.gov.tw/V8/E/W/OBS_Station.html?ID=46759

In [1]:
# importing modules

from bs4 import BeautifulSoup
from urllib.request import urlopen
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
# Collect data
# Analysis CSS
url = "https://www.cwb.gov.tw/V8/E/W/Observe/MOD/24hr/46759.html?T=698"
html = urlopen(url).read()
soup = BeautifulSoup(html, "lxml")
print(soup.prettify())#show html


<html>
 <body>
  <tr data-countyid="10013" data-cstname="恆春" data-estname="Hengchun">
   <th class="is_show" headers="time" scope="row">
    05/12
    <br class="visible-md"/>
    14:20
   </th>
   <td class="is_show" headers="temp">
    <span class="tem-C is-active">
     32.2
    </span>
    <span class="tem-F is-hidden">
     90
    </span>
   </td>
   <td class="is_show" headers="weather">
    <img alt="CLEAR" src="/V8/assets/img/weather_icons/weathers/svg_icon/day/01.svg" title="CLEAR"/>
   </td>
   <td headers="w-1">
    <span class="wind">
     WNW
    </span>
   </td>
   <td headers="w-2">
    <span class="wind_2 is-active">
     2.1
    </span>
    <span class="wind_1 is-hidden">
     2
    </span>
   </td>
   <td headers="w-3">
    <span class="wind_2 is-active">
     -
    </span>
    <span class="wind_1 is-hidden">
     -
    </span>
   </td>
   <td headers="visible-1">
    &gt;30
   </td>
   <td headers="hum">
    55
   </td>
   <td headers="pre">
    1007.8
   </td>
   <t

In [3]:
#check time 
time_log = soup.find_all('th')
data_time = [i.get_text() for i in time_log]
print(data_time[:5])#show head 5 data  

['05/12 14:20', '05/12 14:10', '05/12 14:00', '05/12 13:50', '05/12 13:40']


In [4]:
# lack of time year
# Add year
now_year = str(datetime.now().year) + '/'
data_time_year = [now_year + i for i in data_time]
data = pd.DataFrame({'time':data_time_year})
print(data.head())#show head 5 data  

               time
0  2021/05/12 14:20
1  2021/05/12 14:10
2  2021/05/12 14:00
3  2021/05/12 13:50
4  2021/05/12 13:40


In [5]:
soup_td = soup.find_all('td')
headers_set = set([i['headers'][0] for i in soup_td])
headers_list = list(headers_set)
headers_list.sort()
print(headers_list)#show headers


['hum', 'pre', 'rain', 'sunlight', 'temp', 'visible-1', 'w-1', 'w-2', 'w-3', 'weather']


In [6]:
for i in headers_list:
    data_value = soup.find_all('td', headers=i)
    data[i] = [j.get_text() for j in data_value]

print(data.head())#show data head

               time hum     pre rain sunlight    temp visible-1  w-1   w-2  \
0  2021/05/12 14:20  55  1007.8  0.0      6.5  32.290       >30  WNW  2.12   
1  2021/05/12 14:10  55  1007.9  0.0      6.4  32.490       >30    W  2.52   
2  2021/05/12 14:00  54  1007.9  0.0      6.2  32.691     21-30    W  2.32   
3  2021/05/12 13:50  52  1007.9  0.0      6.0  33.292       >30    W  3.32   
4  2021/05/12 13:40  52  1008.1  0.0      5.9  32.891       >30    W  2.42   

    w-3 weather  
0    --          
1    --          
2  7.84          
3    --          
4  7.84          


In [7]:
# lose weather data
# look weather data
weather_data = soup.find_all('td', headers='weather')
print(weather_data[0])#analysis weather

<td class="is_show" headers="weather"><img alt="CLEAR" src="/V8/assets/img/weather_icons/weathers/svg_icon/day/01.svg" title="CLEAR"/></td>


In [8]:
# The target is img alt=?.
data['weather'] = [i.img['alt'] for i in weather_data]
print(data)

                 time hum     pre rain sunlight    temp visible-1  w-1   w-2  \
0    2021/05/12 14:20  55  1007.8  0.0      6.5  32.290       >30  WNW  2.12   
1    2021/05/12 14:10  55  1007.9  0.0      6.4  32.490       >30    W  2.52   
2    2021/05/12 14:00  54  1007.9  0.0      6.2  32.691     21-30    W  2.32   
3    2021/05/12 13:50  52  1007.9  0.0      6.0  33.292       >30    W  3.32   
4    2021/05/12 13:40  52  1008.1  0.0      5.9  32.891       >30    W  2.42   
..                ...  ..     ...  ...      ...     ...       ...  ...   ...   
145  2021/05/11 14:10  67  1008.3  0.0      5.6  31.789       >30    W  2.02   
146  2021/05/11 14:00  66  1008.4  0.0      5.5  32.390     21-30   SW  2.22   
147  2021/05/11 13:50  68  1008.5  0.0      5.4  31.589       >30  WSW  2.02   
148  2021/05/11 13:40  64  1008.6  0.0      5.3  31.789       >30  WSW  2.42   
149  2021/05/11 13:30  65  1008.9  0.0      5.1  31.989       >30  WSW  2.22   

      w-3 weather  
0      --   CLEAR  

In [9]:
data.to_csv('data\\CWB_Heng_Chun .csv', encoding='utf-8-sig')