# Web-scrapping to get weather information 2019-2020

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta

In [2]:
url_template = 'https://tenki.jp/past/{}/{:02d}/{:02d}/weather/3/23/'

In [3]:
url = url_template.format(2019, 1, 1) #URL to get the data on 2019/1/1
html_doc = requests.get(url).text

In [4]:
soup = BeautifulSoup(html_doc)
#soup.prettify()

In [5]:
table = soup.find(lambda tag: tag.name == 'table' and 'past-live-area-pref-list-entries' in tag
                 ['class'])

In [6]:
df = pd.read_html(str(table), na_values=['---'], header=0)[0]

In [7]:
#to check how is going
df 

Unnamed: 0,地点名,天気,天気.1,最高気温,最低気温,日積算降水量(mm),日の出,日の入
0,長野,,晴のち雪,5.6℃,-5.6℃,0.5mm,06:59,16:42
1,松本,,不明,7.0℃,-7.9℃,0.0mm,06:59,16:44
2,諏訪,,不明,5.1℃,-8.5℃,0.0mm,,
3,飯田,,不明,8.1℃,-7.3℃,0.0mm,,
4,軽井沢,,不明,3.7℃,-10.0℃,0.0mm,,


In [8]:
df.drop(df.shape[0] - 1, inplace =True)
df.columns = ['city', 'weather', 'weather1', 'max_temp', 'min_temp', 'rain', 'sunrise', 
              'sunset']
df['date'] = pd.to_datetime('{}/{}/{}'.format(2019, 1, 1))

In [9]:
df #to check how is going

Unnamed: 0,city,weather,weather1,max_temp,min_temp,rain,sunrise,sunset,date
0,長野,,晴のち雪,5.6℃,-5.6℃,0.5mm,06:59,16:42,2019-01-01
1,松本,,不明,7.0℃,-7.9℃,0.0mm,06:59,16:44,2019-01-01
2,諏訪,,不明,5.1℃,-8.5℃,0.0mm,,,2019-01-01
3,飯田,,不明,8.1℃,-7.3℃,0.0mm,,,2019-01-01


In [10]:
#to automation
def get_weather_for_one_day(year, month, day):
    url = url_template.format(year, month, day)
    html_doc = requests.get(url).text
    soup = BeautifulSoup(html_doc)
    table = soup.find(lambda tag: tag.name == 'table' and 'past-live-area-pref-list-entries'
                     in tag ['class'])
    
    try:
        df = pd.read_html(str(table), na_values=['---'], header=0)[0]
    except Exception as e:
        print(e)
        raise ValueError('No data is available for {}/{:02d}/{:02d}'.format(year, month, day))
    df.drop(df.shape[0] - 1, inplace=True)
    df.columns = ['city', 'weather', 'weather1', 'max_temp', 'min_temp', 'rain', 'sunrise', 
              'sunset']
    df['date'] = pd.to_datetime('{}/{}/{}'.format(year, month, day))
    
    return df

In [11]:

#to download  the weather data from a specific date
def get_weather_from_date(year, month, day):
    date = datetime(year, month, day)
    today = datetime.utcnow()
    df = []
    while (today - date).days >= 1:
        try:
            df.append(get_weather_for_one_day(date.year, date.month, date.day))
        except ValueError as e:
            print(e)
        finally:
            date += timedelta (days=1)
    return pd.concat(df)

In [12]:
df = get_weather_from_date(2019, 1, 1)

In [13]:
dfc = df.copy

In [14]:
df

Unnamed: 0,city,weather,weather1,max_temp,min_temp,rain,sunrise,sunset,date
0,長野,,晴のち雪,5.6℃,-5.6℃,0.5mm,06:59,16:42,2019-01-01
1,松本,,不明,7.0℃,-7.9℃,0.0mm,06:59,16:44,2019-01-01
2,諏訪,,不明,5.1℃,-8.5℃,0.0mm,,,2019-01-01
3,飯田,,不明,8.1℃,-7.3℃,0.0mm,,,2019-01-01
0,長野,,雪のち晴,3.6℃,-1.1℃,3.0mm,07:00,16:43,2019-01-02
...,...,...,...,...,...,...,...,...,...
3,飯田,,晴一時雨,29.3℃,16.0℃,0.5mm,,,2020-05-11
0,長野,,晴のち曇,29.2℃,10.5℃,0.0mm,04:43,18:45,2020-05-12
1,松本,,晴のち曇,27.9℃,9.4℃,0.0mm,04:44,18:45,2020-05-12
2,諏訪,,晴のち曇,25.8℃,8.5℃,0.0mm,,,2020-05-12


In [15]:
#To clean the data 
def clean_data(df, city_mapping):
    df = df.copy()
    df.city = df.city.map(city_mapping) 
    df.weather1 = df.weather1.map(weather1_mapping) 
    df = df.fillna('Unknown')
    df = df.drop(['weather'], axis = 1) 
    df = df[df.city == 'Matsumoto']
    df = df.reset_index(drop=True)
    start_date = '2019-01-01'
    end_date = '2020-02-29'
    mask = (df['date'] >= start_date) & (df['date'] <= end_date)
    df = df.loc[mask]
    
    return df


In [16]:
#To translate the main cities in Nagano prefecture
city_mapping = {
    '長野': 'Nagano',
    '松本': 'Matsumoto',
    '諏訪': 'Suwa',
    '飯田': 'Iida'
}

weather1_mapping = {
    '不明': 'unknown', 
    '雪': 'snow', 
    '雪のち雨': 'snow and then rain', 
    '雪一時雨': 'sleet', 
    '雨': 'rain', 
    '晴': 'sunny', 
    '曇のち雨': 'cloudy then rain', 
    '晴のち曇': 'cloudy weather', 
    '曇のち晴': 'cloudy then sunny',
    '晴一時雪': 'fine snow', 
    '雨のち晴': 'rain then sunny', 
    '曇': 'cloudy', 
    '雪のち晴': 'snow then sunny', 
    '雨のち曇': 'cloudy after rain', 
    '曇時々晴': 'partly cloudy', 
    '雪のち曇': 'cloudy after snow', 
    '晴一時雨': 'sunny partly rain',
    '晴のち雨': 'sunny after rain'
}

In [17]:
cleaned_df = clean_data(df, city_mapping)

In [21]:
cleaned_df.head(50)

Unnamed: 0,city,weather1,max_temp,min_temp,rain,sunrise,sunset,date
0,Matsumoto,unknown,7.0℃,-7.9℃,0.0mm,06:59,16:44,2019-01-01
1,Matsumoto,unknown,4.4℃,-0.7℃,1.5mm,06:59,16:44,2019-01-02
2,Matsumoto,unknown,4.3℃,-6.0℃,0.0mm,06:59,16:45,2019-01-03
3,Matsumoto,unknown,4.7℃,-8.2℃,0.0mm,07:00,16:46,2019-01-04
4,Matsumoto,unknown,10.4℃,-2.4℃,0.0mm,07:00,16:47,2019-01-05
5,Matsumoto,unknown,1.3℃,-2.7℃,0.0mm,07:00,16:48,2019-01-06
6,Matsumoto,unknown,4.9℃,-7.3℃,0.0mm,07:00,16:49,2019-01-07
7,Matsumoto,unknown,8.8℃,-5.3℃,0.0mm,07:00,16:50,2019-01-08
8,Matsumoto,unknown,2.4℃,-2.2℃,0.0mm,07:00,16:50,2019-01-09
9,Matsumoto,unknown,1.5℃,-8.9℃,0.0mm,07:00,16:51,2019-01-10


In [19]:
cleaned_df.to_csv('weather.csv')

In [20]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 425 entries, 0 to 424
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   city      425 non-null    object        
 1   weather1  425 non-null    object        
 2   max_temp  425 non-null    object        
 3   min_temp  425 non-null    object        
 4   rain      425 non-null    object        
 5   sunrise   425 non-null    object        
 6   sunset    425 non-null    object        
 7   date      425 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(7)
memory usage: 29.9+ KB


In [22]:
cleaned_df.

city          1
weather1     15
max_temp    244
min_temp    227
rain         42
sunrise     151
sunset      158
date        425
dtype: int64

In [23]:
cleaned_df['weather1'].value_counts()

unknown               364
rain                   28
cloudy then sunny       5
snow                    5
cloudy weather          5
sunny                   4
cloudy then rain        3
cloudy                  3
snow and then rain      2
partly cloudy           1
fine snow               1
cloudy after rain       1
rain then sunny         1
snow then sunny         1
sleet                   1
Name: weather1, dtype: int64

In [24]:
cleaned_df['rain'].value_counts()

0.0mm      303
0.5mm       17
1.5mm       12
2.0mm       12
1.0mm        8
2.5mm        6
4.0mm        5
9.5mm        5
3.5mm        4
7.0mm        4
6.5mm        4
11.5mm       4
4.5mm        3
3.0mm        3
6.0mm        3
5.5mm        3
9.0mm        2
19.5mm       2
26.5mm       2
17.5mm       1
15.5mm       1
52.0mm       1
44.0mm       1
51.5mm       1
53.0mm       1
14.5mm       1
28.0mm       1
20.0mm       1
10.5mm       1
134.0mm      1
29.5mm       1
20.5mm       1
25.5mm       1
21.5mm       1
13.5mm       1
12.0mm       1
8.0mm        1
27.5mm       1
36.5mm       1
8.5mm        1
14.0mm       1
16.0mm       1
Name: rain, dtype: int64