# Web scraping to get weather information 2018-2020

In [1]:
import os
from datetime import datetime, timedelta, timezone
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

In [2]:
# URL pattern for tenki.jp's past-weather page. The three placeholders are filled
# with year, zero-padded month and zero-padded day by the .format(...) calls below.
# NOTE(review): the trailing "3/23" presumably selects the region/prefecture -- confirm.
url_template = 'https://tenki.jp/past/{}/{:02d}/{:02d}/weather/3/23/'

In [3]:
# Build the URL for one sample day (2018-04-26) and download the page.
url = url_template.format(2018, 4, 26)
# The timeout keeps the cell from hanging forever if the server never answers.
html_doc = requests.get(url, timeout=30).text

In [4]:
# Name the parser explicitly: without it bs4 picks whichever parser happens to be
# installed (and emits a warning), so the parse could differ between machines.
soup = BeautifulSoup(html_doc, 'html.parser')
# Tip: soup.prettify() shows the downloaded markup when debugging.

In [5]:
# Locate the results table by its CSS class. Matching with class_ instead of
# indexing tag['class'] avoids a KeyError on tables that have no class attribute.
table = soup.find('table', class_='past-live-area-pref-list-entries')

In [6]:
# Parse the table into a DataFrame; '---' cells are read as missing values.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)), na_values=['---'], header=0)[0]

In [7]:
# Inspect the freshly parsed table (Japanese headers, units still attached)
df 

Unnamed: 0,地点名,天気,天気.1,最高気温,最低気温,日積算降水量(mm),日の出,日の入
0,長野,,雨のち晴,20.8℃,8.1℃,0.0mm,05:00,18:31
1,松本,,不明,21.6℃,8.7℃,0.0mm,05:01,18:31
2,諏訪,,不明,19.4℃,8.1℃,0.0mm,,
3,飯田,,不明,23.5℃,8.8℃,0.0mm,,
4,軽井沢,,不明,17.7℃,4.2℃,0.0mm,,


In [8]:
# Tidy the sample table: discard its final row, switch to English column names
# and stamp every remaining row with the day that was scraped.
df.drop(index=len(df) - 1, inplace=True)
df.columns = [
    'city', 'weather', 'weather1', 'max_temp',
    'min_temp', 'rain', 'sunrise', 'sunset',
]
df['date'] = pd.to_datetime('{}/{}/{}'.format(2018, 4, 26))

In [9]:
# Check the result: English column names, last row removed, date column added
df 

Unnamed: 0,city,weather,weather1,max_temp,min_temp,rain,sunrise,sunset,date
0,長野,,雨のち晴,20.8℃,8.1℃,0.0mm,05:00,18:31,2018-04-26
1,松本,,不明,21.6℃,8.7℃,0.0mm,05:01,18:31,2018-04-26
2,諏訪,,不明,19.4℃,8.1℃,0.0mm,,,2018-04-26
3,飯田,,不明,23.5℃,8.8℃,0.0mm,,,2018-04-26


In [10]:
# Automation
def get_weather_for_one_day(year, month, day, timeout=30):
    """Scrape the tenki.jp past-weather table for a single day.

    Parameters
    ----------
    year, month, day : int
        Calendar date to fetch; substituted into ``url_template``.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` gives up,
        so one unresponsive request cannot stall a long scraping loop.

    Returns
    -------
    pandas.DataFrame
        The parsed table without its last row, with the columns ``city``,
        ``weather``, ``weather1``, ``max_temp``, ``min_temp``, ``rain``,
        ``sunrise`` and ``sunset`` plus a ``date`` column holding the
        requested day.

    Raises
    ------
    ValueError
        If the page contains no results table or the table cannot be parsed.
    """
    no_data_message = 'No data is available for {}/{:02d}/{:02d}'.format(year, month, day)

    url = url_template.format(year, month, day)
    html_doc = requests.get(url, timeout=timeout).text
    # Explicit parser: keeps the parse identical across machines and avoids
    # bs4's "no parser was explicitly specified" warning.
    soup = BeautifulSoup(html_doc, 'html.parser')
    # class_ matching never raises on tables without a class attribute,
    # unlike indexing tag['class'] inside a lambda.
    table = soup.find('table', class_='past-live-area-pref-list-entries')
    if table is None:
        raise ValueError(no_data_message)

    try:
        # StringIO: literal HTML strings are deprecated as read_html input.
        df = pd.read_html(StringIO(str(table)), na_values=['---'], header=0)[0]
    except Exception as e:
        print(e)
        raise ValueError(no_data_message) from e

    # Discard the table's last row; slicing also copes with an empty table.
    df = df.iloc[:-1].copy()
    df.columns = ['city', 'weather', 'weather1', 'max_temp', 'min_temp', 'rain', 'sunrise',
                  'sunset']
    df['date'] = pd.to_datetime('{}/{}/{}'.format(year, month, day))

    return df

In [11]:
#to download  the weather data from a specific date
def get_weather_from_date(year, month, day):
    """Download the daily weather tables from a start date up to the present.

    Days are fetched one at a time with ``get_weather_for_one_day``, starting at
    the given date and continuing while the day began at least 24 hours before
    the current UTC time. Days for which that helper raises ``ValueError`` are
    reported with ``print`` and skipped.

    Parameters
    ----------
    year, month, day : int
        First calendar day to download.

    Returns
    -------
    pandas.DataFrame
        The per-day frames concatenated in date order. Each day's own row index
        is kept, so index labels repeat across days.

    Raises
    ------
    ValueError
        If no day produced any data (for example, a start date in the future).
    """
    start = datetime(year, month, day)
    # Naive UTC "now"; equivalent to datetime.utcnow(), which is deprecated
    # since Python 3.12.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    n_days = max((now_utc - start).days, 0)

    frames = []
    for offset in range(n_days):
        current = start + timedelta(days=offset)
        try:
            frames.append(get_weather_for_one_day(current.year, current.month, current.day))
        except ValueError as e:
            print(e)

    if not frames:
        # pd.concat([]) would raise a ValueError with an unhelpful message.
        raise ValueError(
            'No weather data could be downloaded starting from {}/{:02d}/{:02d}'.format(
                year, month, day))
    return pd.concat(frames)

In [12]:
# Get all data from 2018-04-26 until today
df = get_weather_from_date(2018, 4, 26)

In [13]:
# Safety copy of the raw scrape. copy must be *called*: the bare attribute
# `df.copy` only stores the bound method, not an independent DataFrame.
dfc = df.copy()

In [14]:
# Display the scraped frame (pandas elides the middle rows of a long frame)
df

Unnamed: 0,city,weather,weather1,max_temp,min_temp,rain,sunrise,sunset,date
0,長野,,雨のち晴,20.8℃,8.1℃,0.0mm,05:00,18:31,2018-04-26
1,松本,,不明,21.6℃,8.7℃,0.0mm,05:01,18:31,2018-04-26
2,諏訪,,不明,19.4℃,8.1℃,0.0mm,,,2018-04-26
3,飯田,,不明,23.5℃,8.8℃,0.0mm,,,2018-04-26
0,長野,,曇のち晴,21.2℃,9.6℃,0.0mm,04:59,18:32,2018-04-27
...,...,...,...,...,...,...,...,...,...
3,飯田,,晴,23.2℃,10.7℃,0.0mm,,,2020-05-13
0,長野,,晴,26.1℃,6.3℃,0.0mm,04:41,18:47,2020-05-14
1,松本,,晴,26.2℃,6.5℃,0.0mm,04:43,18:47,2020-05-14
2,諏訪,,晴,23.6℃,5.9℃,0.0mm,,,2020-05-14


In [31]:
# General cleaning data 
def clean_data(df, city_mapping, weather_mapping=None, city='Matsumoto',
               start_date='2018-04-26', end_date='2020-02-29'):
    """Translate, filter and trim the scraped weather table.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped data with at least the columns ``city``, ``weather``,
        ``weather1`` and ``date``. It is not modified.
    city_mapping : dict
        Maps the scraped city names to the names used in the output.
    weather_mapping : dict, optional
        Maps the scraped ``weather1`` labels to output labels. Defaults to the
        notebook-level ``weather1_mapping``.
    city : str, optional
        Translated city name whose rows are kept.
    start_date, end_date : str, optional
        Inclusive bounds applied to the ``date`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame without the ``weather`` column, holding only the rows of
        ``city`` inside the date window. Missing values, including cities and
        weather labels absent from the mappings, are replaced by ``'Unknown'``.
        The index is renumbered after the city filter but before the date
        filter, so labels are positions within that city's rows.
    """
    if weather_mapping is None:
        # Backward-compatible default: the translation table defined in the notebook.
        weather_mapping = weather1_mapping

    # drop() returns a new frame, so the caller's data is left untouched.
    cleaned = df.drop(columns=['weather'])
    cleaned = cleaned.assign(
        city=cleaned['city'].map(city_mapping),
        weather1=cleaned['weather1'].map(weather_mapping),
    )
    cleaned = cleaned.fillna('Unknown')
    cleaned = cleaned[cleaned['city'] == city].reset_index(drop=True)

    in_window = (cleaned['date'] >= start_date) & (cleaned['date'] <= end_date)
    # Return an independent copy so later column assignments on the result do
    # not trigger pandas' SettingWithCopyWarning.
    return cleaned.loc[in_window].copy()


In [32]:
# Translate the main cities in Nagano prefecture and the weather descriptions.
# Scraped values missing from these tables map to NaN when used with Series.map.
city_mapping = {
    '長野': 'Nagano',
    '松本': 'Matsumoto',
    '諏訪': 'Suwa',
    '飯田': 'Iida'
}

# In these labels "のち" means "later/then": "AのちB" is A first, B afterwards.
weather1_mapping = {
    '不明': 'unknown', 
    '雪': 'snow', 
    '雪のち雨': 'snow and then rain', 
    # NOTE(review): literally "snow, briefly rain"; label kept to avoid changing saved output.
    '雪一時雨': 'sleet', 
    '雨': 'rain', 
    '晴': 'sunny', 
    '曇のち雨': 'cloudy then rain', 
    # NOTE(review): literally "sunny, then cloudy"; label kept to avoid changing saved output.
    '晴のち曇': 'cloudy weather', 
    '曇のち晴': 'cloudy then sunny',
    # NOTE(review): literally "sunny, briefly snow"; label kept to avoid changing saved output.
    '晴一時雪': 'fine snow', 
    '雨のち晴': 'rain then sunny', 
    '曇': 'cloudy', 
    '雪のち晴': 'snow then sunny', 
    '雨のち曇': 'cloudy after rain', 
    '曇時々晴': 'partly cloudy', 
    '雪のち曇': 'cloudy after snow', 
    '晴一時雨': 'sunny partly rain',
    # Fixed: this was 'sunny after rain', which reverses the order of events.
    '晴のち雨': 'sunny then rain'
}

In [33]:
# Clean the scraped data: translate names, keep Matsumoto only and trim to the
# date window defined in clean_data
cleaned_df = clean_data(df, city_mapping)

In [34]:
# Preview the first five cleaned rows
cleaned_df.head(5)

Unnamed: 0,city,weather1,max_temp,min_temp,rain,sunrise,sunset,date
0,Matsumoto,unknown,21.6℃,8.7℃,0.0mm,05:01,18:31,2018-04-26
1,Matsumoto,unknown,22.7℃,12.3℃,0.0mm,05:00,18:32,2018-04-27
2,Matsumoto,unknown,23.9℃,7.2℃,0.0mm,04:59,18:33,2018-04-28
3,Matsumoto,unknown,28.1℃,8.5℃,0.0mm,04:58,18:34,2018-04-29
4,Matsumoto,unknown,24.7℃,14.5℃,0.0mm,04:57,18:34,2018-04-30


In [35]:
# Save the cleaned data; with to_csv's defaults the index is written as the first column
cleaned_df.to_csv('weather.csv')

# Specific cleaning for the project

In [36]:
# Check column dtypes and non-null counts before the project-specific cleaning
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 675 entries, 0 to 674
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   city      675 non-null    object        
 1   weather1  675 non-null    object        
 2   max_temp  675 non-null    object        
 3   min_temp  675 non-null    object        
 4   rain      675 non-null    object        
 5   sunrise   675 non-null    object        
 6   sunset    675 non-null    object        
 7   date      675 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(7)
memory usage: 47.5+ KB


In [37]:
# Most weather1 values are unknown, so this column is dropped below; a weather API
# with better coverage could supply this information later.
cleaned_df['weather1'].value_counts()

unknown               590
rain                   52
snow                    5
cloudy then sunny       5
cloudy weather          5
sunny                   4
cloudy then rain        3
cloudy                  3
snow and then rain      2
rain then sunny         1
cloudy after rain       1
sleet                   1
fine snow               1
snow then sunny         1
partly cloudy           1
Name: weather1, dtype: int64

In [38]:
#cleaned_df['rain'].value_counts()

In [39]:
# Not relevant info for the project.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# lets this cell be re-run without raising a KeyError for columns already gone.
cleaned_df = cleaned_df.drop(columns=['weather1', 'sunrise', 'sunset'], errors='ignore')

In [40]:
# Cleaning columns for the temperature average: strip the '℃' unit and convert to float.
# astype(str) keeps the cell re-runnable once the columns are already numeric, and
# regex=False makes the replacement a plain substring match on every pandas version.
cleaned_df = cleaned_df.assign(**{
    col: cleaned_df[col].astype(str).str.replace('℃', '', regex=False).astype(float)
    for col in ('max_temp', 'min_temp')
})



In [41]:
# Daily temperature figure: row-wise average of the day's maximum and minimum,
# rounded to one decimal place.
cleaned_df['temp'] = (
    cleaned_df.loc[:, ['max_temp', 'min_temp']]
    .mean(axis='columns')
    .round(decimals=1)
)

In [42]:
# Cleaning the rain column: strip the 'mm' unit and convert to float.
# astype(str) keeps the cell re-runnable once the column is already numeric, and
# regex=False makes the replacement a plain substring match.
cleaned_df = cleaned_df.assign(
    rain=cleaned_df['rain'].astype(str).str.replace('mm', '', regex=False).astype(float)
)

In [43]:
# Dropping columns that are no longer needed after cleaning.
# Rebinding the name with errors='ignore' (instead of inplace=True) keeps the cell
# safe to re-run after the columns have already been removed.
cleaned_df = cleaned_df.drop(columns=['city', 'max_temp', 'min_temp'], errors='ignore')

In [44]:
# Keep exactly the three output columns, in this order
cleaned_df = cleaned_df.loc[:, ['date', 'temp', 'rain']]

In [45]:
# Confirm the final columns and their dtypes
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 675 entries, 0 to 674
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    675 non-null    datetime64[ns]
 1   temp    675 non-null    float64       
 2   rain    675 non-null    float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 21.1 KB


In [46]:
# Preview the first rows of the final table
cleaned_df.head()

Unnamed: 0,date,temp,rain
0,2018-04-26,15.2,0.0
1,2018-04-27,17.5,0.0
2,2018-04-28,15.6,0.0
3,2018-04-29,18.3,0.0
4,2018-04-30,19.6,0.0


In [47]:
# Preview the last rows to check where the date range ends
cleaned_df.tail()

Unnamed: 0,date,temp,rain
670,2020-02-25,2.8,7.0
671,2020-02-26,4.4,1.0
672,2020-02-27,1.8,0.0
673,2020-02-28,1.1,0.0
674,2020-02-29,6.0,0.0


In [48]:
# Save the final date/temp/rain table; with to_csv's defaults the index is written
# as the first column
cleaned_df.to_csv('weather_prophet18-20.csv')