# Web scraping to get weather information 2019-2020

In [1]:
import os
from datetime import datetime, timedelta, timezone
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

In [2]:
# URL pattern of the tenki.jp past-weather pages. The three placeholders are
# filled, in order, with the year, the zero-padded month and the zero-padded day.
# NOTE(review): the trailing "3/23" presumably selects the region/prefecture - confirm.
url_template = 'https://tenki.jp/past/{}/{:02d}/{:02d}/weather/3/23/'

In [3]:
# Build the URL for 2019/1/1 and download the raw HTML of that page.
url = url_template.format(2019, 1, 1)
# The timeout (seconds) keeps the cell from hanging forever if the server
# never answers; requests applies no timeout unless one is given.
html_doc = requests.get(url, timeout=30).text

In [4]:
# Parse the downloaded HTML (call soup.prettify() to inspect the markup).
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed, and bs4 does not warn about guessing.
soup = BeautifulSoup(html_doc, 'html.parser')

In [5]:
# Locate the weather table by its CSS class. `class_` matches when the value is
# any one of the tag's classes and simply skips tables that have no class
# attribute (indexing tag['class'] raised KeyError on those).
table = soup.find('table', class_='past-live-area-pref-list-entries')

In [6]:
# Convert the HTML table to a DataFrame; '---' cells are read as missing values.
# The markup is wrapped in StringIO because passing literal HTML strings to
# pd.read_html is deprecated in recent pandas versions.
df = pd.read_html(StringIO(str(table)), na_values=['---'], header=0)[0]

In [7]:
# Check the raw table as parsed from the page (original Japanese headers and units).
df 

Unnamed: 0,地点名,天気,天気.1,最高気温,最低気温,日積算降水量(mm),日の出,日の入
0,長野,,晴のち雪,5.6℃,-5.6℃,0.5mm,06:59,16:42
1,松本,,不明,7.0℃,-7.9℃,0.0mm,06:59,16:44
2,諏訪,,不明,5.1℃,-8.5℃,0.0mm,,
3,飯田,,不明,8.1℃,-7.3℃,0.0mm,,
4,軽井沢,,不明,3.7℃,-10.0℃,0.0mm,,


In [8]:
# Remove the last row of the parsed table, give the columns English names
# and stamp every row with the date of the page.
df.drop(index=len(df) - 1, inplace=True)
df.columns = [
    'city', 'weather', 'weather1', 'max_temp',
    'min_temp', 'rain', 'sunrise', 'sunset',
]
df['date'] = pd.to_datetime('{}/{}/{}'.format(2019, 1, 1))

In [9]:
# Check the result: English column names, last row removed, date column added.
df 

Unnamed: 0,city,weather,weather1,max_temp,min_temp,rain,sunrise,sunset,date
0,長野,,晴のち雪,5.6℃,-5.6℃,0.5mm,06:59,16:42,2019-01-01
1,松本,,不明,7.0℃,-7.9℃,0.0mm,06:59,16:44,2019-01-01
2,諏訪,,不明,5.1℃,-8.5℃,0.0mm,,,2019-01-01
3,飯田,,不明,8.1℃,-7.3℃,0.0mm,,,2019-01-01


In [10]:
# Automation
def get_weather_for_one_day(year, month, day):
    """Download the past-weather table for a single day as a DataFrame.

    Parameters
    ----------
    year, month, day : int
        Calendar date used to fill the placeholders of ``url_template``.

    Returns
    -------
    pandas.DataFrame
        The parsed table without its last row, with the columns ``city``,
        ``weather``, ``weather1``, ``max_temp``, ``min_temp``, ``rain``,
        ``sunrise``, ``sunset`` and an added ``date`` column.

    Raises
    ------
    ValueError
        If the page contains no table with the expected CSS class, or the
        table cannot be parsed.
    """
    url = url_template.format(year, month, day)
    # Without a timeout a single unresponsive request would block a
    # multi-day download forever.
    html_doc = requests.get(url, timeout=30).text
    # Name the parser so the outcome does not depend on which optional
    # parsers are installed.
    soup = BeautifulSoup(html_doc, 'html.parser')
    # `class_` matches any one of the tag's classes and tolerates tables with
    # no class attribute (tag['class'] raised KeyError for those).
    table = soup.find('table', class_='past-live-area-pref-list-entries')

    no_data_message = 'No data is available for {}/{:02d}/{:02d}'.format(year, month, day)
    if table is None:
        raise ValueError(no_data_message)

    try:
        # StringIO: literal HTML strings are deprecated as read_html input.
        df = pd.read_html(StringIO(str(table)), na_values=['---'], header=0)[0]
    except Exception as e:
        print(e)
        raise ValueError(no_data_message) from e
    # Drop the last row of the table (the 軽井沢 row in the 2019-01-01 sample).
    df.drop(df.shape[0] - 1, inplace=True)
    df.columns = ['city', 'weather', 'weather1', 'max_temp', 'min_temp', 'rain', 'sunrise',
                  'sunset']
    df['date'] = pd.to_datetime('{}/{}/{}'.format(year, month, day))

    return df

In [11]:
#to download  the weather data from a specific date
def get_weather_from_date(year, month, day):
    """Download the daily weather tables from the given date up to yesterday.

    Days are fetched one by one with ``get_weather_for_one_day``; a day for
    which that function raises ``ValueError`` is reported with ``print`` and
    skipped. The loop stops once the date is less than one full day before
    the current UTC time.

    Parameters
    ----------
    year, month, day : int
        First date to download.

    Returns
    -------
    pandas.DataFrame
        All daily tables concatenated in date order. The per-day row index is
        kept, so index labels repeat across days.

    Raises
    ------
    ValueError
        If no day could be downloaded (for example when the start date is
        today or in the future).
    """
    date = datetime(year, month, day)
    # Naive UTC "now": the same value datetime.utcnow() produced, without the
    # deprecated call.
    today = datetime.now(timezone.utc).replace(tzinfo=None)
    daily_frames = []
    while (today - date).days >= 1:
        try:
            daily_frames.append(get_weather_for_one_day(date.year, date.month, date.day))
        except ValueError as e:
            print(e)
        finally:
            # Always advance, otherwise a failing day would loop forever.
            date += timedelta(days=1)
    if not daily_frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            'No weather data could be downloaded from {}/{:02d}/{:02d} onwards'.format(
                year, month, day))
    return pd.concat(daily_frames)

In [12]:
# Download every daily table from 2019-01-01 up to yesterday (one request per day).
df = get_weather_from_date(year=2019, month=1, day=1)

In [13]:
# Backup copy of the downloaded data. copy() must be called: without the
# parentheses `dfc` would only hold the bound method, not a copy of the data.
dfc = df.copy()

In [14]:
# Display the combined frame (pandas abbreviates the middle rows of a long frame).
df

Unnamed: 0,city,weather,weather1,max_temp,min_temp,rain,sunrise,sunset,date
0,長野,,晴のち雪,5.6℃,-5.6℃,0.5mm,06:59,16:42,2019-01-01
1,松本,,不明,7.0℃,-7.9℃,0.0mm,06:59,16:44,2019-01-01
2,諏訪,,不明,5.1℃,-8.5℃,0.0mm,,,2019-01-01
3,飯田,,不明,8.1℃,-7.3℃,0.0mm,,,2019-01-01
0,長野,,雪のち晴,3.6℃,-1.1℃,3.0mm,07:00,16:43,2019-01-02
...,...,...,...,...,...,...,...,...,...
3,飯田,,晴一時雨,29.3℃,16.0℃,0.5mm,,,2020-05-11
0,長野,,晴のち曇,29.2℃,10.5℃,0.0mm,04:43,18:45,2020-05-12
1,松本,,晴のち曇,27.9℃,9.4℃,0.0mm,04:44,18:45,2020-05-12
2,諏訪,,晴のち曇,25.8℃,8.5℃,0.0mm,,,2020-05-12


In [15]:
# General cleaning data 
def clean_data(df, city_mapping, weather_mapping=None, city='Matsumoto',
               start_date='2019-01-01', end_date='2020-02-29'):
    """Translate, filter and tidy the scraped weather table.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped data with at least the columns ``city``, ``weather``,
        ``weather1`` and ``date``. The input frame is not modified.
    city_mapping : dict
        Maps scraped city names to the labels used in the result.
    weather_mapping : dict, optional
        Maps scraped weather descriptions to the labels used in the result.
        Defaults to the notebook-level ``weather1_mapping``.
    city : str, optional
        Translated city label to keep.
    start_date, end_date : str, optional
        Inclusive bounds applied to the ``date`` column.

    Returns
    -------
    pandas.DataFrame
        An independent frame with the rows of ``city`` inside the date range,
        without the ``weather`` column. Missing values, including names not
        found in the mappings, are replaced by ``'Unknown'``.
    """
    if weather_mapping is None:
        # Backward-compatible fallback to the mapping defined in the notebook.
        weather_mapping = weather1_mapping

    cleaned = df.copy()
    cleaned['city'] = cleaned['city'].map(city_mapping)
    cleaned['weather1'] = cleaned['weather1'].map(weather_mapping)
    cleaned = cleaned.fillna('Unknown')
    cleaned = cleaned.drop(columns=['weather'])
    cleaned = cleaned[cleaned['city'] == city].reset_index(drop=True)

    in_range = (cleaned['date'] >= start_date) & (cleaned['date'] <= end_date)
    # Return a real copy so later column assignments on the result do not
    # trigger SettingWithCopyWarning.
    return cleaned.loc[in_range].copy()


In [16]:
# Translate the main cities in Nagano prefecture and weather changes
# Scraped (Japanese) city names -> English labels.
city_mapping = {
    '長野': 'Nagano',
    '松本': 'Matsumoto',
    '諏訪': 'Suwa',
    '飯田': 'Iida'
}

# Scraped (Japanese) weather descriptions -> English labels.
# "AのちB" means "A, later B", so the English label keeps that order.
weather1_mapping = {
    '不明': 'unknown', 
    '雪': 'snow', 
    '雪のち雨': 'snow and then rain', 
    '雪一時雨': 'sleet', 
    '雨': 'rain', 
    '晴': 'sunny', 
    '曇のち雨': 'cloudy then rain', 
    # Was 'cloudy weather', which lost the initial sunny part.
    '晴のち曇': 'sunny then cloudy', 
    '曇のち晴': 'cloudy then sunny',
    '晴一時雪': 'fine snow', 
    '雨のち晴': 'rain then sunny', 
    '曇': 'cloudy', 
    '雪のち晴': 'snow then sunny', 
    '雨のち曇': 'cloudy after rain', 
    '曇時々晴': 'partly cloudy', 
    '雪のち曇': 'cloudy after snow', 
    '晴一時雨': 'sunny partly rain',
    # Was 'sunny after rain', which inverted the order of events.
    '晴のち雨': 'sunny then rain'
}

In [17]:
# Apply the general cleaning step to the full download.
cleaned_df = clean_data(df=df, city_mapping=city_mapping)

In [25]:
# Preview the first five cleaned rows.
cleaned_df.head()

Unnamed: 0,city,weather1,max_temp,min_temp,rain,sunrise,sunset,date
0,Matsumoto,unknown,7.0℃,-7.9℃,0.0mm,06:59,16:44,2019-01-01
1,Matsumoto,unknown,4.4℃,-0.7℃,1.5mm,06:59,16:44,2019-01-02
2,Matsumoto,unknown,4.3℃,-6.0℃,0.0mm,06:59,16:45,2019-01-03
3,Matsumoto,unknown,4.7℃,-8.2℃,0.0mm,07:00,16:46,2019-01-04
4,Matsumoto,unknown,10.4℃,-2.4℃,0.0mm,07:00,16:47,2019-01-05


In [19]:
# Persist the generally-cleaned data (the row index is written as the first column).
cleaned_df.to_csv(path_or_buf='weather.csv')

# Specific cleaning for the project

In [20]:
# Check column dtypes and non-null counts before the project-specific cleaning.
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 425 entries, 0 to 424
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   city      425 non-null    object        
 1   weather1  425 non-null    object        
 2   max_temp  425 non-null    object        
 3   min_temp  425 non-null    object        
 4   rain      425 non-null    object        
 5   sunrise   425 non-null    object        
 6   sunset    425 non-null    object        
 7   date      425 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(7)
memory usage: 29.9+ KB


In [27]:
# Inspect how often each weather description occurs. The column is dropped in
# the next step because too many values are missing; a better weather API
# could later supply this information to enrich the data.
cleaned_df['weather1'].value_counts()

In [26]:
#cleaned_df['rain'].value_counts()

In [28]:
# Not relevant info for the project.
# The frame is reassigned instead of modified in place, and columns that are
# already gone are ignored, so the cell can be re-run without a KeyError and
# without mutating a slice of another frame.
cleaned_df = cleaned_df.drop(columns=['weather1', 'sunrise', 'sunset'], errors='ignore')

In [33]:
# Cleaning columns for the average temperature: strip the '℃' unit and convert
# to float.
# astype(str) lets the cell be re-run after the columns are already numeric.
# errors='coerce' turns non-numeric placeholders (such as the 'Unknown' written
# by clean_data for missing values) into NaN instead of raising.
cleaned_df = cleaned_df.assign(
    max_temp=pd.to_numeric(
        cleaned_df['max_temp'].astype(str).str.replace('℃', '', regex=False),
        errors='coerce',
    ).astype(float),
    min_temp=pd.to_numeric(
        cleaned_df['min_temp'].astype(str).str.replace('℃', '', regex=False),
        errors='coerce',
    ).astype(float),
)



In [51]:
# Daily temperature estimate: the midpoint of the day's maximum and minimum,
# rounded to one decimal.
cleaned_df['temp'] = (
    cleaned_df.loc[:, ['max_temp', 'min_temp']]
    .mean(axis='columns')
    .round(decimals=1)
)

In [55]:
# Cleaning the rain column: strip the 'mm' unit and convert to float.
# astype(str) lets the cell be re-run after the column is already numeric.
# errors='coerce' turns non-numeric placeholders (such as the 'Unknown' written
# by clean_data for missing values) into NaN instead of raising.
cleaned_df = cleaned_df.assign(
    rain=pd.to_numeric(
        cleaned_df['rain'].astype(str).str.replace('mm', '', regex=False),
        errors='coerce',
    ).astype(float),
)

In [58]:
# Dropping columns that are no longer needed after cleaning.
# The frame is reassigned instead of modified in place, and columns that are
# already gone are ignored, so the cell can be re-run without a KeyError.
cleaned_df = cleaned_df.drop(columns=['city', 'max_temp', 'min_temp'], errors='ignore')

In [68]:
# Fix the final column order: date first, then the two measurements.
cleaned_df = cleaned_df.loc[:, ['date', 'temp', 'rain']]

In [69]:
# Re-check dtypes and non-null counts after the project-specific cleaning.
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 425 entries, 0 to 424
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    425 non-null    datetime64[ns]
 1   temp    425 non-null    float64       
 2   rain    425 non-null    float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 13.3 KB


In [71]:
# Preview the first rows of the final frame.
cleaned_df.head()

Unnamed: 0,date,temp,rain
0,2019-01-01,-0.5,0.0
1,2019-01-02,1.8,1.5
2,2019-01-03,-0.8,0.0
3,2019-01-04,-1.7,0.0
4,2019-01-05,4.0,0.0


In [72]:
# Preview the last rows to confirm where the date range ends.
cleaned_df.tail()

Unnamed: 0,date,temp,rain
420,2020-02-25,2.8,7.0
421,2020-02-26,4.4,1.0
422,2020-02-27,1.8,0.0
423,2020-02-28,1.1,0.0
424,2020-02-29,6.0,0.0


In [73]:
# Persist the final date/temp/rain table (the row index is written as the first column).
cleaned_df.to_csv(path_or_buf='weather_prophet.csv')