In [2]:
import pandas as pd
import os 
import requests
from bs4 import BeautifulSoup

The main purpose of this notebook is to show how to scrape air quality index data from the Tutiempo website :)

From the above-mentioned website we can get details such as:

'Day', 'Average_Temp', 'Max_Temp', 'Min_Temp', 'Atm_Pressure', 'Average_relative_humidity', 'Total_rainfall_snowmelt', 'Average_visibility', 'Average_wind_speed', 'Maximum_sustained_wind_speed', etc.

If you look at the webpage, you will see that the data is available in an HTML table, so our goal is to find that table and extract its data.

To scrape the table, we have two options here: 1) using BeautifulSoup, 2) using pandas (I prefer pandas :D)


And I'll show you both ways in a very simple manner.


Let's get started 

### Note: We are scraping data from 2015 to 2020

# BeautifulSoup

In [9]:
# Scrape the daily climate table of station ws-421820 from Tutiempo with
# requests + BeautifulSoup, one page per month from January 2015 to
# December 2020, and collect every daily row into the dataframe `df`.

# Column names for the 15 cells of a daily row. You can use either the full form
# below or the short form:
#   'Day', 'T', 'TM', 'Tm', 'SLP', 'H', 'PP', 'VV', 'V', 'VM', 'VG', 'RA', 'SN', 'TS', 'FG'
df_columns = ['Day','Average_Temp','Max_Temp','Min_Temp','Atm_Pressure','Average_relative_humidity','Total_rainfall_snowmelt',
              'Average_visibility','Average_wind_speed','Maximum_sustained_wind_speed','VG', 'RA', 'SN', 'TS', 'FG']

# The month in the URL is always written with two digits:
#   March 2017 : https://en.tutiempo.net/climate/03-2017/ws-421820.html  -> 03-2017
#   Nov 2017   : https://en.tutiempo.net/climate/11-2017/ws-421820.html  -> 11-2017
# A zero-padded format spec covers both cases, so no if/else on the month is needed.
url_template = "https://en.tutiempo.net/climate/{0:02d}-{1}/ws-421820.html"

# Rows are gathered in a plain list and turned into a dataframe once at the end,
# which avoids growing the dataframe one row at a time.
scraped_rows = []

for year in range(2015, 2021):        # years 2015 to 2020

    for month in range(1, 13):        # months 1 to 12

        url = url_template.format(month, year)

        # A timeout keeps a stalled connection from hanging the notebook forever.
        html = requests.get(url, timeout=30)

        if not html.ok:
            # An error page does not contain the data table, so skip this month.
            print("Skipping {0}: HTTP status {1}".format(url, html.status_code))
            continue

        # Parse the html data using lxml
        soup = BeautifulSoup(html.text, "lxml")

        # Look for the table with class "medias mensuales numspan"
        table = soup.find("table", class_="medias mensuales numspan")

        if table is None:
            # soup.find returns None when nothing matches; skip the month instead
            # of failing with AttributeError on table.find_all.
            print("Skipping {0}: data table not found".format(url))
            continue

        # Fetch all the rows in the table
        tr_elements = table.find_all('tr')

        # The final row of the table is never stored (it is not a daily record),
        # so it is sliced off before iterating.
        for row in tr_elements[:-1]:

            # The data of a row is placed between td tags
            cells = [td.text.strip() for td in row.find_all('td')]

            # Only rows with exactly one cell per column are daily records;
            # header rows (no td cells) and malformed rows are ignored.
            if len(cells) != len(df_columns):
                continue

            # Some cells are blank; store a real NaN (not the string "NaN")
            # so that pandas recognises them as missing values.
            scraped_rows.append([cell if cell != '' else float('nan') for cell in cells])

df = pd.DataFrame(scraped_rows, columns=df_columns)

# Number the rows starting from 1
df.index = range(1, len(df) + 1)
                


In [10]:
# Preview the first 70 scraped rows (one row per day)

df.head(n=70)

Unnamed: 0,Day,Average_Temp,Max_Temp,Min_Temp,Atm_Pressure,Average_relative_humidity,Total_rainfall_snowmelt,Average_visibility,Average_wind_speed,Maximum_sustained_wind_speed,VG,RA,SN,TS,FG
1,1,14.9,21.6,8.8,1017.4,76,0,0.8,0.9,7.6,-,,,,o
2,2,14.7,21.6,10.2,1018.1,97,5.08,0.6,3.1,9.4,-,o,,,
3,3,15,16.8,13.6,1017.5,94,7.11,0.5,0.9,7.6,-,o,,,o
4,4,,,,,,,,,,,,,,o
5,5,,,,,,,,,,,,,,
6,6,,,,,,,,,,,,,,o
7,7,9.8,17.6,6.4,1016.6,90,0,0.5,3.9,11.1,-,,,,o
8,8,10.1,14.5,7.2,1019.2,90,0,0.3,3.3,7.6,-,,,,o
9,9,10.5,16.3,6.8,1020.1,84,0,0.5,1.5,5.4,-,,,,o
10,10,,,,,,,,,,,,,,o


# Pandas

In [11]:
# Scrape the same monthly pages with pandas.read_html, which reads HTML tables
# straight from a webpage without going through BeautifulSoup, and combine all
# months from January 2015 to December 2020 into the dataframe `df2`.

# Expected columns of the result: the short column names plus the year and
# month added to every record below.
df2_columns = ['Day', 'T', 'TM', 'Tm', 'SLP', 'H', 'PP', 'VV', 'V', 'VM', 'VG', 'RA', 'SN',
               'TS', 'FG','year','month']

# The month in the URL is always two digits (03-2017, 11-2017), so a zero-padded
# format spec replaces the separate branches for month < 10 and month >= 10.
url_template = "https://en.tutiempo.net/climate/{0:02d}-{1}/ws-421820.html"

# Monthly tables are collected in a list and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0, and concatenating inside the loop
# would copy the accumulated data on every iteration.
monthly_tables = []

for year in range(2015, 2021):        # years 2015 to 2020

    for month in range(1, 13):        # months 1 to 12

        url = url_template.format(month, year)

        try:
            # read_html returns a list of every table on the page;
            # the daily data is the table at index 2.
            table = pd.read_html(url)[2]
        except (ValueError, IndexError):
            # ValueError: no tables found on the page; IndexError: fewer than three tables.
            print("Skipping {0}: data table not found".format(url))
            continue

        # The last two rows of the table on the webpage do not hold useful
        # information, so they are dropped.
        table = table.iloc[:-2].copy()

        # Add year and month to each record so that the data can be grouped later
        table['year'] = year
        table['month'] = month

        monthly_tables.append(table)

if monthly_tables:
    df2 = pd.concat(monthly_tables, ignore_index=True)
    # Keep the expected columns first (created empty if a page lacked one),
    # followed by any additional columns the pages provided.
    extra_columns = [name for name in df2.columns if name not in df2_columns]
    df2 = df2.reindex(columns=df2_columns + extra_columns)
else:
    # Nothing could be scraped: still provide an empty frame with the expected columns.
    df2 = pd.DataFrame(columns=df2_columns)
        

In [12]:
# Preview the first 70 scraped rows (one row per day)

df2.head(n=70)

Unnamed: 0,Day,T,TM,Tm,SLP,H,PP,VV,V,VM,VG,RA,SN,TS,FG,year,month
0,1,14.9,21.6,8.8,1017.4,76,0,0.8,0.9,7.6,-,,,,o,2015,1
1,2,14.7,21.6,10.2,1018.1,97,5.08,0.6,3.1,9.4,-,o,,,,2015,1
2,3,15,16.8,13.6,1017.5,94,7.11,0.5,0.9,7.6,-,o,,,o,2015,1
3,4,,,,,,,,,,,,,,o,2015,1
4,5,,,,,,,,,,,,,,,2015,1
5,6,,,,,,,,,,,,,,o,2015,1
6,7,9.8,17.6,6.4,1016.6,90,0,0.5,3.9,11.1,-,,,,o,2015,1
7,8,10.1,14.5,7.2,1019.2,90,0,0.3,3.3,7.6,-,,,,o,2015,1
8,9,10.5,16.3,6.8,1020.1,84,0,0.5,1.5,5.4,-,,,,o,2015,1
9,10,,,,,,,,,,,,,,o,2015,1


In [14]:
# Store the output in a local CSV file. If you want the index written as well,
# set index to True (the default).
#
# The destination defaults to the original hard-coded location; set the
# AQI_RAW_DATA_CSV environment variable to write somewhere else without
# editing the notebook (the default path only exists on the author's machine).

output_csv = os.environ.get("AQI_RAW_DATA_CSV", "C:/Users/Daya/Desktop/DataSets/AQI/raw_data.csv")

df2.to_csv(output_csv, index=False)