In [194]:
import concurrent.futures
import datetime
import os
import threading
import time

import numpy as np
import pandas as pd
import pymongo
from bs4 import BeautifulSoup
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

In [170]:
def formFecha(dia,mes):
    """Return day and month as zero-padded text.

    Each value is converted with ``str``; a one-character result gets a
    leading ``'0'``, any other length is returned unchanged.

    Args:
        dia: Day value (anything ``str`` accepts).
        mes: Month value (anything ``str`` accepts).

    Returns:
        Tuple ``(day_text, month_text)``.
    """
    def _dosDigitos(valor):
        # Only pad when the text is exactly one character long.
        texto = str(valor)
        return '0' + texto if len(texto) == 1 else texto

    return (_dosDigitos(dia), _dosDigitos(mes))

In [171]:
def ObtenerTabla(soup,ciudad,pais):
    """Flatten the yearly historical table of a parsed page into row dicts.

    Reads the ``div.historical-yearly-data`` table and the text of the
    ``li.active-specie`` element (stored as ``'Tipo'``). Each ``<tr>`` carries
    a ``key`` attribute: its first four characters are parsed as the year and
    the remainder, plus one, as the month (presumably the page uses a
    zero-based month index -- confirm against the site). Only rows with
    year >= 2018 and month <= 5 are kept.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.
        ciudad: City name copied into every row.
        pais: Country name copied into every row.

    Returns:
        List of dicts with keys ``Ciudad``, ``Pais``, ``Tipo``, ``Value`` and
        ``Fecha`` (``dd/mm/yyyy``). The day is the 1-based position of each
        ``<text>`` node inside the row's ``td.squares`` SVG.
    """
    contenedor = soup.find("div", class_="historical-yearly-data")
    tipo = soup.find('li',class_='active-specie').text
    registros = []
    for fila in contenedor.table.tbody.find_all("tr"):  # every row of the table
        clave = fila['key']
        year = int(clave[:4])
        month = int(clave[4:]) + 1
        # Skip anything before 2018 or after May.
        if year < 2018 or month > 5:
            continue
        celdas = fila.find('td',class_='squares').svg.find_all('text')
        for day, celda in enumerate(celdas, start=1):
            dd, mm = formFecha(day, month)
            registros.append({
                'Ciudad': ciudad,
                'Pais': pais,
                'Tipo': tipo,
                'Value': celda.text,
                'Fecha': f'{dd}/{mm}/{year}',
            })
    return registros

In [177]:
def SepararxFecha(key,df):
    """Build one wide record per calendar day for a single (city, country).

    Walks every date from 2018-01-01 to 2020-05-22 inclusive. For each date,
    the rows of ``df`` whose ``'Fecha'`` equals the ``dd/mm/yyyy`` text are
    folded into a single dict keyed by pollutant type.

    Args:
        key: Sequence whose first two items are the city and the country.
        df: DataFrame with ``'Fecha'``, ``'Tipo'`` and ``'Value'`` columns.

    Returns:
        List of dicts, one per day, with ``Ciudad``, ``Pais``, one entry per
        pollutant found for that day, and ``Fecha``.
    """
    primerDia = datetime.date(2018, 1, 1)
    ultimoDia = datetime.date(2020, 5, 22)
    totalDias = (ultimoDia - primerDia).days + 1  # inclusive range
    registros = []
    for desplazamiento in range(totalDias):
        dia = primerDia + datetime.timedelta(days=desplazamiento)
        dd, mm = formFecha(dia.day, dia.month)
        fecha = f'{dd}/{mm}/{dia.year}'
        registro = {'Ciudad': key[0], 'Pais': key[1]}
        for ind in df.loc[df['Fecha'] == fecha].index:
            tipo = df.at[ind, 'Tipo']
            # Any pollutant label containing a dot is stored under 'PM25'.
            columna = 'PM25' if '.' in tipo else tipo
            registro[columna] = df.at[ind, 'Value']
        registro['Fecha'] = fecha
        registros.append(registro)
    return registros

In [173]:
# Load the two input tables from the working directory.
muertesDF, urlValidos = (
    pd.read_csv(archivo) for archivo in ('muertesDF.csv', 'ciudades.csv')
)

In [174]:
# Join both tables on their shared columns (pd.merge default) and discard
# the two columns that are not used further down.
mergeDatos = pd.merge(muertesDF,urlValidos).drop(columns=['estacion','Comentarios'])

In [175]:
# Index labels of the rows that actually have a URL to scrape.
urls = mergeDatos.dropna(subset=['url']).index

In [176]:
# Scrape one parsed page (BeautifulSoup) per city and pollutant tab.
# Location of the ChromeDriver binary; adjust for the local machine.
CHROMEDRIVER_PATH = 'D:\\Documents\\PUCP\\Analisis de Datos\\chromedriver.exe'
# XPath of the <li> pollutant tabs inside the historic data widget.
XPATH_CONTAMINANTES = '//*[@id="historic-aqidata-inner"]/div[2]/div[2]/center/ul/li'

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
driver = webdriver.Chrome(CHROMEDRIVER_PATH,options = options)
soups = []
ciudades = []
paises=[]
start=time.perf_counter()
try:
    driver.maximize_window()
    driver.implicitly_wait(20)
    for i in urls:
        ciudad = mergeDatos.at[i,'Ciudad']
        pais = mergeDatos.at[i,'Pais']
        url = mergeDatos.at[i,'url']
        driver.get(url)
        # Scroll to the header so the historic widget gets rendered.
        element = driver.find_element_by_id('h1header2')
        driver.execute_script("arguments[0].scrollIntoView();", element)
        time.sleep(7)
        for contaminante in driver.find_elements_by_xpath(XPATH_CONTAMINANTES):
            tipo = contaminante.text
            if(tipo not in ['SO2']):
                contaminante.click()
                time.sleep(2)  # give the table time to redraw for this tab
                html =  driver.page_source
                soup = BeautifulSoup(html,'lxml')
                # The three lists are kept index-aligned: one entry per snapshot.
                soups.append(soup)
                ciudades.append(ciudad)
                paises.append(pais)
finally:
    # quit() ends the whole session (browser and chromedriver process), and
    # running it in `finally` avoids leaking them when a page fails.
    driver.quit()
finish = time.perf_counter()
print(f"Se demoro {round((finish-start)/60,2)} minutos en terminar.")

Se demoro 18.41 minutos en terminar.


In [178]:
# Parse every stored page into a table and attach the measurements to
# mergeDatos.
start=time.perf_counter()
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = [executor.submit(ObtenerTabla,soup,ciudad,pais) for (soup,ciudad,pais) in zip(soups,ciudades,paises) ]
    # Collect in submission order so the final frame does not depend on
    # which thread happens to finish first.
    tablas = [pd.DataFrame(f.result()) for f in results]

# Empty tables have no columns and would make pd.merge raise, so drop them.
tablas = [tabla for tabla in tablas if not tabla.empty]
if tablas:
    mediciones = pd.concat(tablas, ignore_index=True)
    # Merge once, explicitly on the city keys. Merging table by table made
    # every merge after the first join on all shared columns, so later tables
    # were appended with empty Casos/Muertes/url instead of being joined.
    mergeDatos = pd.merge(mergeDatos, mediciones, on=['Ciudad','Pais'], how='outer')
finish = time.perf_counter()
print(f"Se demoro {round((finish-start)/60,2)} minutos en terminar.")

Se demoro 0.29 minutos en terminar.


In [179]:
# Persist the merged table, then build a list of records carrying a
# sequential "_id".
columnasMerge = ['Ciudad','Pais','Casos','Muertes','url','Tipo','Value','Fecha']
mergeDatos.to_csv('mergeDatos.csv',header=columnasMerge)
merge = mergeDatos.to_dict('records')
for i, registro in enumerate(merge):
    registro["_id"]=i

In [180]:
# Empty frame that fixes the column order of the cleaned, wide-format data.
columnasLimpias = ['Ciudad','Pais','PM25','PM10','O3','NO2','CO','Fecha']
datosLimpios = pd.DataFrame(columns=columnasLimpias)

In [183]:
# Reshape the long table into one row per (city, country, day).
grupos = mergeDatos.groupby(by=['Ciudad','Pais'])
start = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Iterating the groupby yields each key with its sub-frame, so there is
    # no need to re-filter mergeDatos for every key.
    results = [executor.submit(SepararxFecha,key,grupo) for key, grupo in grupos]
    # Submission order keeps the row order reproducible between runs.
    porCiudad = [pd.DataFrame(f.result()) for f in results]

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; a single concat gives the same rows in one pass.
datosLimpios = pd.concat([datosLimpios] + porCiudad, ignore_index=True)
finish = time.perf_counter()
print(f"Se demoro {round((finish-start)/60,2)} minutos en terminar.")

Se demoro 0.1 minutos en terminar.


In [184]:
# Parse the dd/mm/yyyy text into real datetimes.
fechasParseadas = pd.to_datetime(datosLimpios['Fecha'],format='%d/%m/%Y')
datosLimpios['Fecha'] = fechasParseadas

In [196]:
# '-' cells become real missing values. np.nan is the supported spelling;
# the np.NaN alias was removed in NumPy 2.0.
datosLimpios = datosLimpios.replace('-',np.nan)

In [206]:
# Convert every pollutant column to a numeric dtype.
# errors='coerce' turns any leftover non-numeric token into NaN; the
# deprecated errors='ignore' silently left the whole column as strings
# whenever a single value failed to parse.
for columna in ['PM25','PM10','O3','NO2','CO']:
    datosLimpios[columna] = pd.to_numeric(datosLimpios[columna],errors='coerce')

In [215]:
# Persist the cleaned table, then build the list of documents with a
# sequential "_id".
columnasSalida = ['Ciudad','Pais','PM25','PM10','O3','NO2','CO','Fecha']
datosLimpios.to_csv('datosLimpios.csv',header=columnasSalida)
datos = datosLimpios.to_dict('records')
for i, documento in enumerate(datos):
    documento["_id"]=i

In [216]:
# The connection string (user, password, cluster) is read from the
# environment so credentials are never stored in the notebook.
# NOTE(review): the password that used to be hard-coded here should be
# rotated, since it has already been saved in this file's history.
host = os.environ.get('MONGODB_URI')
if not host:
    raise RuntimeError("Set the MONGODB_URI environment variable to the MongoDB connection string.")
cluster = MongoClient(host)
db = cluster['airData']
collection = db['dataHistorica']

start = time.perf_counter()
# Collection.insert was removed in PyMongo 4; insert_many is the bulk API.
# It rejects an empty list, hence the guard.
if datos:
    collection.insert_many(datos)
finish = time.perf_counter()
print(f'Termino en {round((finish-start)/60,2)} minutos\n')

Termino en 0.18 minutos



In [214]:
# Summary statistics of the numeric columns of the cleaned table.
datosLimpios.describe()

Unnamed: 0,PM25,PM10,O3,NO2,CO
count,4918.0,4585.0,5046.0,5156.0,4045.0
mean,57.964213,29.171865,30.368609,17.141001,4.575525
std,38.352777,22.83631,14.640769,13.191733,4.93975
min,1.0,1.0,0.0,0.0,0.0
25%,30.0,14.0,21.0,7.0,1.0
50%,50.0,24.0,29.0,13.0,3.0
75%,75.0,39.0,37.0,25.0,6.0
max,305.0,632.0,146.0,83.0,44.0
