Skip to content
This repository has been archived by the owner on Dec 22, 2022. It is now read-only.

Commit

Permalink
#52 #56 Script to automatize El Salvador
Browse files Browse the repository at this point in the history
Former-commit-id: dc188f2
Former-commit-id: 2263cd40ef3ec54e0d9648e5087468e35af42525
  • Loading branch information
pablodz committed Apr 7, 2020
1 parent 7246d5e commit 4b26439
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 0 deletions.
81 changes: 81 additions & 0 deletions utils/scripts/el_salvador_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import urllib.request as urllib
from bs4 import BeautifulSoup
import json
import re
import numpy as np
import pandas as pd
from datetime import date

"""
Variables a cambiar
"""
# --- Scraping settings -------------------------------------------------
# Page that is downloaded and parsed by the script.
webpage = "https://e.infogram.com/_/fx5xud0FhM7Z9NS6qpxs?src=embed"
# Tag name and zero-based position (among all tags of that name) of the
# element whose text holds the embedded JSON.
tag = "script"
tag_number = 4
# Text that precedes the JSON payload inside that element; it is removed
# before the remainder is parsed as JSON.
variable_json = "window.infographicData="
# Successive keys/indices followed through the parsed JSON to reach the table.
seq = [
    'elements',
    'content',
    'content',
    'entities',
    '777cd339-9f0a-4acc-8e3d-7a245cbc1b17',
    'props',
    'chartData',
    'data',
    0,
]

# --- ISO code settings -------------------------------------------------
# CSV with the ISO 3166-2 table that is filtered by country code.
iso_file = "https://raw.githubusercontent.com/DataScienceResearchPeru/covid-19_latinoamerica/master/utils/iso3166-2.csv"
# Subdivision names as spelled in the scraped data. The list is assigned
# positionally to the filtered ISO rows, so its order and length must match
# those rows.
external_subdivisions = [
    'AHUACHAPÁN',
    'CABAÑAS',
    'CHALATENANGO',
    'CUSCATLÁN',
    'LA LIBERTAD',
    'LA PAZ',
    'LA UNIÓN',
    'MORAZÁN',
    'SAN MIGUEL',
    'SAN SALVADOR',
    'SAN VICENTE',
    'SANTA ANA',
    'SONSONATE',
    'USULUTÁN',
]
# Columns kept from the ISO table.
filt_col = ['Code', 'Subdivision Name Used']
# Value matched against the ISO table's 'Country Code' column.
ctryiso = "SV"
# Name of the column added to the scraped dataset.
iso_col_name = "ISO 3166-2 Code"


def _scrape_cases_table(url, tag_name, tag_index, marker, path):
    """Download the page at *url* and return its embedded table as a DataFrame.

    The JSON payload is read from the text of the ``tag_index``-th
    ``tag_name`` element, starting right after *marker*. *path* is the list
    of keys/indices followed through the parsed JSON to reach the table,
    whose first row is used as the column header.

    Raises:
        ValueError: if the selected element has no text or lacks *marker*.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.urlopen(url) as response:
        html = response.read()

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed and emits a warning, so results vary by machine.
    soup = BeautifulSoup(html, "html.parser")
    script_text = soup.find_all(tag_name)[tag_index].string
    if script_text is None or marker not in script_text:
        raise ValueError(
            "Could not find {!r} in <{}> element number {}".format(
                marker, tag_name, tag_index))

    # Decode only the first JSON value after the marker, so a trailing ";"
    # or trailing whitespace is ignored rather than blindly sliced off.
    payload = script_text.split(marker, 1)[1].lstrip()
    node, _ = json.JSONDecoder().raw_decode(payload)

    # Walk the configured path; works for any path length.
    for key in path:
        node = node[key]

    table = np.array(node)
    # First row is the header; the remaining rows are data (any column count).
    return pd.DataFrame(table[1:], columns=table[0])


def _load_iso_table(csv_path, country_code, columns, external_names):
    """Return the ISO rows for *country_code* with an external-name column.

    *external_names* is attached positionally as the
    'External Subdivision Name' column, so it must have one entry per
    filtered row, in the same order.

    Raises:
        ValueError: if the number of names differs from the number of rows.
    """
    iso = pd.read_csv(csv_path)
    # .copy() so the new column is written to an independent frame, not to a
    # slice of `iso` (avoids SettingWithCopyWarning).
    table = iso.loc[iso['Country Code'] == country_code, columns].copy()
    if len(table) != len(external_names):
        raise ValueError(
            "Expected {} subdivisions for {!r} but the ISO table has {}".format(
                len(external_names), country_code, len(table)))
    table['External Subdivision Name'] = external_names
    return table


def _add_iso_codes(dataset, iso_table, code_column, today):
    """Add *code_column* and a "Date" column to *dataset* in place.

    Rows whose 'DEPARTAMENTO' is missing or blank inherit the code of the
    nearest preceding row that has a value; the 'DEPARTAMENTO' column itself
    is left untouched.

    Raises:
        ValueError: if any row cannot be matched to an ISO code.
    """
    departments = dataset['DEPARTAMENTO']
    missing = departments.isna() | (departments.astype(str).str.strip() == "")
    if missing.any():
        print("Please check dataset variable, there's None row. We're going to fix for you. Check row below.")

    # Forward-fill handles several consecutive blank rows, which a single
    # look-back at the previous row cannot.
    lookup_names = departments.mask(missing).ffill()
    code_by_name = dict(zip(iso_table['External Subdivision Name'],
                            iso_table['Code']))
    codes = lookup_names.map(code_by_name)

    unmatched = codes.isna()
    if unmatched.any():
        raise ValueError(
            "No ISO code found for rows {} (subdivisions: {})".format(
                list(dataset.index[unmatched]),
                lookup_names[unmatched].tolist()))

    # Assign whole columns at once; element-wise chained assignment
    # (df[col][i] = value) is unreliable and a no-op under copy-on-write.
    dataset[code_column] = codes
    dataset["Date"] = today


if __name__ == "__main__":
    # --- Scrape data ---
    print("We're going to scrap El Salvador Gov reports to create a pandas dataframe")
    dataset = _scrape_cases_table(webpage, tag, tag_number, variable_json, seq)
    print(dataset)

    # --- Build the ISO lookup table ---
    print("We're going to match ISO CODE")
    isocode_el_salvador = _load_iso_table(
        iso_file, ctryiso, filt_col, external_subdivisions)
    print(isocode_el_salvador)

    # --- Add the ISO code and date to the scraped data ---
    print("We're going to add ISO column to data scrapped")
    # Evaluate the date once so the column and the file name always agree.
    today = date.today()
    _add_iso_codes(dataset, isocode_el_salvador, iso_col_name, today)
    print(dataset)

    # NOTE: the original author flagged this CSV destination as temporary.
    # The path is relative to the current working directory.
    dataset.to_csv("utils/scripts/el_salvador_temporal/{}.csv".format(today), sep=',')
6 changes: 6 additions & 0 deletions utils/scripts/el_salvador_temporal/2020-04-06.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
,DEPARTAMENTO,MUNICIPIO,CANTIDAD DE CASOS,ISO 3166-2 Code,Date
0,SANTA ANA,SANTA ANA,2,SV-SA,2020-04-06
1,USULUTÁN,USULUTÁN,1,SV-US,2020-04-06
2,,SANTA ELENA,1,SV-US,2020-04-06
3,MORAZÁN,SAN FRANCISCO GOTERA,1,SV-MO,2020-04-06
4,LA LIBERTAD,COLÓN,1,SV-LI,2020-04-06

0 comments on commit 4b26439

Please sign in to comment.