#52 #56 Script to automate El Salvador data scraping

Former-commit-id: dc188f2 Former-commit-id: 2263cd40ef3ec54e0d9648e5087468e35af42525
DataScienceResearchPeru · Apr 7, 2020 · 4b26439 · 4b26439
1 parent 7246d5e
commit 4b26439
Show file tree

Hide file tree

Showing 2 changed files with 87 additions and 0 deletions.
diff --git a/utils/scripts/el_salvador_data.py b/utils/scripts/el_salvador_data.py
@@ -0,0 +1,81 @@
+import urllib.request as urllib
+from bs4 import BeautifulSoup
+import json
+import re
+import numpy as np
+import pandas as pd
+from datetime import date
+
+# ---------------------------------------------------------------------------
+# Settings to change when the source page or the ISO table changes
+# ---------------------------------------------------------------------------
+
+# --- Scraping ---
+# Page that embeds the infographic.
+webpage = "https://e.infogram.com/_/fx5xud0FhM7Z9NS6qpxs?src=embed"
+# HTML tag searched for, and the position of the wanted match among all hits.
+tag = "script"
+tag_number = 4
+# Text removed from the tag contents before they are parsed as JSON.
+variable_json = "window.infographicData="
+# Keys / indexes followed, in order, through the parsed JSON to reach the data.
+seq = [
+    'elements',
+    'content',
+    'content',
+    'entities',
+    '777cd339-9f0a-4acc-8e3d-7a245cbc1b17',
+    'props',
+    'chartData',
+    'data',
+    0,
+]
+
+# --- ISO code ---
+# CSV holding the ISO 3166-2 table.
+iso_file = "https://raw.githubusercontent.com/DataScienceResearchPeru/covid-19_latinoamerica/master/utils/iso3166-2.csv"
+# Subdivision names as spelled in the scraped data. They are attached
+# positionally to the rows selected by `ctryiso`, so the order matters.
+external_subdivisions = [
+    'AHUACHAPÁN',
+    'CABAÑAS',
+    'CHALATENANGO',
+    'CUSCATLÁN',
+    'LA LIBERTAD',
+    'LA PAZ',
+    'LA UNIÓN',
+    'MORAZÁN',
+    'SAN MIGUEL',
+    'SAN SALVADOR',
+    'SAN VICENTE',
+    'SANTA ANA',
+    'SONSONATE',
+    'USULUTÁN',
+]
+# Columns kept from the ISO table.
+filt_col = ['Code', 'Subdivision Name Used']
+# Value matched against the ISO table's 'Country Code' column.
+ctryiso = "SV"
+# Name of the column added to the scraped dataset.
+iso_col_name = "ISO 3166-2 Code"
+
+
+if __name__ == "__main__":
+    # ------------------------------------------------------------------
+    # Scrape the data
+    # ------------------------------------------------------------------
+    print("We're going to scrap El Salvador Gov reports to create a pandas dataframe")
+    # Close the HTTP response as soon as the page has been read.
+    with urllib.urlopen(webpage) as web:
+        html = web.read()
+    # Name the parser explicitly so the result does not depend on which
+    # optional parsers happen to be installed on the machine.
+    soup = BeautifulSoup(html, "html.parser")
+    data = soup.find_all(tag)[tag_number].string
+    # Remove the JS assignment prefix and the trailing ";" (tolerating any
+    # surrounding whitespace) so that only the JSON document is left.
+    dirty_json = data.replace(variable_json, " ").strip().rstrip(";")
+    # Walk the configured key sequence down to the table of cases.
+    cases_per_subdivision = json.loads(dirty_json)
+    for key in seq:
+        cases_per_subdivision = cases_per_subdivision[key]
+    table = np.array(cases_per_subdivision)
+    # First row of the table holds the column names, the rest is the data.
+    dataset = pd.DataFrame({table[0, 0]: table[1:, 0],
+                            table[0, 1]: table[1:, 1],
+                            table[0, 2]: table[1:, 2]})
+    print(dataset)
+
+    # ------------------------------------------------------------------
+    # Build the ISO lookup table for the country
+    # ------------------------------------------------------------------
+    print("We're going to match ISO CODE")
+    iso = pd.read_csv(iso_file)
+    # .copy() so the extra column is added to an independent frame and not
+    # to a slice of `iso`.
+    isocode_el_salvador = iso[iso['Country Code'] == ctryiso][filt_col].copy()
+    # Positional assignment: one external name per selected ISO row.
+    isocode_el_salvador['External Subdivision Name'] = external_subdivisions
+    print(isocode_el_salvador)
+
+    # ------------------------------------------------------------------
+    # Add the ISO code and the date to the scraped data
+    # ------------------------------------------------------------------
+    print("We're going to add ISO column to data scrapped")
+    # Build the name -> code mapping once instead of filtering per row.
+    code_by_subdivision = dict(zip(isocode_el_salvador['External Subdivision Name'],
+                                   isocode_el_salvador['Code']))
+    # Read the date once so the column and the file name always agree.
+    today = date.today()
+    iso_codes = []
+    last_subdivision = None
+    for subdivision in dataset['DEPARTAMENTO']:
+        if pd.isna(subdivision):
+            # A missing department inherits the one from the closest
+            # previous row that had a value.
+            print("Please check dataset variable, there's None row. We're going to fix for you. Check row below.")
+            subdivision = last_subdivision
+        if subdivision not in code_by_subdivision:
+            raise ValueError("No ISO code found for subdivision {!r}".format(subdivision))
+        iso_codes.append(code_by_subdivision[subdivision])
+        last_subdivision = subdivision
+    # Assign whole columns at once; element-wise chained assignment
+    # (df[col][i] = ...) is not guaranteed to write to the frame.
+    dataset[iso_col_name] = iso_codes
+    dataset["Date"] = today
+    print (dataset)
+
+    # WARNING: writing the result to a local CSV is temporary
+    # (original note: "CHANGE REPOSITORY FAILS").
+    dataset.to_csv("utils/scripts/el_salvador_temporal/{}.csv".format(today), sep=',')
diff --git a/utils/scripts/el_salvador_temporal/2020-04-06.csv b/utils/scripts/el_salvador_temporal/2020-04-06.csv
@@ -0,0 +1,6 @@
+,DEPARTAMENTO,MUNICIPIO,CANTIDAD DE CASOS,ISO 3166-2 Code,Date
+0,SANTA ANA,SANTA ANA,2,SV-SA,2020-04-06
+1,USULUTÁN,USULUTÁN,1,SV-US,2020-04-06
+2,,SANTA ELENA,1,SV-US,2020-04-06
+3,MORAZÁN,SAN FRANCISCO GOTERA,1,SV-MO,2020-04-06
+4,LA LIBERTAD,COLÓN,1,SV-LI,2020-04-06