In [2]:
# Imports
import pandas as pd
import numpy as np
import psycopg2, os
from tqdm import tqdm


---
## 1. Collecting and processing the data

The data is downloaded as zip files from [Meta's data for good](https://dataforgood.facebook.com/dfg/tools/high-resolution-population-density-maps) platform. Once downloaded, the data is decompressed into the source csv files.

Now we process these files to give them a common format

In [14]:
# For each file
i=0
for file in tqdm(os.listdir("data/population_density/csv/")):
    if file!='.DS_Store':
        
        # Read the data and pars its params
        data = pd.read_csv(f"data/population_density/csv/{file}")
        date = file.split('-')[0][-4:]+'-'+file.split('-')[1]+'-'+file.split('-')[2][:2]
        segment = file.split('-')[0][:-5]
        
        # Add info
        data['date'] = date
        data['segment'] = segment
        
        # To S3
        if 'population_2020' in data.columns:
            data = data.rename(columns={'population_2020': 'population'})
        data[['latitude', 
              'longitude', 
              'population', 
              'date', 
              'segment']].to_csv(f'data/population_density/to_s3/file_{i}.csv', index=False)
        i+=1
        
# Check some of the data created
pd.read_csv(f"data/population_density/population_density/file_0.csv").head()


Unnamed: 0,latitude,longitude,population,date,segment
0,-13.999861,-50.92625,0.773618,2018-10-01,population_bra_northeast
1,-13.999861,-50.832361,0.687046,2018-10-01,population_bra_northeast
2,-13.999861,-49.926806,1.193779,2018-10-01,population_bra_northeast
3,-13.999861,-49.863472,1.00115,2018-10-01,population_bra_northeast
4,-13.999861,-49.764583,0.842562,2018-10-01,population_bra_northeast


Data is uploaded to `s3://postgres-staging-data/population_density/`

---
## 2.  Data to postgres

Establish connection

In [4]:
# Establish connection and create its cursor
try: 
    conn = psycopg2.connect(f"host={os.environ['AURORA_POSTGRES_HOST']} dbname={os.environ['AURORA_POSTGRES_DATABASE']} user={os.environ['AURORA_POSTGRES_USERNAME']} password={os.environ['AURORA_POSTGRES_PWD']}")
    cur = conn.cursor()
except psycopg2.Error as e: 
    print("Error: Could not make connection to the Postgres database")
    print(e)
    

Create table to store this data

In [None]:
# Create the table
cur.execute("CREATE TABLE staging_tables.population_density (latitude numeric, longitude numeric, population bigint, date varchar, segment varchar)")
conn.commit()


Load the data from S3 to the table

In [10]:
for i in range(10):
    statement = f"""
        SELECT aws_s3.table_import_from_s3(
        'staging_tables.population_density',
        'latitude,longitude,population,date, segment',
        '(FORMAT CSV, HEADER true)',
        aws_commons.create_s3_uri(
            'postgres-staging-data',
            'population_density/2021-12/file_{i}.csv',
            'global'
            ),
        aws_commons.create_aws_credentials(
            '{os.environ['AWS_ACCESS_KEY_ID']}',
            '{os.environ['AWS_SECRET_ACCESS_KEY']}'
            )
        );
    """

KeyError: 'AWS_ACCESS_KEY_ID'