In [1]:
# Imports
import pandas as pd
import numpy as np
import psycopg2, os
from tqdm import tqdm


---
## 1. Collecting and processing the data

The data is downloaded as zip files from [Meta's data for good](https://dataforgood.facebook.com/dfg/tools/high-resolution-population-density-maps) platform. Once downloaded, the data is decompressed into the source csv files.

Next, we process these files so that they all share a common format.

In [7]:
# Build one normalised CSV (file_<i>.csv) per raw source file.
# The slicing below assumes raw file names look like
# "<segment>_<YYYY>-<MM>-<DD>..." — TODO confirm against the downloaded files.
data_dir = "data/population_density/2021-12/"

# The outputs are written into the same directory that is being listed, so
# files produced by a previous run must be skipped: their names contain no '-'
# and would make the date parsing below fail with an IndexError.
source_files = [
    file for file in os.listdir(data_dir)
    if file != '.DS_Store' and not file.startswith('file_')
]

for i, file in enumerate(tqdm(source_files)):

    # Read the data and parse its params from the file name
    data = pd.read_csv(f"{data_dir}{file}")
    name_parts = file.split('-')
    date = name_parts[0][-4:] + '-' + name_parts[1] + '-' + name_parts[2][:2]
    segment = name_parts[0][:-5]

    # Add info
    data['date'] = date
    data['segment'] = segment

    # Some files name the population column 'population_2020'; harmonise it
    if 'population_2020' in data.columns:
        data = data.rename(columns={'population_2020': 'population'})

    # Write the common-format file locally (this cell does not upload to S3)
    data[['latitude',
          'longitude',
          'population',
          'date',
          'segment']].to_csv(f'{data_dir}file_{i}.csv', index=False)

# Check some of the data created (nrows=5 already limits the preview)
pd.read_csv(f"{data_dir}file_1.csv", nrows=5)


Unnamed: 0,latitude,longitude,population,date,segment
0,-31.311806,-50.980972,0.996145,2018-10-01,population_bra_southeast
1,-31.278194,-50.942361,1.245182,2018-10-01,population_bra_southeast
2,-31.277917,-50.942361,1.245182,2018-10-01,population_bra_southeast
3,-31.277917,-50.942083,1.245182,2018-10-01,population_bra_southeast
4,-31.277917,-50.941806,1.245182,2018-10-01,population_bra_southeast


Data is uploaded to `s3://postgres-staging-data/population_density/2021-12/`

---
## 2.  Data to postgres

Establish connection

In [2]:
# Establish connection and create its cursor
try:
    # Pass the parameters as keyword arguments so psycopg2 assembles the DSN
    # itself; interpolating them into one string breaks as soon as a value
    # (typically the password) contains a space or a quote.
    conn = psycopg2.connect(
        host=os.environ['AURORA_POSTGRES_HOST'],
        dbname=os.environ['AURORA_POSTGRES_DATABASE'],
        user=os.environ['AURORA_POSTGRES_USERNAME'],
        password=os.environ['AURORA_POSTGRES_PWD'],
    )
    cur = conn.cursor()
except psycopg2.Error as e:
    print("Error: Could not make connection to the Postgres database")
    print(e)
    # Re-raise: the following cells need `conn` and `cur`, and would otherwise
    # fail later with an unrelated NameError.
    raise
    

### 2.1. Create table and load data
Create table to store this data

In [6]:
# Create the staging table (no-op when it already exists) and persist the DDL
create_table_sql = (
    "CREATE TABLE IF NOT EXISTS staging_tables.stg_population_density "
    "(latitude numeric, longitude numeric, population numeric, "
    "date varchar, segment varchar)"
)
cur.execute(create_table_sql)
conn.commit()


Load the data from S3 to the table.

**IMPORTANT NOTES**:
- Run the statement below for each file in the DBeaver editor; it won't work from here
- Make sure to refresh your credentials instead of taking them from env variables

In [None]:
# Build the S3 import statement for each file. The statement is only built
# here, it is not executed by this cell.
# Temporary AWS credentials expire: refresh them and export AWS_ACCESS_KEY_ID,
# AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN before running this cell.
# NOTE(review): a session token used to be hard-coded in this cell; treat it as
# leaked and make sure it has expired or been revoked.
for i in [1]:

    # Create a custom copy statement for the file.
    # The session token is its own (third) argument of create_aws_credentials.
    # Without the comma after the secret key, PostgreSQL concatenates the two
    # newline-separated string constants into a single, wrong secret key.
    statement = f"""
        SELECT aws_s3.table_import_from_s3(
        'staging_tables.stg_population_density',
        'latitude,longitude,population,date, segment',
        '(FORMAT CSV, HEADER true)',
        aws_commons.create_s3_uri(
            'postgres-staging-data',
            'population_density/2021-12/file_{i}.csv',
            'us-east-1'
            ),
        aws_commons.create_aws_credentials(
            '{os.environ['AWS_ACCESS_KEY_ID']}',
            '{os.environ['AWS_SECRET_ACCESS_KEY']}',
            '{os.environ['AWS_SESSION_TOKEN']}')
        );
    """
    

### 2.2. Sanity checks

Count all records and compare against files

In [3]:
# Count the loaded rows per segment and show them as a DataFrame
segment_count_query = "SELECT segment, COUNT(*) FROM staging_tables.stg_population_density GROUP BY segment"
cur.execute(segment_count_query)

# Column labels come from the cursor metadata of the executed query
result_columns = [column[0] for column in cur.description]
pd.DataFrame(cur.fetchall(), columns=result_columns)


Unnamed: 0,segment,count
0,BRA_children_under_five,36318011
1,BRA_elderly_60_plus,36349856
2,BRA_men,36392936
3,BRA_women,36392878
4,BRA_women_of_reproductive_age_15_49,36392004
5,BRA_youth_15_24,36379803
6,population_bra_northeast,9239575
7,population_bra_northwest,2102870
8,population_bra_southeast,18103162
9,population_bra_southwest,7220739


In [2]:
# Re-read every generated file and report its segment and number of rows,
# to compare against the per-segment counts in the database
for file_idx in range(10):
    segment_df = pd.read_csv(f"data/population_density/2021-12/file_{file_idx}.csv")
    segment_name = segment_df["segment"].unique()[0]
    record_count = len(segment_df)
    print(f"For segment {segment_name}, read {record_count} records")


For segment population_bra_northeast, read 9239575 records
For segment population_bra_southeast, read 18103162 records
For segment BRA_women, read 36392878 records
For segment BRA_elderly_60_plus, read 36349856 records
For segment BRA_children_under_five, read 36318011 records
For segment BRA_youth_15_24, read 36379803 records
For segment population_bra_northwest, read 2102870 records
For segment BRA_women_of_reproductive_age_15_49, read 36392004 records
For segment population_bra_southwest, read 7220739 records
For segment BRA_men, read 36392936 records
