# Process Surveys
We need to process the raw data so that we can use it to scrape images and as a base for our models. From the LSMS surveys we need two files: one that contains the geovariables (lat and lon of the cluster) and one that contains the consumption. Getting the data is sometimes a bit tricky, since the two files are linked through keys which lie in other files.

In [1]:
%cd ..
%reload_ext autoreload
%autoreload 2

/Users/anashakim/Desktop/Machine Learning/Master/MLProject/predicting-poverty-through-time/src


In [2]:
from lib.lsms import LSMS
from tqdm import tqdm
import json
import os
import pandas as pd

In [3]:
def tza(data: dict, year: str, ppp: int) -> pd.DataFrame:
    """Process the Tanzania survey, which needs an extra join before LSMS can read it.

    The link between the coordinates of the clusters is done through
    `hh_sec_a.csv`: the household file is merged with the cluster file on
    ``clusterid`` and the merged table is written to a temporary CSV that is
    handed to ``LSMS`` as its household file.

    Args:
        data (dict): json entry of the current survey; the keys read here are
            ``cluster_path``, ``hh_path``, ``cons_path``, ``cons_key``,
            ``hhsize_key``, ``lat_key``, ``lon_key``, ``hhid_key``,
            ``multiply``, ``rural_key``, ``rural`` and ``urban``
        year (str): survey year, used for the LSMS object and the output file name
        ppp (int): ppp mode forwarded to ``LSMS`` (``run`` passes 1 for nominal
            and -1 for real consumption)

    Returns:
        pd.DataFrame: processed dataframe (``lsms.processed``)

    """
    import tempfile

    df: pd.DataFrame = pd.read_csv(f"../{data['cluster_path']}")
    df_hh: pd.DataFrame = pd.read_csv(f"../{data['hh_path']}")
    merged: pd.DataFrame = df_hh.merge(df, on=["clusterid"])

    # Use a unique scratch file instead of a fixed name in the working
    # directory; the ".csv" suffix is kept because the file is still a CSV.
    fd, name = tempfile.mkstemp(prefix="TZA-", suffix=".csv")
    os.close(fd)  # pandas reopens the path itself
    try:
        merged.to_csv(name)
        # ppp must be forwarded, otherwise nominal and real runs give the
        # same result for this survey.
        lsms: LSMS = LSMS(
            "TZA", year, cons_path=f"../{data['cons_path']}", hh_path=name, ppp=ppp)
        lsms.read_data()
        lsms.process_survey(
            cons_key=data["cons_key"],
            hhsize_key=data["hhsize_key"],
            lat_key=data["lat_key"],
            lon_key=data["lon_key"],
            hhid_key=data["hhid_key"],
            multiply=data["multiply"],
            rural_key=data["rural_key"],
            rural_tag=data["rural"],
            urban_tag=data["urban"],
        )
        # NOTE(review): unlike `run`, this path has no nominal/real suffix, so
        # consecutive runs overwrite each other. Kept as-is for compatibility.
        lsms.write_processed(f"../data/lsms/processed/TZA_{year}.csv")
    finally:
        # Always remove the scratch file, even when processing fails.
        os.remove(name)
    return lsms.processed


Load the json file which contains the rules for processing. Have a look at the Readme.md in the `data/LSMS` folder to understand the structure of the file. It can be extended easily.

In [4]:
# Per-country / per-year processing rules; read by the helper functions below
# through the module-level name `data`.
# The encoding is pinned because JSON is UTF-8, while the default of `open`
# depends on the platform locale.
with open("../data/lsms/country_keys.json", "r", encoding="utf-8") as f:
    data = json.load(f)

It's convenient to have one large file with all countries included. So we will also save it.

In [5]:
def run(path: str, nominal: bool = True):
    """
    Process every survey listed in `data` and write one combined csv.

    Each country/year entry is processed with `LSMS` and written to
    `../data/lsms/processed/{country}_{year}_{nominal|real}.csv`. Entries
    flagged as special are handled by `tza` when they are TZA / "2014"; any
    other special entry goes through the regular path. All processed frames
    are then concatenated and written to `path`.

    Args:
        path (str): Path to write complete dataframe
        nominal (bool): Set mode for nominal or real consumption

    """
    # Both values depend only on `nominal`, so compute them once.
    ppp = 1 if nominal else -1
    ending = "nominal" if nominal else "real"

    # Collect the frames and concatenate once at the end. This avoids
    # re-copying the growing frame on every iteration and avoids concatenating
    # with an empty DataFrame, which newer pandas versions warn about.
    frames = []
    for country in tqdm(data):
        for year, cur in data[country].items():
            if cur["special"] and country == "TZA" and year == "2014":
                frames.append(tza(cur, year, ppp))
                continue

            lsms = LSMS(
                country,
                year,
                cons_path=f"../{cur['cons_path']}",
                hh_path=f"../{cur['hh_path']}",
                ppp=ppp,
            )
            lsms.read_data()
            lsms.process_survey(
                cons_key=cur["cons_key"],
                hhsize_key=cur["hhsize_key"],
                lat_key=cur["lat_key"],
                lon_key=cur["lon_key"],
                hhid_key=cur["hhid_key"],
                rural_key=cur["rural_key"],
                rural_tag=cur["rural"],
                urban_tag=cur["urban"],
                multiply=cur["multiply"],
            )
            lsms.write_processed(f"../data/lsms/processed/{country}_{year}_{ending}.csv")
            frames.append(lsms.processed)

    # pd.concat raises on an empty list; fall back to an empty frame so an
    # empty rules file still produces an (empty) output file as before.
    master_df: pd.DataFrame = pd.concat(frames) if frames else pd.DataFrame()
    master_df.to_csv(path, index=False)

In [6]:
def runCountry(country:str, year:str, nominal: bool = True):
    """
    Process a single country/year survey and write its processed csv.

    The rules are looked up in the module-level `data` and the result is
    written to `../data/lsms/processed/{country}_{year}_{nominal|real}.csv`.
    Nothing is returned.

    Args:
        country (str): Country key in `data`
        year (str): Year key in `data[country]`
        nominal (bool): Set mode for nominal or real consumption

    """
    # ppp and the file-name suffix are both derived from the mode.
    ppp, ending = (1, "nominal") if nominal else (-1, "real")

    entry = data[country][year]
    survey = LSMS(
        country,
        year,
        cons_path=f"../{entry['cons_path']}",
        hh_path=f"../{entry['hh_path']}",
        ppp=ppp,
    )
    survey.read_data()
    survey.process_survey(
        cons_key=entry["cons_key"],
        hhsize_key=entry["hhsize_key"],
        lat_key=entry["lat_key"],
        lon_key=entry["lon_key"],
        hhid_key=entry["hhid_key"],
        rural_key=entry["rural_key"],
        rural_tag=entry["rural"],
        urban_tag=entry["urban"],
        multiply=entry["multiply"],
    )

    out_path = f"../data/lsms/processed/{country}_{year}_{ending}.csv"
    survey.write_processed(out_path)

In [7]:
run("../data/lsms/processed/_all_nominal.csv")
run("../data/lsms/processed/_all_real.csv", False)

  0%|          | 0/6 [00:00<?, ?it/s]

sav
sav


100%|██████████| 6/6 [00:02<00:00,  2.80it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

sav
sav


100%|██████████| 6/6 [00:05<00:00,  1.01it/s]
