# Process Surveys
We need to process the raw data so that we can use it to scrape images and as a base for our models. From the LSMS surveys we need two files: one that contains the geovariables (lat and lon of the cluster) and one that contains the consumption. Sometimes it is a bit tricky to get the data, since the two files are linked through keys that live in other files.

In [None]:
# Mount Google Drive at /content/drive so the sources and data stored under MyDrive are reachable
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the World Bank API client
!pip install world-bank-data

In [None]:
cd /content/drive/MyDrive/src

/content/drive/MyDrive/src


In [None]:
from lib.lsms import LSMS
from tqdm import tqdm
import json
import os
import pandas as pd

from typing import List, Set, Dict

In [None]:
# Select the required continent (only adapted to africa so far)
continent: str = 'africa'

The LSMS surveys are downloaded as zip files. To avoid extracting them manually, we prepared this function.

In [None]:
import zipfile

def unzip(path: str) -> None:
    """Extract every zipped LSMS survey below ``<path>/raw`` and delete the archives.

    The expected layout is ``<path>/raw/<country>/<year>/<archive>.zip``.
    Each archive is extracted into a folder next to it that is named after
    the archive without its extension, and the archive is removed once the
    extraction has succeeded. All archives of a year folder are processed,
    not only the first one.

    The working directory is left untouched; every path is built from
    ``path`` directly, so both absolute and relative paths work.

    Args:
        path (str): directory where to find the countries (the folder that
            contains ``raw/``), with or without a trailing separator

    """
    raw_dir = os.path.join(path, 'raw')

    for country in os.listdir(raw_dir):  # Iterate through all countries
        country_dir = os.path.join(raw_dir, country)
        if not os.path.isdir(country_dir):
            continue  # stray file next to the country folders

        for year in os.listdir(country_dir):  # Iterate through all years
            year_dir = os.path.join(country_dir, year)
            if not os.path.isdir(year_dir):
                continue  # stray file next to the year folders

            for item in os.listdir(year_dir):  # Iterate through all items in the folder
                if not item.lower().endswith('.zip'):
                    continue

                archive = os.path.join(year_dir, item)
                target = os.path.join(year_dir, item[:-4])  # strip the 4-char extension

                # The context manager closes the archive even if extraction fails
                with zipfile.ZipFile(archive) as zip_ref:
                    zip_ref.extractall(target)

                # Only reached after a successful extraction
                os.remove(archive)

# Unzip all the zip files in the folder
# NOTE(review): the module-level name `dir` shadows the builtin `dir()` for the rest of the notebook
dir: str = f'/content/drive/MyDrive/data/continents/{continent}/lsms/'
unzip(dir)

In [None]:
def tza(data: Dict, year: str, ppp: int) -> pd.DataFrame:
    """Process the Tanzania survey, whose cluster coordinates live in a separate file.

    The household file is joined with the cluster file on ``clusterid``
    (the link is provided through `hh_sec_a.csv`), written to a temporary
    csv and handed to `LSMS` as the household file.

    NOTE(review): this function was documented for the 2015 survey, while
    `run` dispatches it for year "2014" - confirm which one is intended.
    NOTE(review): the output file name carries no nominal/real suffix, so a
    nominal and a real run write to the same `TZA_{year}.csv`.

    Args:
        data (Dict): json entry of the current data
        year (str): current year
        ppp (int): ppp, for nominal and real cons; forwarded to `LSMS`

    Returns:
        pd.DataFrame: processed dataframe

    """
    clusters = pd.read_csv(f"../{data['cluster_path']}")
    households = pd.read_csv(f"../{data['hh_path']}")
    merged = households.merge(clusters, on=["clusterid"])
    name = "TZA-Areallyrandomfile39493208943.csv"  # tmp file

    try:
        merged.to_csv(name)
        # ppp must be forwarded, otherwise the real-consumption run silently
        # falls back to whatever default LSMS uses
        lsms = LSMS(
            "TZA", year, cons_path=f"../{data['cons_path']}", hh_path=name, ppp=ppp)
        lsms.read_data()
        lsms.process_survey(
            cons_key=data["cons_key"],
            hhsize_key=data["hhsize_key"],
            lat_key=data["lat_key"],
            lon_key=data["lon_key"],
            hhid_key=data["hhid_key"],
            multiply=data["multiply"],
            rural_key=data["rural_key"],
            rural_tag=data["rural"],
            urban_tag=data["urban"],
        )
        lsms.write_processed(f"../data/continents/{continent}/lsms/processed/TZA_{year}.csv")
    finally:
        # Delete the tmp file even when processing failed
        if os.path.exists(name):
            os.remove(name)

    return lsms.processed

In [None]:
from geopy.geocoders import Nominatim
def uga(path: str) -> None:
    """Add district coordinates to the Uganda 2016 survey and save the result.

    The district names are read from ``districts.json`` (code -> name),
    geocoded through geopy's Nominatim, merged into ``pov2015_16.csv`` on
    the ``district`` column and written to ``pov_mod2015_16.csv`` with the
    coordinates stored in the columns ``lat`` and ``long``.

    NOTE(review): the district codes come from JSON keys and are therefore
    strings; confirm that the ``district`` column of ``pov2015_16.csv`` can
    be merged with them.

    Args:
        path (str): path to Uganda 2016 folder (with trailing separator)

    Raises:
        ValueError: if a district has no coordinates after geocoding and
            after the manual overrides have been applied

    """

    # Creating a df to be used by the API
    with open(path + "districts.json", "r") as f:
        map_ = json.load(f)
    df = pd.DataFrame.from_dict(map_, orient='index')
    df.columns = ['district']
    # Drop empty entries first: prefixing a missing name would raise
    df = df.dropna()
    df['district'] = df['district'].apply(lambda name: "Uganda, " + name)

    # Instantiating geolocator:
    geolocator = Nominatim(user_agent="https")

    def _locate(query):
        """Return (latitude, longitude) of `query`, or NaNs if nothing was found."""
        hit = geolocator.geocode(query)
        if hit is None:
            return float("nan"), float("nan")
        return hit.latitude, hit.longitude

    # One request per district; the result feeds both coordinate columns
    coords = [_locate(query) for query in df['district']]
    df['Latitude'] = [lat for lat, _ in coords]
    df['Longitude'] = [lon for _, lon in coords]

    # Adding manually two corner cases not solved by the API (source Wikipedia)
    df.loc["109"] = {'district': 'Uganda, Nakasangola', 'Latitude': 1.3150, 'Longitude' : 32.4650 }
    df.loc["411"] = {'district': 'Uganda, Mbarara', 'Latitude': -0.8758, 'Longitude': 30.2592}

    # Fail loudly instead of writing rows without coordinates
    unresolved = df[df[['Latitude', 'Longitude']].isna().any(axis=1)]
    if not unresolved.empty:
        raise ValueError(
            "Could not geocode: " + ", ".join(unresolved['district'].astype(str)))

    # Renaming columns: the index (district code) becomes the merge key
    df = df.reset_index()
    df = df.drop(['district'], axis=1).rename(columns={'index': 'district'})

    # Loading Uganda data
    uganda_2016 = pd.read_csv(path + "UGA_2015_UNPS_v02_M_CSV/pov2015_16.csv")

    # Merging df with uganda data and renaming the columns so that match JSON file keys' fields
    uganda_2016 = uganda_2016.merge(df, on='district')
    uganda_2016 = uganda_2016.rename(columns={'Latitude': 'lat', 'Longitude': 'long'})

    # Save the new dataframe as csv
    uganda_2016.to_csv(path + "UGA_2015_UNPS_v02_M_CSV/pov_mod2015_16.csv", index=False)

Load the JSON file that contains the rules for processing. Have a look at the Readme.md in the `data/LSMS` folder to understand the structure of the file. It can be extended easily.

In [None]:
# Load the per-country, per-year processing rules; `run` reads them from the module-level `data`
with open(f"../data/continents/{continent}/lsms/country_keys.json", "r") as f:
    data = json.load(f)

In [None]:
!pip install pyreadstat

It's convenient to have one large file with all countries included. So we will also save it.

In [None]:
def run(path: str, nominal: bool = True) -> None:
    """
    Helper function to get LSMS.

    Processes every country/year entry of the module-level `data`, writes
    one processed csv per survey and finally one csv that contains all
    surveys.

    Args:
        path (str): Path to write complete dataframe
        nominal (bool): Set mode for nominal or real consumption

    """

    # Both values only depend on the mode, so compute them once
    ppp: int = 1 if nominal else -1
    ending: str = "nominal" if nominal else "real"

    # Collect the frames and concatenate once at the end instead of
    # re-copying the growing master frame for every survey
    frames: List[pd.DataFrame] = []

    for country, surveys in data.items():
        for year, cur in surveys.items():
            if cur["special"]:
                if country == "TZA" and year == "2014":
                    # Tanzania is processed (and written) by its own helper
                    frames.append(tza(cur, year, ppp))
                    continue
                if country == "UGA" and year == "2016":
                    # Uganda only needs a preparation step, then takes the generic path
                    uga(f"../data/continents/{continent}/lsms/raw/Uganda/2016/")

            lsms = LSMS(country, year, cons_path=f"../{cur['cons_path']}", hh_path=f"../{cur['hh_path']}", ppp=ppp)
            lsms.read_data()
            lsms.process_survey(
                cons_key=cur["cons_key"],
                hhsize_key=cur["hhsize_key"],
                lat_key=cur["lat_key"],
                lon_key=cur["lon_key"],
                hhid_key=cur["hhid_key"],
                rural_key=cur["rural_key"],
                rural_tag=cur["rural"],
                urban_tag=cur["urban"],
                multiply=cur["multiply"],
            )

            lsms.write_processed(f"../data/continents/{continent}/lsms/processed/{country}_{year}_{ending}.csv")
            frames.append(lsms.processed)

    # pd.concat raises on an empty list, so keep the empty-frame result for no surveys
    master_df: pd.DataFrame = pd.concat(frames) if frames else pd.DataFrame()

    # Save to csv
    master_df.to_csv(path, index=False)

In [None]:
# Build the combined file twice: nominal consumption first (the default mode), then real consumption
run(f"../data/continents/{continent}/lsms/processed/_all_nominal.csv")
run(f"../data/continents/{continent}/lsms/processed/_all_real.csv", False)