# Historical DBF and DTA to CSV

## Import

In [27]:
import numpy as np
import pandas as pd
import os
import glob

In [3]:
!pip install simpledbf

Collecting simpledbf
  Downloading simpledbf-0.2.6.tar.gz (17 kB)
Building wheels for collected packages: simpledbf
  Building wheel for simpledbf (setup.py): started
  Building wheel for simpledbf (setup.py): finished with status 'done'
  Created wheel for simpledbf: filename=simpledbf-0.2.6-py3-none-any.whl size=13796 sha256=d6bbacdb600519f815a6ce36e89f78006b4fc7b651bcf0775df596a897ec07d9
  Stored in directory: d:\users\dascencr\appdata\local\pip\cache\wheels\7c\4a\85\8d25e6dc4d2968a93a5459ed0153fd6f67041cfece137ca9ce
Successfully built simpledbf
Installing collected packages: simpledbf
Successfully installed simpledbf-0.2.6


In [4]:
from simpledbf import Dbf5

## Test

In [5]:
# Open a single sample DBF table to explore what simpledbf exposes before converting in bulk.
dbf = Dbf5('../files/00_raw_download/2019_ENDES/Modulo569_programas_sociales/PS_VL.dbf', codec='utf-8')

In [6]:
# Record count simpledbf reports for the sample table.
dbf.numrec

9183

In [7]:
# Field descriptors of the sample table (3-tuples; presumably name, type code, field width — confirm against simpledbf docs).
dbf.fields

[('DeletionFlag', 'C', 1),
 ('ID1', 'N', 4),
 ('HHID', 'C', 15),
 ('HVIDX', 'N', 2),
 ('QHCLUSTER', 'N', 4),
 ('QHNUMBER', 'N', 3),
 ('QHHOME', 'N', 2),
 ('PS102_1A', 'N', 2),
 ('PS102_1M', 'N', 2),
 ('PS102_1S', 'N', 1),
 ('PS4', 'N', 1)]

In [8]:
# Print simpledbf's estimate of the RAM needed to process this file.
dbf.mem()

This total process would require more than 0.6481 MB of RAM.


In [9]:
# Write the sample table to a scratch CSV to try out the DBF -> CSV export.
dbf.to_csv('../files/07_tmp/drop.csv')

In [56]:
# Load one sample Stata file through pandas' public top-level reader.
data = pd.read_stata('../files/00_raw_download/2015_ENDES/Modulo66/rec0111.dta')

In [57]:
# Preview the first rows of the frame loaded from the sample Stata file.
data.head()

Unnamed: 0,caseid,v001,v002,v003,v004,v006,v007,v008,v009,v010,...,v160,v161,v166,v167,v168,ml101,v190,v191,v005,hv005x
0,000102701 1,1,27,1,1,6,2015,1386,7.0,1985.0,...,,lpg,no salt,,,,poorest,-124225,157692.0,147527
1,000104301 1,1,43,1,1,6,2015,1386,4.0,1974.0,...,no,lpg,30 ppm,,,,poorer,15786,157692.0,147527
2,000104801 2,1,48,2,1,6,2015,1386,1.0,1980.0,...,no,lpg,7 ppm,,,,middle,31424,723389.0,147527
3,000104801 3,1,48,3,1,6,2015,1386,11.0,1999.0,...,no,lpg,7 ppm,,,,middle,31424,723389.0,147527
4,000105001 3,1,50,3,1,6,2015,1386,8.0,1993.0,...,no,lpg,30 ppm,,,,middle,45019,723389.0,147527


In [59]:
# Write the sample frame to a scratch CSV, omitting the index column.
data.to_csv('../files/07_tmp/drop_ga.csv',index=False)

## Define functions

In [76]:
def dbf_to_csv(path_dbf, path_csv, codec='ISO-8859-1'):
    """Convert one DBF table to a CSV file using simpledbf.

    Parameters
    ----------
    path_dbf : str
        Path of the DBF file to read.
    path_csv : str
        Path of the CSV file to write.
    codec : str, optional
        Character encoding passed to ``Dbf5`` for decoding the table.
        Defaults to 'ISO-8859-1', the value this function previously
        hard-coded, so existing two-argument calls behave as before.
    """
    table = Dbf5(path_dbf, codec=codec)
    table.to_csv(path_csv)

In [77]:
def dta_to_csv(path_dta, path_csv):
    """Convert one Stata ``.dta`` file to a CSV file.

    The file at ``path_dta`` is read into a DataFrame with pandas and
    written to ``path_csv`` without the index column.
    """
    pd.read_stata(path_dta).to_csv(path_csv, index=False)

## Convert all years

In [13]:
# Root of the raw downloads: one sub-folder per survey year holding the source DBF/DTA files.
# NOTE: despite the name, this is the *input* directory, not where CSVs are written.
csv_path = '../files/00_raw_download/'

In [71]:
# Output root for the converted CSV files; the conversion loop recreates each source file's sub-folder path under it.
drop_path = '../files/01_csv/csv_by_year/'

In [14]:
# Year folder names found under the raw-download root, with any extension stripped.
# os.listdir returns entries in arbitrary order, so sort for a reproducible
# processing order; the set drops duplicates that stripping extensions can create.
filenames_mapped = sorted({os.path.splitext(entry)[0] for entry in os.listdir(csv_path)})
filenames_mapped

['2009_ENDES',
 '2010_ENDES',
 '2011_ENDES',
 '2012_ENDES',
 '2013_ENDES',
 '2014_ENDES',
 '2015_ENDES',
 '2016_ENDES',
 '2017_ENDES',
 '2018_ENDES',
 '2019_ENDES']

In [78]:
for year in filenames_mapped:
    # Each year folder is expected to hold a .txt guide listing the data files
    # to convert, one path per line, relative to `csv_path`.
    guides = glob.glob(f'{csv_path}{year}/*.txt')
    if not guides:
        print(f'Skipping {year}: no .txt guide found')
        continue

    # Open the globbed path directly instead of re-building it from a
    # backslash split (which only works on Windows), and use a context
    # manager so the file handle is always closed.
    with open(guides[0], 'r') as guide_file:
        # Normalise Windows separators and drop surrounding quotes.
        content_list = [
            line.strip().replace('\\', '/').replace('"', '')
            for line in guide_file
        ]

    print(f'Transforming {year}')
    for file_t in content_list:
        if not file_t:
            continue  # blank line in the guide

        # Split "year/module/name.ext" into its folder part and file name.
        path, _, filename = file_t.rpartition('/')
        # splitext copes with names that contain several dots or none at all;
        # lower-casing the extension lets ".DBF" and ".dbf" both match.
        base, ext = os.path.splitext(filename)
        ext = ext.lower()
        base = base.upper().replace(' ', '_')

        if ext == '.dbf':
            convert, label = dbf_to_csv, 'DBF'
        elif ext == '.dta':
            convert, label = dta_to_csv, 'DTA'
        else:
            # Previously anything that was not 'dbf' was fed to the Stata reader.
            print(f'{file_t} skipped, unsupported extension')
            continue

        out_dir = f'{drop_path}{path}'
        os.makedirs(out_dir, exist_ok=True)

        convert(f'{csv_path}{file_t}', f'{out_dir}/{base}.csv')
        print(f'{file_t} processed, {label} file')
    

Transforming 2009_ENDES
2009_ENDES/Modulo74/rec44.dbf processed, DBF file
2009_ENDES/Modulo73/rec84dv.dbf processed, DBF file
2009_ENDES/Modulo73/rec83.dbf processed, DBF file
2009_ENDES/Modulo72/rec82.dbf processed, DBF file
2009_ENDES/Modulo72/re758081.dbf processed, DBF file
2009_ENDES/Modulo71/re516171.dbf processed, DBF file
2009_ENDES/Modulo70/rec95.dbf processed, DBF file
2009_ENDES/Modulo70/rec43.dbf processed, DBF file
2009_ENDES/Modulo70/rec42.dbf processed, DBF file
2009_ENDES/Modulo69/rec94.dbf processed, DBF file
2009_ENDES/Modulo69/rec41.dbf processed, DBF file
2009_ENDES/Modulo67/rec21.dbf processed, DBF file
2009_ENDES/Modulo67/re223132.dbf processed, DBF file
2009_ENDES/Modulo66/REC0111.dbf processed, DBF file
2009_ENDES/Modulo66/REC91.dbf processed, DBF file
2009_ENDES/Modulo65/rech23.dbf processed, DBF file
2009_ENDES/Modulo65/rech8.dbf processed, DBF file
2009_ENDES/Modulo64/rech8.dbf processed, DBF file
2009_ENDES/Modulo64/rech4.dbf processed, DBF file
2009_ENDES/M