# Creating Historical Database

In [1]:
import os
import tqdm
import zipfile
import numpy as np
import pandas as pd

# speeds up pandas
# reference: https://towardsdatascience.com/one-word-of-code-to-stop-using-pandas-so-slowly-793e0a81343c
import swifter

from collections import defaultdict

In [2]:
# Folder locations, relative to the notebook's working directory:
# `datapath` holds the zipped input files, `unzip_path` receives the extracted files.
datapath, unzip_path = './data', './unzipped_data'

<br>

## Getting layout info

In [3]:
# Both layout files share the same format (';'-separated, UTF-8 with BOM, first column
# used as the index), so they are loaded with one expression.
header, layout = (
    pd.read_csv(csv_name, sep=';', encoding='utf-8-sig', index_col=0)
    for csv_name in ('layout_header.csv', 'layout_table.csv')
)

### Preparing info columns

In [4]:
# Short name for each column: the part of NAME that precedes the first ' -'.
layout['PREFIX'] = layout['NAME'].map(lambda full_name: str(full_name).partition(' -')[0])

<br>

## Importing historical data

### Unzipping files

In [5]:
# os.listdir returns entries in arbitrary order; sort them so every run
# processes the archives in the same sequence.
zipNames = sorted(os.listdir(datapath))

In [6]:
# Extract every zip archive found in `datapath` into `unzip_path`.
for file in zipNames:
    archive_path = os.path.join(datapath, file)

    # The folder may hold entries that are not archives (sub-folders, hidden
    # files, ...); skip them instead of failing with BadZipFile.
    if not zipfile.is_zipfile(archive_path):
        continue

    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(unzip_path)

In [28]:
def get_values(string_row, layout, prefix):
    '''
    Extracts one fixed-width field from a line of the historical data file.
    
    Arguments:
        - string_row (str): one raw line of the historical data text file
        - layout (pandas.DataFrame): layout table; must provide the columns 'PREFIX'
            (short field name), 'INIT' (1-based start position) and 'END'
            (1-based end position, inclusive)
        - prefix (str): short name (PREFIX) of the field to extract
    
    Returns:
        str: the characters of string_row between positions INIT and END (inclusive).
            If the line is shorter than END, the result is truncated accordingly.
    
    Raises:
        IndexError: if prefix is not listed in layout['PREFIX'].
    '''
    field_info = layout.loc[layout['PREFIX'] == prefix, ['INIT', 'END']]
    if field_info.empty:
        raise IndexError(f"prefix {prefix!r} not found in layout['PREFIX']")
    
    # If a prefix appears more than once, the first matching layout row is used.
    first_pos, last_pos = field_info.iloc[0]
    
    # Positions may come back as floats (e.g. when the CSV column has missing values);
    # slice bounds must be plain integers.
    init = int(first_pos) - 1        # 1-based position -> 0-based slice start
    end = int(last_pos)              # inclusive 1-based end == exclusive 0-based slice end
    
    return string_row[init:end]



def get_multiple_values(string_row, layout, prefixes="all", index=0):
    '''
    Gets multiple values from a row of historical data table (according to layout).
    
    Arguments:
        - string_row (str): one raw line of the historical data text file
        - layout (pandas.DataFrame): layout table (see get_values for the required columns)
        - prefixes (str or iterable of str): fields to extract. "all" (default) extracts
            every field listed in layout['PREFIX']; any other single string is treated
            as one field name; otherwise an iterable of field names (list, Series, ...)
        - index (hashable): index label given to the single row of the result
    
    Returns:
        pandas.DataFrame: a one-row dataframe with one column per requested prefix.
    '''
    # Only compare against "all" when prefixes really is a string: comparing a
    # Series/array with == is elementwise and cannot be used as an `if` condition.
    if isinstance(prefixes, str):
        prefixes = layout["PREFIX"] if prefixes == "all" else [prefixes]
    
    row_values = {
        prefix: get_values(string_row, layout, prefix)
        for prefix in prefixes
    }
    
    return pd.DataFrame(row_values, index=[index])

### Reading lines as textual data

<font color="red">**Note:** the code below works, but parsing the file line by line in Python is extremely slow (about 1h30min to iterate over a whole file).</font>

In [29]:
# os.listdir returns entries in arbitrary order; sort them so that positional
# access such as unzipNames[0] selects the same file on every run.
unzipNames = sorted(os.listdir(unzip_path))

In [35]:
# Parse the first lines of the first extracted file into a dataframe
# (one row per line, one column per layout field).
sample_size = 100   # just to get a sample of working code

row_frames = []     # one single-row dataframe per parsed line
counter = 0         # number of lines parsed so far (stays 0 if the file is empty)

# NOTE(review): no explicit encoding is passed to open(), so the platform default
# is used; confirm it matches the encoding of the data files.
with open(os.path.join(unzip_path, unzipNames[0])) as file:
    for counter, line in enumerate(tqdm.tqdm(file), start=1):
        row_frames.append(get_multiple_values(line, layout, index=counter - 1))

        if counter == sample_size:
            break

# Concatenate once at the end: calling pd.concat inside the loop copies the
# whole accumulated dataframe on every iteration (quadratic cost).
large_df = pd.concat(row_frames) if row_frames else pd.DataFrame()

99it [00:02, 34.12it/s]


In [36]:
# Row 0 comes from the file's header line (TIPREG '00'). It does not follow the
# layout applied above, so its fields are misaligned and it must be treated separately.
large_df

Unnamed: 0,TIPREG,DATA DO PREGÃO,CODBDI,CODNEG,TPMERC,NOMRES,ESPECI,PRAZOT,MODREF,PREABE,...,TOTNEG,QUATOT,VOLTOT,PREEXE,INDOPC,DATVEN,FATCOT,PTOEXE,CODISI,DISMES
0,00,COTAHIST,.2,010BOVESPA 2,010,1230,,,,,...,,,,,,,,,,
1,01,20100104,02,ABCB4,010,ABC BRASIL,PN EJ N2,,R$,0000000001200,...,00421,000000000000205500,000000000253269800,0000000000000,0,99991231,0000001,0000000000000,BRABCBACNPR4,109
2,01,20100104,96,ABCB4F,020,ABC BRASIL,PN EJ N2,,R$,0000000001210,...,00017,000000000000000423,000000000000520805,0000000000000,0,99991231,0000001,0000000000000,BRABCBACNPR4,109
3,01,20100104,02,ABNB3,010,ABNOTE,ON ED NM,,R$,0000000001897,...,00240,000000000000289400,000000000553926000,0000000000000,0,99991231,0000001,0000000000000,BRABNBACNOR4,114
4,01,20100104,96,ABNB3F,020,ABNOTE,ON ED NM,,R$,0000000001895,...,00004,000000000000000033,000000000000063159,0000000000000,0,99991231,0000001,0000000000000,BRABNBACNOR4,114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,01,20100104,62,BICB4T,030,BICBANCO,PN EJ N1,016,R$,0000000001230,...,00004,000000000000001600,000000000001968697,0000000000000,0,99991231,0000001,0000000000000,BRBICBACNPR7,110
96,01,20100104,62,BICB4T,030,BICBANCO,PN EJ N1,030,R$,0000000001229,...,00020,000000000000015400,000000000018952163,0000000000000,0,99991231,0000001,0000000000000,BRBICBACNPR7,110
97,01,20100104,10,BIOM10,010,BIOMM,PN REC,,R$,0000000000270,...,00002,000000000000002000,000000000000540000,0000000000000,0,99991231,0000001,0000000000000,BRBIOMR01PR4,100
98,01,20100104,96,BIOM3F,020,BIOMM,ON,,R$,0000000000290,...,00002,000000000000000033,000000000000009570,0000000000000,0,99991231,0000001,0000000000000,BRBIOMACNOR2,101


**end**