# Creating Historical Database

In [1]:
import os
import tqdm
import zipfile
import numpy as np
import pandas as pd

# speeds up pandas
# reference: https://towardsdatascience.com/one-word-of-code-to-stop-using-pandas-so-slowly-793e0a81343c
#import swifter

from collections import defaultdict

In [2]:
datapath = './data'
unzip_path = './unzipped_data'

<br>

## Getting layout info

In [3]:
header = pd.read_csv('layout_header.csv', sep=';', encoding='utf-8-sig', index_col=0)
layout = pd.read_csv('layout_table.csv', sep=';', encoding='utf-8-sig', index_col=0)

### preparing info columns

In [4]:
# column with variable names
layout['PREFIX'] = layout['NAME'].apply(lambda x:str(x).split(' -')[0])  # short name for each column

<br>

## Importing historical data

### unzipping files

In [5]:
zipNames = os.listdir(datapath)

In [6]:
# extracting zip files
for file in zipNames:
    with zipfile.ZipFile(f'./data/{file}', 'r') as zip_ref:
        zip_ref.extractall('./unzipped_data')

### Importing into pandas dataframe

In [7]:
unzipNames = os.listdir(unzip_path)

In [8]:
df = pd.read_table(f'./unzipped_data/{unzipNames[0]}')

In [9]:
# df.head()

## parsing layout to database

### header

In [10]:
h = df.columns[0].strip()  # header string (column name of df)

head = defaultdict()
for row in range(header.shape[0]):
    init = header.loc[row, 'INIT'] - 1  # start position
    end = header.loc[row, 'END']        # end position (p.s: Pythonic way of indexing ;)
    
    field = header.loc[row,'NAME']      # field name
    
    head[field] = h[init:end].strip()   # values (also removing spaces)

head = dict(head)                       # changing from defaultdict to regular dict

### data

tried to use swifter in this part, but pandas apply is still too slow.

<font color='red' size='3'>**p.s.:** vectorization with numpy seems to be the best solution <font>

In [11]:
def get_values(df, layout, prefixes="all"):
    '''
    Parses a single-column dataframe to multiple column one according the layout passed.
    
    Arguments:
        - df (pandas.DataFrame): single-column dataframe with all historical data (just read the historical 
            data text file into pandas dataframe with pandas.readtable and parse it entirely)
        - layout (pandas.DataFrame): layout table
        - prefixes (list): list of prefixes for desired columns/fields names
    
    Output:
        Returns a big dictionary in the following format:
            {
                prefix1:[values_for_prefix1],
                prefix2:[values_for_prefix2]
            }
    
    p.s.: to transform it into a dataframe, just pass it inside the command pd.DataFrame().
    '''
    
    old_col = df.columns[0] # old column name
    values_dict = defaultdict()    # defaultdict to store all prefixes values
   
    # defines prefixes list to all available in the layout table (if a list was not passed by the user)
    if prefixes == "all":
        prefixes = layout['PREFIX']
        
    for prefix in tqdm.tqdm(prefixes):
        field_info = layout[layout['PREFIX'] == prefix]

        init = field_info.loc[:,'INIT'].values[0] - 1        # start position
        end = field_info.loc[:,'END'].values[0]              # end position (p.s: Pythonic way of indexing ;)

        values_dict[prefix] = []    # default value is an empty list (the real values will be appended to it)
        for i in range(df.shape[0]):
            values_dict[prefix].append(df[old_col].values[i][init:end])
        
    return dict(values_dict)

In [14]:
data_df = get_values(df, layout)
data_df = pd.DataFrame(data_df)

100%|██████████| 26/26 [00:23<00:00,  1.12it/s]


### exporting data

In [15]:
# metadata
file_name = unzipNames[0]
file_name = file_name.split('.')[0]

with open(f'./output/{file_name}-metadata.txt','w') as f:
    f.write(str(head))

data_df.to_csv(f'./output/{file_name}.csv', sep=';', encoding='utf-8-sig')

**end**