In [3]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.append('../../src')

from utils.preprocessing import check_negative, recursive_agg

# Notebook-wide plotting defaults: white-grid seaborn style and a
# (12, 8) default figure size for every figure created below.
sns.set(style="whitegrid", rc={"figure.figsize": (12, 8)})

In [4]:
# Balance-sheet values with columns ruc, codigo and valor.
# ruc and codigo are forced to str so they are handled as text identifiers.
df = pd.read_csv(
    '../../data/processed/balances2022.csv',
    dtype={'ruc': str, 'codigo': str, 'valor': float},
)

# Account catalogue: maps each codigo to its cuenta (account name).
ctns = pd.read_csv(
    '../../data/processed/balances2022_meta.csv',
    dtype={'codigo': str, 'cuenta': str},
)

In [5]:
df.dtypes

ruc        object
codigo     object
valor     float64
dtype: object

In [4]:
# Show the account catalogue first, then the balances table.
display(ctns, df)

Unnamed: 0,codigo,cuenta
0,1,ACTIVO
1,101,ACTIVO CORRIENTE
2,10101,EFECTIVO Y EQUIVALENTES DE EFECTIVO
3,1010101,CAJA
4,1010102,INSTITUCIONES FINANCIERAS PÚBLICAS
...,...,...
617,80008,IMPUESTO SOBRE LAS GANANCIAS RELATIVO A OTRO R...
618,80009,OTROS (DETALLAR EN NOTAS)
619,801,RESULTADO INTEGRAL TOTAL DEL AÑO
620,80101,PROPIETARIOS DE LA CONTROLADORA


Unnamed: 0,ruc,codigo,valor
0,190000222001,1,1265640.82
1,190000222001,101,1231938.51
2,190000222001,10101,209109.71
3,190000222001,1010101,125644.46
4,190000222001,1010102,0.00
...,...,...,...
61607851,9999999999999,80008,0.00
61607852,9999999999999,80009,0.00
61607853,9999999999999,801,0.00
61607854,9999999999999,80101,0.00


**Data Quality Checks**

Quality Checks Pipeline

In [5]:
# Build the quality-checked balance sheet for a single company.
# 1790016919001 is the ruc of Corporacion Favorita.
df_favorita = (df
 # keep only the rows reported by that company
 .query('ruc=="1790016919001"')
 # attach the account name (cuenta) to every account code (codigo)
 .merge(
     ctns, 
     how='inner', 
     on='codigo'
 )
 # check that the subtracting accounts are registered with negative numbers
 # (check_negative is a project helper; its exact behavior is not shown here)
 .pipe(check_negative)
 # check that immediate children accounts add up to the value of the larger accounts
 # NOTE(review): this step overwrites valor with recursive_agg(df_, codigo) for
 # every codigo rather than only asserting -- confirm that is the intent.
 .assign(valor=lambda df_: [recursive_agg(df_, v) 
                            for v in df_.codigo.values])
)

In [6]:
df_favorita

Unnamed: 0,ruc,codigo,valor,cuenta
0,1790016919001,1,2.480404e+09,ACTIVO
1,1790016919001,101,7.016353e+08,ACTIVO CORRIENTE
2,1790016919001,10101,7.283312e+06,EFECTIVO Y EQUIVALENTES DE EFECTIVO
3,1790016919001,1010101,1.342367e+06,CAJA
4,1790016919001,1010102,0.000000e+00,INSTITUCIONES FINANCIERAS PÚBLICAS
...,...,...,...,...
617,1790016919001,80008,0.000000e+00,IMPUESTO SOBRE LAS GANANCIAS RELATIVO A OTRO R...
618,1790016919001,80009,0.000000e+00,OTROS (DETALLAR EN NOTAS)
619,1790016919001,801,0.000000e+00,RESULTADO INTEGRAL TOTAL DEL AÑO
620,1790016919001,80101,0.000000e+00,PROPIETARIOS DE LA CONTROLADORA


**Aggregations and Visualization**

Aggregations and Visualization Pipeline

In [8]:
# Raw load of the ids file for 2022.
# NOTE(review): the "balaces" spelling is kept as-is -- presumably it matches
# the file name on disk; confirm. No dtype is given here, so ruc is left to
# pandas' type inference; read_ciiu() further down casts it to str.
ciiu = pd.read_csv("../../data/processed/balaces2022_ids.csv")

In [8]:
# Rows of df_favorita whose ruc is 1790016919001.
df_favorita.query('ruc == "1790016919001"')

Unnamed: 0,ruc,codigo,valor,cuenta
0,1790016919001,1,2.480404e+09,ACTIVO
1,1790016919001,101,7.016353e+08,ACTIVO CORRIENTE
2,1790016919001,10101,7.283312e+06,EFECTIVO Y EQUIVALENTES DE EFECTIVO
3,1790016919001,1010101,1.342367e+06,CAJA
4,1790016919001,1010102,0.000000e+00,INSTITUCIONES FINANCIERAS PÚBLICAS
...,...,...,...,...
617,1790016919001,80008,0.000000e+00,IMPUESTO SOBRE LAS GANANCIAS RELATIVO A OTRO R...
618,1790016919001,80009,0.000000e+00,OTROS (DETALLAR EN NOTAS)
619,1790016919001,801,0.000000e+00,RESULTADO INTEGRAL TOTAL DEL AÑO
620,1790016919001,80101,0.000000e+00,PROPIETARIOS DE LA CONTROLADORA


**Calling Industry and Other Companies**

In [2]:
df.dtypes

NameError: name 'df' is not defined

In [4]:
def read_ciiu(year):
    """Load the ruc -> ciiu lookup table for one reporting year.

    Reads ``../../data/processed/balaces{year}_ids.csv`` (relative to the
    current working directory) and keeps only the ``ruc`` and ``ciiu`` columns.

    Parameters
    ----------
    year
        Value interpolated into the file name, e.g. ``2022``.

    Returns
    -------
    pd.DataFrame
        Two columns, ``ruc`` and ``ciiu``, with ``ruc`` cast to ``str``.
    """
    ids_path = f"../../data/processed/balaces{year}_ids.csv"
    lookup = pd.read_csv(ids_path)
    # Turn ruc into str (the cast happens after pandas has parsed the column).
    return lookup[["ruc", "ciiu"]].assign(
        ruc=lambda frame: frame["ruc"].astype(str)
    )
ciiu = read_ciiu(2022)
ciiu.dtypes

ruc     object
ciiu    object
dtype: object

In [6]:
from typing import Union

def call_industry(
    mother_df: pd.DataFrame,
    ciiu_df: pd.DataFrame,
    ruc: Union[str, int],
) -> pd.DataFrame:
    """Return the rows of ``mother_df`` for every company in ``ruc``'s industry.

    The industry of ``ruc`` is looked up in ``ciiu_df``; all companies sharing
    that ``ciiu`` value are then selected from ``mother_df``.

    Parameters
    ----------
    mother_df
        Frame with a ``ruc`` column to filter. It is not modified.
    ciiu_df
        Lookup frame with ``ruc`` and ``ciiu`` columns.
    ruc
        Identifier of the reference company; compared as ``str(ruc)``.

    Returns
    -------
    pd.DataFrame
        The matching rows of ``mother_df`` indexed by ``ruc``, without the
        ``ciiu`` column.

    Raises
    ------
    IndexError
        If ``ruc`` has no row in ``ciiu_df``.
    """
    ruc_key = str(ruc)

    # Boolean masks instead of f-string queries: values containing quotes no
    # longer produce a broken query expression.
    own_ciiu = ciiu_df.loc[ciiu_df["ruc"] == ruc_key, "ciiu"]
    if own_ciiu.empty:
        # Same exception type the bare ``.values[0]`` used to raise, but with
        # a message that names the missing company.
        raise IndexError(f"ruc {ruc_key!r} not found in ciiu_df")
    industry_code = own_ciiu.iloc[0]

    # Narrow the lookup to the target industry BEFORE joining, so the inner
    # join only touches companies of that industry instead of first joining
    # the whole of mother_df and discarding most of it afterwards.
    industry_ids = ciiu_df.loc[ciiu_df["ciiu"] == industry_code]

    return (
        mother_df
        .merge(right=industry_ids, how="inner", on="ruc")
        .set_index("ruc")
        .drop(columns="ciiu")
    )

In [13]:
ciiu = read_ciiu(2022)

In [14]:
# All balance rows for companies in the same industry as ruc 1790016919001.
poyo = call_industry(
    mother_df=df,
    ciiu_df=ciiu,
    ruc="1790016919001",
)

In [17]:
df.index

RangeIndex(start=0, stop=61607856, step=1)

In [20]:
# Distinct ruc values in the full table, then in the industry subset
# (poyo is indexed by ruc).
display(len(set(df["ruc"])))
display(len(set(poyo.index)))

99048

591

In [1]:
def call_company(mother_df: pd.DataFrame, ruc: str) -> pd.DataFrame:
    """Return the rows of ``mother_df`` that belong to one company.

    Parameters
    ----------
    mother_df
        Frame in which ``ruc`` is either a column or the name of the index
        (``DataFrame.query`` resolves both).
    ruc
        Company identifier; compared as ``str(ruc)``.

    Returns
    -------
    pd.DataFrame
        The matching rows; empty if the company is not present.
    """
    ruc_key = str(ruc)
    # Reference the value with @ instead of pasting it into the expression,
    # so identifiers containing quotes cannot break the query string.
    return mother_df.query("ruc == @ruc_key")

NameError: name 'pd' is not defined

In [108]:
# Rows of the industry subset for ruc 1790016919001.
# The helper defined in this notebook is call_company; get_company is not
# defined anywhere here and only resolved through stale kernel state.
call_company(poyo, "1790016919001")

Unnamed: 0_level_0,codigo,valor
ruc,Unnamed: 1_level_1,Unnamed: 2_level_1
1790016919001,1,2.480404e+09
1790016919001,101,7.016353e+08
1790016919001,10101,7.283312e+06
1790016919001,1010101,1.342367e+06
1790016919001,1010102,0.000000e+00
...,...,...
1790016919001,80008,0.000000e+00
1790016919001,80009,0.000000e+00
1790016919001,801,0.000000e+00
1790016919001,80101,0.000000e+00
