In [3]:
import pandas as pd

import sys
sys.path.append('../../src')

from utils.preprocessing import check_negative, recursive_agg

In [4]:
# Balances for 2022: identifiers (ruc, codigo) are parsed as text, valor as float.
df = pd.read_csv(
    '../../data/processed/balances2022.csv',
    dtype=dict(ruc=str, codigo=str, valor=float),
)

# Account metadata: maps each account code (codigo) to its name (cuenta).
ctns = pd.read_csv(
    '../../data/processed/balances2022_meta.csv',
    dtype=dict(codigo=str, cuenta=str),
)

**Data Quality Checks**

Quality Checks Pipeline

In [5]:
# Corporacion Favorita is identified by RUC 1790016919001.
df_favorita = (
    df
    .loc[df["ruc"] == "1790016919001"]
    # attach the account name (cuenta) to every account code
    .merge(ctns, how="inner", on="codigo")
    # per the original note: checks that subtracting accounts are registered
    # with negative numbers (behaviour lives in utils.preprocessing — confirm there)
    .pipe(check_negative)
    # replace valor with recursive_agg(...) evaluated for each account code;
    # per the original note this makes immediate child accounts add up to the
    # value of the larger account — TODO confirm against recursive_agg
    .assign(
        valor=lambda frame: [
            recursive_agg(frame, code) for code in frame["codigo"].values
        ]
    )
)

**Aggregations and Visualization**

Aggregations and Visualization Pipeline

In [8]:
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn look for every figure below: white grid, 12x8 figures.
sns.set(style="whitegrid", rc={"figure.figsize": (12, 8)})

In [9]:
# Show the Corporacion Favorita frame produced by the quality-check pipeline.
df_favorita

Unnamed: 0,ruc,codigo,valor,cuenta
0,1790016919001,1,2.480404e+09,ACTIVO
1,1790016919001,101,7.016353e+08,ACTIVO CORRIENTE
2,1790016919001,10101,7.283312e+06,EFECTIVO Y EQUIVALENTES DE EFECTIVO
3,1790016919001,1010101,1.342367e+06,CAJA
4,1790016919001,1010102,0.000000e+00,INSTITUCIONES FINANCIERAS PÚBLICAS
...,...,...,...,...
617,1790016919001,80008,0.000000e+00,IMPUESTO SOBRE LAS GANANCIAS RELATIVO A OTRO R...
618,1790016919001,80009,0.000000e+00,OTROS (DETALLAR EN NOTAS)
619,1790016919001,801,0.000000e+00,RESULTADO INTEGRAL TOTAL DEL AÑO
620,1790016919001,80101,0.000000e+00,PROPIETARIOS DE LA CONTROLADORA


In [10]:
# Row for account 401 (INGRESOS DE ACTIVIDADES ORDINARIAS).
df_favorita[df_favorita["codigo"] == "401"]

Unnamed: 0,ruc,codigo,valor,cuenta
376,1790016919001,401,2355580000.0,INGRESOS DE ACTIVIDADES ORDINARIAS


In [11]:
# Row for account 707 (GANANCIA (PÉRDIDA) NETA DEL PERIODO).
df_favorita[df_favorita["codigo"] == "707"]

Unnamed: 0,ruc,codigo,valor,cuenta
608,1790016919001,707,152679100.0,GANANCIA (PÉRDIDA) NETA DEL PERIODO


In [12]:
# Total valor recorded under account 501.
df_favorita.loc[df_favorita["codigo"] == "501", "valor"].sum()

1712662748.61

In [13]:
# Total valor recorded under account 401.
df_favorita.loc[df_favorita["codigo"] == "401", "valor"].sum()

2355580171.13

In [14]:
# NOTE(review): identical to the previous cell (sum of account 401) — possibly
# meant to query a different account; confirm or remove.
df_favorita.query("codigo=='401'").valor.sum()

2355580171.13

In [1]:
# Difference between account 401 (INGRESOS DE ACTIVIDADES ORDINARIAS) and
# account 501 (presumably cost of sales — confirm against the account metadata).
# Computed from df_favorita instead of pasting the two printed totals, so the
# figure stays correct when the data changes and the notebook re-runs top to bottom.
(
    df_favorita.query("codigo=='401'").valor.sum()
    - df_favorita.query("codigo=='501'").valor.sum()
)

642917422.5200002

In [15]:
# Company identifiers table (one of its columns is the CIIU industry code).
# ruc is parsed as text, matching how `df` is loaded, so the two frames can be
# joined on ruc and no leading zeros are lost.
# NOTE(review): the file name is spelled "balaces" (not "balances"); kept as-is
# because that is the name the rest of the notebook reads successfully.
ciiu = pd.read_csv("../../data/processed/balaces2022_ids.csv", dtype={"ruc": str})

In [16]:
# Same frame filtered by Favorita's RUC (df_favorita was already built from that RUC).
df_favorita.query('ruc == "1790016919001"')

Unnamed: 0,ruc,codigo,valor,cuenta
0,1790016919001,1,2.480404e+09,ACTIVO
1,1790016919001,101,7.016353e+08,ACTIVO CORRIENTE
2,1790016919001,10101,7.283312e+06,EFECTIVO Y EQUIVALENTES DE EFECTIVO
3,1790016919001,1010101,1.342367e+06,CAJA
4,1790016919001,1010102,0.000000e+00,INSTITUCIONES FINANCIERAS PÚBLICAS
...,...,...,...,...
617,1790016919001,80008,0.000000e+00,IMPUESTO SOBRE LAS GANANCIAS RELATIVO A OTRO R...
618,1790016919001,80009,0.000000e+00,OTROS (DETALLAR EN NOTAS)
619,1790016919001,801,0.000000e+00,RESULTADO INTEGRAL TOTAL DEL AÑO
620,1790016919001,80101,0.000000e+00,PROPIETARIOS DE LA CONTROLADORA


**Calling Industry and Other Companies**

In [17]:
# Confirm the column dtypes of the full balances frame (ruc and codigo were
# requested as str when the CSV was loaded).
df.dtypes

ruc        object
codigo     object
valor     float64
dtype: object

In [18]:
def read_ciiu(year, data_dir="../../data/processed"):
    """Load the RUC -> CIIU lookup table for one year.

    Parameters
    ----------
    year : int or str
        Year interpolated into the file name ``balaces{year}_ids.csv``.
    data_dir : str or path-like, default "../../data/processed"
        Directory that contains the ids file.

    Returns
    -------
    pd.DataFrame
        Two columns, ``ruc`` and ``ciiu`` (in that order). ``ruc`` is text.
    """
    ciiu_df = pd.read_csv(
        f"{data_dir}/balaces{year}_ids.csv",
        usecols=["ruc", "ciiu"],
        # Parse ruc as text while reading: casting after a numeric parse drops
        # leading zeros and yields "<digits>.0" when the column has missing values.
        dtype={"ruc": str},
    )
    # usecols keeps the file's column order, so select explicitly to fix the order.
    return ciiu_df[["ruc", "ciiu"]]
# Load the 2022 RUC -> CIIU table and confirm its column dtypes.
ciiu = read_ciiu(year=2022)
ciiu.dtypes

ruc     object
ciiu    object
dtype: object

In [19]:
from typing import Union

def call_industry(mother_df: pd.DataFrame, ciiu_df: pd.DataFrame, ruc: Union[str, int]) -> pd.DataFrame:
    """Return the rows of ``mother_df`` for every company in the same industry as ``ruc``.

    Parameters
    ----------
    mother_df : pd.DataFrame
        Frame with a ``ruc`` column.
    ciiu_df : pd.DataFrame
        Lookup with ``ruc`` and ``ciiu`` columns.
    ruc : str or int
        Company whose industry (its ``ciiu`` value in ``ciiu_df``) selects the peers.
        It is converted to ``str`` before the lookup.

    Returns
    -------
    pd.DataFrame
        Matching rows of ``mother_df``, indexed by ``ruc``, without the ``ciiu`` column.

    Raises
    ------
    IndexError
        If ``ruc`` does not appear in ``ciiu_df``.
    """
    ruc_key = str(ruc)
    # Plain boolean masks instead of f-string queries: values containing quotes
    # can no longer break (or alter) the expression.
    company_ciiu = ciiu_df.loc[ciiu_df["ruc"] == ruc_key, "ciiu"]
    if company_ciiu.empty:
        # Same exception type the bare `.values[0]` lookup raised, now with context.
        raise IndexError(f"ruc {ruc_key!r} not found in ciiu_df")
    ciiu_code = company_ciiu.iloc[0]

    # Narrow the lookup to the industry first so the merge only touches peer
    # companies instead of joining the whole mother_df and filtering afterwards.
    industry_ids = ciiu_df.loc[ciiu_df["ciiu"] == ciiu_code]
    return (
        mother_df
        .merge(right=industry_ids, how="inner", on="ruc")
        .set_index("ruc")
        .drop("ciiu", axis=1)
    )

In [20]:
# NOTE(review): repeats the load done right after read_ciiu was defined;
# redundant when the notebook is run top to bottom.
ciiu = read_ciiu(2022)

In [21]:
# Balances of every company that shares Corporacion Favorita's CIIU code.
poyo = call_industry(
    mother_df=df,
    ciiu_df=ciiu,
    ruc="1790016919001",
)

In [None]:
# Inspect the row index of the full balances frame.
df.index

RangeIndex(start=0, stop=61607856, step=1)

In [None]:
# Number of distinct companies (RUCs): whole dataset vs. Favorita's industry.
# nunique counts directly on the column / index, avoiding the full-frame copy
# made by set_index and the intermediate Python set.
display(df["ruc"].nunique(dropna=False))
display(poyo.index.nunique(dropna=False))

99048

591

In [None]:
def call_company(mother_df: pd.DataFrame, ruc: str) -> pd.DataFrame:
    """Return the rows of ``mother_df`` that belong to a single company.

    Parameters
    ----------
    mother_df : pd.DataFrame
        Frame that exposes ``ruc`` either as a column or as a named index level
        (``DataFrame.query`` resolves both).
    ruc : str
        Company identifier; converted to ``str`` before comparison.

    Returns
    -------
    pd.DataFrame
        The matching rows (empty if the company is absent).
    """
    # Pass the value with @-substitution rather than pasting it into the
    # expression text, so a value containing quotes cannot break the query.
    ruc_value = str(ruc)
    return mother_df.query("ruc == @ruc_value")

NameError: name 'pd' is not defined

In [None]:
# Pull Corporacion Favorita's rows out of its industry frame.
# The helper defined in this notebook is `call_company`; `get_company` is never
# defined and raises NameError on a fresh kernel.
call_company(poyo, "1790016919001")

Unnamed: 0_level_0,codigo,valor
ruc,Unnamed: 1_level_1,Unnamed: 2_level_1
1790016919001,1,2.480404e+09
1790016919001,101,7.016353e+08
1790016919001,10101,7.283312e+06
1790016919001,1010101,1.342367e+06
1790016919001,1010102,0.000000e+00
...,...,...
1790016919001,80008,0.000000e+00
1790016919001,80009,0.000000e+00
1790016919001,801,0.000000e+00
1790016919001,80101,0.000000e+00
