# Segundo Avance Proyecto ETL

#### Presentado por: Carlos Hidalgo Escobar
 *Database Connection*

In [20]:
import yaml
import psycopg2 
from psycopg2 import sql
from sqlalchemy import create_engine, text
import pandas as pd
import json
import numpy as np

In [3]:
#Carga de la confguracion de la base de datos
def load_config(file_path="config.yaml"):
    with open(file_path, "r") as file:
        return yaml.safe_load(file)

In [4]:
config = load_config()
db_config = config["database"]

# Carga de credenciales
db_user = db_config["user"]
db_password = db_config["password"]
db_host = db_config["host"]
db_port = db_config["port"]
db_name = db_config["name"]

# DB connection
conn = psycopg2.connect(
    dbname="postgres",
    user=db_user,
    password=db_password,
    host=db_host,
    port=db_port
)
conn.autocommit = True

In [40]:
#Visualizamos los datos de la base de datos en un dataframe
engine = create_engine(f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}")

with engine.connect() as conn:
    proye_db_df = pd.read_sql("SELECT * FROM base_proye", conn)
    
proye_db_df

Unnamed: 0,id,objeto_numerico,tipopred,avalpred_vigant,usu_vigant,actividad_vigant,estrato_vigant,area_vigant,terreno_vigant,predial_vigant,...,manzana,tipo_predio,actualizacion,avalpred_vigact,usu_vigact,actividad_vigact,estrato_vigact,area_vigact,terreno_vigact,cartera_vigact
0,1,76001010009080038002,P.V.R.,91742000,6,15,,100,100,100000,...,4,MEJORA,RURAL,96329000,6,15,,100,100,Y
1,2,51000001010200020102,P.V.R.,22873000,6,15,,100,100,16000,...,4,MEJORA,RURAL,24017000,6,15,,100,100,N
2,3,51000001010400010104,P.V.R.,37948000,6,15,,100,100,29000,...,4,MEJORA,RURAL,39845000,6,15,,100,100,N
3,4,51000001010500000105,V.R.,859144000,7,16,,100,100,318000,...,4,NPH,RURAL,902101000,7,16,,100,100,N
4,5,51000001010700000107,P.V.R.,44176000,6,15,,100,100,30000,...,4,NPH,RURAL,46385000,6,15,,100,100,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996,02010003007809080055,CONST.,10113000,1,1,6.0,100,100,122000,...,3,PH,URBANO,13248000,1,1,6.0,100,100,N
99996,99997,02010003007909080055,CONST.,10113000,1,1,6.0,100,100,121000,...,3,PH,URBANO,13248000,1,1,6.0,100,100,N
99997,99998,02010003008009080055,CONST.,10113000,1,1,6.0,100,100,111000,...,3,PH,URBANO,13248000,1,1,6.0,100,100,N
99998,99999,02010004007609020004,CONST.,10113000,1,1,6.0,100,100,65000,...,4,PH,URBANO,13248000,1,1,6.0,100,100,N


#### Limpieza de datos

In [41]:
#Borrar todos los registros nulos en la columnas indicadas
proye_db_df = proye_db_df.dropna(subset=["objeto_numerico", "usu_vigant", "actividad_vigant", "avalpred_vigant", "predial_vigant", "avalpred_vigact"])

#proye_db_df = proye_db_df.drop_duplicates(subset=['objeto_numerico'], keep='first')

proye_db_df

Unnamed: 0,id,objeto_numerico,tipopred,avalpred_vigant,usu_vigant,actividad_vigant,estrato_vigant,area_vigant,terreno_vigant,predial_vigant,...,manzana,tipo_predio,actualizacion,avalpred_vigact,usu_vigact,actividad_vigact,estrato_vigact,area_vigact,terreno_vigact,cartera_vigact
0,1,76001010009080038002,P.V.R.,91742000,6,15,,100,100,100000,...,4,MEJORA,RURAL,96329000,6,15,,100,100,Y
1,2,51000001010200020102,P.V.R.,22873000,6,15,,100,100,16000,...,4,MEJORA,RURAL,24017000,6,15,,100,100,N
2,3,51000001010400010104,P.V.R.,37948000,6,15,,100,100,29000,...,4,MEJORA,RURAL,39845000,6,15,,100,100,N
3,4,51000001010500000105,V.R.,859144000,7,16,,100,100,318000,...,4,NPH,RURAL,902101000,7,16,,100,100,N
4,5,51000001010700000107,P.V.R.,44176000,6,15,,100,100,30000,...,4,NPH,RURAL,46385000,6,15,,100,100,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996,02010003007809080055,CONST.,10113000,1,1,6.0,100,100,122000,...,3,PH,URBANO,13248000,1,1,6.0,100,100,N
99996,99997,02010003007909080055,CONST.,10113000,1,1,6.0,100,100,121000,...,3,PH,URBANO,13248000,1,1,6.0,100,100,N
99997,99998,02010003008009080055,CONST.,10113000,1,1,6.0,100,100,111000,...,3,PH,URBANO,13248000,1,1,6.0,100,100,N
99998,99999,02010004007609020004,CONST.,10113000,1,1,6.0,100,100,65000,...,4,PH,URBANO,13248000,1,1,6.0,100,100,N


In [42]:
# Obtener valores únicos de una columna
valores_unicos = proye_db_df['usu_vigant'].unique()
valores_unicos2 = proye_db_df['actividad_vigant'].unique()

print(valores_unicos)
print(valores_unicos2)

['6' '7' '1' '8' '9' '10' '2' '14' '16' '4' '3' '5']
['15' '16' '1' '17' '18' '19' '3' '40' '25' '2' '24' '23' '31' '43' '42'
 '9' '11' '4' '5' '10' '7' '6' '13']


#### Transformación
*Estandarización y manejo inicial de la data*

In [43]:
#proye_db_df["largo"] = proye_db_df["objeto_numerico"].str.len()
#proye_db_df.drop(["objeto_numerico"])

df_transformed = proye_db_df.copy()

#Eliminar la columna 'tipo_predio' porque no tiene relevancia en los calculos y puede genera confusion con la columna 'tipopred'
df_transformed = df_transformed.drop(columns=['tipo_predio'])

#Eliminar las columnas porque no tiene relevancia en los calculos.
df_transformed = df_transformed.drop(columns=['area_vigant', 'terreno_vigant','area_vigact', 'terreno_vigact', 'manzana'])

#Se agrega un 0 al inicio de cada objeto_numerico por el estandar que lo define (20 digitos).
df_transformed['objeto_numerico'] = df_transformed['objeto_numerico'].apply(lambda x: '0' + x if len(x) == 19 else x)

#Se agrega un 0 al inicio de los usu y actividad definiendo un estandar.
df_transformed[['usu_vigant', 'actividad_vigant', 'usu_vigact', 'actividad_vigact']] = df_transformed[['usu_vigant', 'actividad_vigant', 'usu_vigact', 'actividad_vigact']].map(lambda x: '0' + x if len(x) == 1 else x)

#Se concatenan el uso y la actividad de la vigencia actual.
df_transformed['Uso_Actividad_vigAct'] = df_transformed['usu_vigact'].str.cat(df_transformed['actividad_vigact'])

#Redondear las columnas de los avaluos al multiplo 1000.
df_transformed['avalpred_vigant'] = df_transformed['avalpred_vigant'].round(-3)
df_transformed['avalpred_vigact'] = df_transformed['avalpred_vigact'].round(-3)

#Agregar columna residencial o no residencial
df_transformed['Tipo_res'] = df_transformed.apply(lambda row: 'Residencial' if row['Uso_Actividad_vigAct'] == "0101" else "No Residencial", axis=1)

df_transformed

Unnamed: 0,id,objeto_numerico,tipopred,avalpred_vigant,usu_vigant,actividad_vigant,estrato_vigant,predial_vigant,comuna,barrio,actualizacion,avalpred_vigact,usu_vigact,actividad_vigact,estrato_vigact,cartera_vigact,Uso_Actividad_vigAct,Tipo_res
0,1,76001010009080038002,P.V.R.,91742000,06,15,,100000,B51,0,RURAL,96329000,06,15,,Y,0615,No Residencial
1,2,51000001010200020102,P.V.R.,22873000,06,15,,16000,B51,0,RURAL,24017000,06,15,,N,0615,No Residencial
2,3,51000001010400010104,P.V.R.,37948000,06,15,,29000,B51,0,RURAL,39845000,06,15,,N,0615,No Residencial
3,4,51000001010500000105,V.R.,859144000,07,16,,318000,B51,0,RURAL,902101000,07,16,,N,0716,No Residencial
4,5,51000001010700000107,P.V.R.,44176000,06,15,,30000,B51,0,RURAL,46385000,06,15,,Y,0615,No Residencial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996,02010003007809080055,CONST.,10113000,01,01,6.0,122000,B02,1,URBANO,13248000,01,01,6.0,N,0101,Residencial
99996,99997,02010003007909080055,CONST.,10113000,01,01,6.0,121000,B02,1,URBANO,13248000,01,01,6.0,N,0101,Residencial
99997,99998,02010003008009080055,CONST.,10113000,01,01,6.0,111000,B02,1,URBANO,13248000,01,01,6.0,N,0101,Residencial
99998,99999,02010004007609020004,CONST.,10113000,01,01,6.0,65000,B02,1,URBANO,13248000,01,01,6.0,N,0101,Residencial


In [44]:
#Se carga el json con las tarifas

# Ruta del archivo JSON
ruta_json = "./data/tarifas_predios.json"

# Cargar el JSON como string y luego parsearlo sin alterar los números
with open(ruta_json, "r") as file:
    json_str = file.read()  # Leer como texto
    data = json.loads(json_str, parse_float=str)  # Evita conversión automática a float

# Convertir a DataFrame sin modificar num_tarifa
df_tarifas = pd.DataFrame(data["tarifas_residencial_urbano_rural"])

# Convertir listas a strings para mejor lectura
df_tarifas["uso_actividad_vigact"] = df_tarifas["uso_actividad_vigact"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)
df_tarifas["estrato_vigact"] = df_tarifas["estrato_vigact"].apply(lambda x: ", ".join(map(str, x)) if isinstance(x, list) else str(x))

print(df_tarifas.dtypes)  
df_tarifas

id_tarifa               object
num_tarifa              object
uso_actividad_vigact    object
estrato_vigact          object
dtype: object


Unnamed: 0,id_tarifa,num_tarifa,uso_actividad_vigact,estrato_vigact
0,T1,0.004,"0101, 0615, 1833, 1834",1
1,T2,0.008,"0101, 1935, 2035","2, 3"
2,T3,0.011,"0101, 2137",4
3,T4,0.013,"0101, 2238",5
4,T5,0.014,"0101, 2339, 0918, 1019, 1120, 0940",6
5,T6,0.01,"0101, 0409, 0410, 0411, 0442, 0817, 1423, 1424...",3
6,T7,0.0145,"0202, 0203, 0204, 0306, 0307, 0308, 0512, 0513...",
7,T8,0.01,"1426, 1425",
8,T9,0.016,"0205, 1528, 1732, 1221",
9,T10,0.033,1631,


In [45]:
#Se definen las tarifas que se aplicaran segun el uso_actividad_vigact del predio.
#df_transformed["id_tarifa"] = np.where(df_transformed["id"] == df2["id"], df2["valor"], "otro_valor")

# Realizar un merge para traer la tarifa correspondiente
df_transformed = df_transformed.merge(df_tarifas["id_tarifa"], left_index=True, right_index=True, how="left")
df_transformed = df_transformed.merge(df_tarifas["num_tarifa"], left_index=True, right_index=True, how="left")

# Aplicar la condición
#df_transformed["num_tarifa_merged"] = np.where(df_transformed["Uso_Actividad_vigAct"] == "0101" and df_transformed["estrato_vigact"] == "1", df_transformed["num_tarifa"], np.nan)
#df_transformed["num_tarifa_merged"] = np.where(df_transformed["Uso_Actividad_vigAct"] == "0101", df_transformed["num_tarifa"], np.nan)

df_transformed

Unnamed: 0,id,objeto_numerico,tipopred,avalpred_vigant,usu_vigant,actividad_vigant,estrato_vigant,predial_vigant,comuna,barrio,actualizacion,avalpred_vigact,usu_vigact,actividad_vigact,estrato_vigact,cartera_vigact,Uso_Actividad_vigAct,Tipo_res,id_tarifa,num_tarifa
0,1,76001010009080038002,P.V.R.,91742000,06,15,,100000,B51,0,RURAL,96329000,06,15,,Y,0615,No Residencial,T1,0.004
1,2,51000001010200020102,P.V.R.,22873000,06,15,,16000,B51,0,RURAL,24017000,06,15,,N,0615,No Residencial,T2,0.008
2,3,51000001010400010104,P.V.R.,37948000,06,15,,29000,B51,0,RURAL,39845000,06,15,,N,0615,No Residencial,T3,0.011
3,4,51000001010500000105,V.R.,859144000,07,16,,318000,B51,0,RURAL,902101000,07,16,,N,0716,No Residencial,T4,0.013
4,5,51000001010700000107,P.V.R.,44176000,06,15,,30000,B51,0,RURAL,46385000,06,15,,Y,0615,No Residencial,T5,0.014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996,02010003007809080055,CONST.,10113000,01,01,6.0,122000,B02,1,URBANO,13248000,01,01,6.0,N,0101,Residencial,,
99996,99997,02010003007909080055,CONST.,10113000,01,01,6.0,121000,B02,1,URBANO,13248000,01,01,6.0,N,0101,Residencial,,
99997,99998,02010003008009080055,CONST.,10113000,01,01,6.0,111000,B02,1,URBANO,13248000,01,01,6.0,N,0101,Residencial,,
99998,99999,02010004007609020004,CONST.,10113000,01,01,6.0,65000,B02,1,URBANO,13248000,01,01,6.0,N,0101,Residencial,,
