# Proyecto_Predios

## Extract stage

Para dar comienzo a la primera etapa del proceso de ETL, primero se establece la conexión con el servidor de PostgreSQL para crear la base de datos (`proye_etl_db`) que manejará la etapa de Staging y la tabla donde se almacenará la primera carga de los datos (`base_proye`).

 *Database Connection*

In [1]:
import pandas as pd
import psycopg2
import yaml
from psycopg2 import sql
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

In [2]:
def load_config(file_path="config.yaml"):
    """Load the project configuration from a YAML file.

    Args:
        file_path: Path of the YAML file to read. Defaults to "config.yaml".

    Returns:
        The parsed document exactly as returned by ``yaml.safe_load``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding (e.g. cp1252 on Windows), which
    # would garble or reject non-ASCII values such as accented passwords.
    with open(file_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)

In [3]:
# Read the "database" section of the YAML configuration.
config = load_config()
db_config = config["database"]

# Unpack the connection credentials in a single step.
db_user, db_password, db_host, db_port, db_name = (
    db_config[key] for key in ("user", "password", "host", "port", "name")
)

# Connect to the server's "postgres" database (not the project database).
conn = psycopg2.connect(
    host=db_host,
    port=db_port,
    user=db_user,
    password=db_password,
    dbname="postgres",
)
# Run every statement outside an explicit transaction block.
conn.autocommit = True

In [4]:
# NOTE(review): this hard-coded name replaces the value previously read from
# db_config["name"] — confirm which one is meant to be authoritative.
db_name = "proye_etl_db"
try:
    # sql.Identifier quotes the database name safely inside the statement.
    create_db_stmt = sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name))
    with conn.cursor() as cursor:
        cursor.execute(create_db_stmt)
        print(f"Base de datos '{db_name}' creada exitosamente.")
except psycopg2.errors.DuplicateDatabase:
    # The database is already there; report it instead of failing.
    print(f"La base de datos '{db_name}' ya existe.")
finally:
    # The connection is closed whether or not the creation succeeded.
    conn.close()

Base de datos 'proye_etl_db' creada exitosamente.


In [57]:
# Build the URL from its components instead of an f-string so that special
# characters in the credentials (e.g. "@", ":", "/") are escaped correctly.
db_url = URL.create(
    drivername="postgresql",
    username=db_user,
    password=db_password,
    host=db_host,
    port=db_port,
    database=db_name,
)
engine = create_engine(db_url)

# engine.begin() wraps the DDL in a transaction that is committed when the
# block exits normally and rolled back if an exception is raised, so no
# manual commit is needed.
with engine.begin() as connection:
    connection.execute(text("""
        CREATE TABLE IF NOT EXISTS base_proye (
            id SERIAL PRIMARY KEY,
            OBJETO_NUMERICO VARCHAR(100),
            TIPOPRED VARCHAR(50),
            AVALPRED_VIGANT BIGINT,
            USU_VIGANT VARCHAR(10),
            ACTIVIDAD_VIGANT VARCHAR(10),
            ESTRATO_VIGANT VARCHAR(10),
            AREA_VIGANT VARCHAR(10),
            TERRENO_VIGANT VARCHAR(10),
            PREDIAL_VIGANT BIGINT,
            COMUNA VARCHAR(10),
            BARRIO VARCHAR(10),
            MANZANA VARCHAR(10),
            TIPO_PREDIO VARCHAR(50),
            ACTUALIZACION VARCHAR(50),
            AVALPRED_VIGACT BIGINT,
            USU_VIGACT VARCHAR(10),
            ACTIVIDAD_VIGACT VARCHAR(10),
            ESTRATO_VIGACT VARCHAR(10),
            AREA_VIGACT VARCHAR(10),
            TERRENO_VIGACT VARCHAR(10),
            CARTERA_VIGACT CHAR
        );
    """))

# Reached only after the transaction above has been committed.
print("Tabla 'base_proye' creada exitosamente en PostgreSQL.")

Tabla 'base_proye' creada exitosamente en PostgreSQL.


## Staging

En la etapa de Staging se carga el dataset y se realiza una pretransformación de los datos antes de cargarlos a la base de datos.

In [None]:
# OBJETO_NUMERICO is a 20-digit identifier and some values start with "0",
# so it is read explicitly as text instead of relying on type inference,
# which could otherwise drop the leading zeros or lose precision.
df = pd.read_csv(
    './data/predios_dataset.csv',
    delimiter=";",
    dtype={"OBJETO_NUMERICO": str},
)

In [24]:
# Preview the first 10 rows of the dataset as loaded from the CSV.
df.head(10)

Unnamed: 0,OBJETO_NUMERICO,TIPOPRED,AVALPRED_VIGANT,USU_VIGANT,ACTIVIDAD_VIGANT,ESTRATO_VIGANT,AREA_VIGANT,TERRENO_VIGANT,PREDIAL_VIGANT,COMUNA,...,MANZANA,TIPO_PREDIO,ACTUALIZACION,AVALPRED_VIGACT,USU_VIGACT,ACTIVIDAD_VIGACT,ESTRATO_VIGACT,AREA_VIGACT,TERRENO_VIGACT,CARTERA_VIGACT
0,76001010009080038002,P.V.R.,91742000,6,15,,100,100,100000,B51,...,4,MEJORA,RURAL,96329000,6,15,,100,100,Y
1,51000001010200020102,P.V.R.,22873000,6,15,,100,100,16000,B51,...,4,MEJORA,RURAL,24017000,6,15,,100,100,N
2,51000001010400010104,P.V.R.,37948000,6,15,,100,100,29000,B51,...,4,MEJORA,RURAL,39845000,6,15,,100,100,N
3,51000001010500000105,V.R.,859144000,7,16,,100,100,318000,B51,...,4,NPH,RURAL,902101000,7,16,,100,100,N
4,51000001010700000107,P.V.R.,44176000,6,15,,100,100,30000,B51,...,4,NPH,RURAL,46385000,6,15,,100,100,Y
5,13950029002100020021,CONST.,17761000,1,1,1.0,100,100,71000,B13,...,29,MEJORA,URBANO,18827000,1,1,1.0,100,100,Y
6,03020006001309020002,CONST.,297193000,1,1,5.0,100,100,3864000,B03,...,6,PH,URBANO,389323000,1,1,5.0,100,100,N
7,05970008000000000025,CONST.,72635000,1,1,3.0,100,100,726000,B05,...,8,PH,URBANO,76993000,1,1,3.0,100,100,N
8,63000003008800010088,V.R.,62301000,7,16,,100,100,354000,B63,...,1,MEJORA,RURAL,65416000,7,16,,100,100,Y
9,63000003008800020088,P.V.R.,29425000,6,15,,100,100,5000,B63,...,1,MEJORA,RURAL,30896000,6,15,,100,100,Y


*Por el momento solo se convierten los nombres de las columnas de mayúsculas a minúsculas, para así continuar con la carga de los datos a la base de datos en la etapa de Staging.*

In [49]:
# Normalise every column label to lowercase before loading into staging.
df.columns = df.columns.map(str.lower)

# Show the first 10 rows with the renamed columns.
df.head(10)

Unnamed: 0,objeto_numerico,tipopred,avalpred_vigant,usu_vigant,actividad_vigant,estrato_vigant,area_vigant,terreno_vigant,predial_vigant,comuna,...,manzana,tipo_predio,actualizacion,avalpred_vigact,usu_vigact,actividad_vigact,estrato_vigact,area_vigact,terreno_vigact,cartera_vigact
0,76001010009080038002,P.V.R.,91742000,6,15,,100,100,100000,B51,...,4,MEJORA,RURAL,96329000,6,15,,100,100,Y
1,51000001010200020102,P.V.R.,22873000,6,15,,100,100,16000,B51,...,4,MEJORA,RURAL,24017000,6,15,,100,100,N
2,51000001010400010104,P.V.R.,37948000,6,15,,100,100,29000,B51,...,4,MEJORA,RURAL,39845000,6,15,,100,100,N
3,51000001010500000105,V.R.,859144000,7,16,,100,100,318000,B51,...,4,NPH,RURAL,902101000,7,16,,100,100,N
4,51000001010700000107,P.V.R.,44176000,6,15,,100,100,30000,B51,...,4,NPH,RURAL,46385000,6,15,,100,100,Y
5,13950029002100020021,CONST.,17761000,1,1,1.0,100,100,71000,B13,...,29,MEJORA,URBANO,18827000,1,1,1.0,100,100,Y
6,03020006001309020002,CONST.,297193000,1,1,5.0,100,100,3864000,B03,...,6,PH,URBANO,389323000,1,1,5.0,100,100,N
7,05970008000000000025,CONST.,72635000,1,1,3.0,100,100,726000,B05,...,8,PH,URBANO,76993000,1,1,3.0,100,100,N
8,63000003008800010088,V.R.,62301000,7,16,,100,100,354000,B63,...,1,MEJORA,RURAL,65416000,7,16,,100,100,Y
9,63000003008800020088,P.V.R.,29425000,6,15,,100,100,5000,B63,...,1,MEJORA,RURAL,30896000,6,15,,100,100,Y


In [51]:
# Cast the identifier to text and the monetary columns to 64-bit integers
# in a single astype call driven by a column -> dtype mapping.
df = df.astype({
    'objeto_numerico': str,
    'avalpred_vigant': 'int64',
    'avalpred_vigact': 'int64',
    'predial_vigant': 'int64',
})

# Show the first 10 rows after the casts.
df.head(10)

Unnamed: 0,objeto_numerico,tipopred,avalpred_vigant,usu_vigant,actividad_vigant,estrato_vigant,area_vigant,terreno_vigant,predial_vigant,comuna,...,manzana,tipo_predio,actualizacion,avalpred_vigact,usu_vigact,actividad_vigact,estrato_vigact,area_vigact,terreno_vigact,cartera_vigact
0,76001010009080038002,P.V.R.,91742000,6,15,,100,100,100000,B51,...,4,MEJORA,RURAL,96329000,6,15,,100,100,Y
1,51000001010200020102,P.V.R.,22873000,6,15,,100,100,16000,B51,...,4,MEJORA,RURAL,24017000,6,15,,100,100,N
2,51000001010400010104,P.V.R.,37948000,6,15,,100,100,29000,B51,...,4,MEJORA,RURAL,39845000,6,15,,100,100,N
3,51000001010500000105,V.R.,859144000,7,16,,100,100,318000,B51,...,4,NPH,RURAL,902101000,7,16,,100,100,N
4,51000001010700000107,P.V.R.,44176000,6,15,,100,100,30000,B51,...,4,NPH,RURAL,46385000,6,15,,100,100,Y
5,13950029002100020021,CONST.,17761000,1,1,1.0,100,100,71000,B13,...,29,MEJORA,URBANO,18827000,1,1,1.0,100,100,Y
6,03020006001309020002,CONST.,297193000,1,1,5.0,100,100,3864000,B03,...,6,PH,URBANO,389323000,1,1,5.0,100,100,N
7,05970008000000000025,CONST.,72635000,1,1,3.0,100,100,726000,B05,...,8,PH,URBANO,76993000,1,1,3.0,100,100,N
8,63000003008800010088,V.R.,62301000,7,16,,100,100,354000,B63,...,1,MEJORA,RURAL,65416000,7,16,,100,100,Y
9,63000003008800020088,P.V.R.,29425000,6,15,,100,100,5000,B63,...,1,MEJORA,RURAL,30896000,6,15,,100,100,Y


In [58]:
# to_sql receives the engine itself and manages its own connection, so the
# previously opened (and never used) `engine.connect()` context is dropped.
# NOTE: if_exists="append" adds the rows on every execution; re-running this
# cell against an already-loaded table duplicates the data.
df.to_sql("base_proye", con=engine, if_exists="append", index=False)

print("Data guardada en la tabla 'base_proye' exitosamente.")

Data guardada en la tabla 'base_proye' exitosamente.


In [59]:
# Read the staging table back into a DataFrame to inspect what was loaded.
select_all_query = "SELECT * FROM base_proye"
with engine.connect() as connection:
    proye_db_df = pd.read_sql(sql=select_all_query, con=connection)

# Display the frame as the cell output.
proye_db_df

Unnamed: 0,id,objeto_numerico,tipopred,avalpred_vigant,usu_vigant,actividad_vigant,estrato_vigant,area_vigant,terreno_vigant,predial_vigant,...,manzana,tipo_predio,actualizacion,avalpred_vigact,usu_vigact,actividad_vigact,estrato_vigact,area_vigact,terreno_vigact,cartera_vigact
0,1,76001010009080038002,P.V.R.,91742000,6,15,,100,100,100000,...,4,MEJORA,RURAL,96329000,6,15,,100,100,Y
1,2,51000001010200020102,P.V.R.,22873000,6,15,,100,100,16000,...,4,MEJORA,RURAL,24017000,6,15,,100,100,N
2,3,51000001010400010104,P.V.R.,37948000,6,15,,100,100,29000,...,4,MEJORA,RURAL,39845000,6,15,,100,100,N
3,4,51000001010500000105,V.R.,859144000,7,16,,100,100,318000,...,4,NPH,RURAL,902101000,7,16,,100,100,N
4,5,51000001010700000107,P.V.R.,44176000,6,15,,100,100,30000,...,4,NPH,RURAL,46385000,6,15,,100,100,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996,02010003007809080055,CONST.,10113000,1,1,6.0,100,100,122000,...,3,PH,URBANO,13248000,1,1,6.0,100,100,N
99996,99997,02010003007909080055,CONST.,10113000,1,1,6.0,100,100,121000,...,3,PH,URBANO,13248000,1,1,6.0,100,100,N
99997,99998,02010003008009080055,CONST.,10113000,1,1,6.0,100,100,111000,...,3,PH,URBANO,13248000,1,1,6.0,100,100,N
99998,99999,02010004007609020004,CONST.,10113000,1,1,6.0,100,100,65000,...,4,PH,URBANO,13248000,1,1,6.0,100,100,N
