In [1]:
import pandas as pd

In [2]:
def get_head(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the header block of a sheet into a single-row, typed DataFrame.

    The first 15 rows of the sheet are read as label/value pairs: the first
    column supplies the labels and the second column the values. The labels
    become the columns of the result and the values its only row.

    Args:
        df: Raw sheet contents (this notebook reads sheets with
            ``header=None``).

    Returns:
        A one-row DataFrame with one column per label, cast to category,
        string, nullable-integer (``Int64``) and float dtypes.
    """
    labels = df.iloc[0:15, 0].to_list()
    values = df.iloc[0:15, 1].to_list()

    # The labels are passed as a nested list on purpose: the other frames
    # built in this notebook are constructed the same way, so their column
    # indexes stay compatible when they are concatenated side by side.
    header = pd.DataFrame([values], columns=[labels])

    # Target dtype -> columns that receive it, applied in this order.
    dtype_groups = [
        ("category", ["ALCANCE", "FASE", "ZONA", "TIPO DE TERRENO"]),
        ("string", ["NOMBRE DEL PROYECTO", "CÓDIGO DEL PROYECTO", "DEPARTAMENTO"]),
        (
            "Int64",
            [
                "AÑO INICIO",
                "AÑO FIN",
                "CANTIDAD UNIDADES FUNCIONALES",
                "CANTIDAD PUENTES VEHICULARES",
                "CANTIDAD PUENTES PEATONALES",
                "CANTIDAD TÚNELES",
            ],
        ),
        ("float", ["LONGITUD DE VÍA (KM)", "TOTAL CAUSADO"]),
    ]
    for target_dtype, columns in dtype_groups:
        header[columns] = header[columns].astype(target_dtype)

    return header

def get_uf(df: pd.DataFrame) -> "tuple[pd.DataFrame, list, pd.Series]":
    """Extract the functional-unit (UF) block of a sheet.

    Args:
        df: Raw sheet contents (this notebook reads sheets with
            ``header=None``).

    Returns:
        A 3-tuple ``(df_uf, column_names, df_uf_totals)``:

        * ``df_uf`` -- rows 1-7 of the sheet, from column 5 up to but not
          including the last column, with the second of those columns
          removed. The caller reads its first row as the per-unit name and
          its first column as the attribute labels.
        * ``column_names`` -- the first column of ``df_uf`` below its first
          row, wrapped in an outer list so it can be passed directly as
          ``columns=`` to ``pd.DataFrame``.
        * ``df_uf_totals`` -- rows 2-8 of the column at position
          ``6 + df_uf.shape[1]``, re-indexed from 0.
    """
    # Rows 1-7; columns 5 through the second-to-last one.
    df_uf = df.iloc[1:8, 5:-1]
    # Remove the second column of the block (dropped by its label).
    df_uf = df_uf.drop(columns=df_uf.columns[[1]])

    # Labels sit in the block's first column, below its first row.
    column_names = [df_uf.iloc[1:, 0].to_list()]

    # With unique column labels, 6 + df_uf.shape[1] is the position of the
    # sheet's last column, i.e. the one left out of the block above.
    # NOTE(review): this slice covers rows 2-8 (7 rows) while the labels
    # cover rows 2-7 (6 rows) -- confirm the extra row is intended.
    totals_position = 6 + df_uf.shape[1]
    df_uf_totals = df.iloc[2:9, totals_position].reset_index(drop=True)

    return df_uf, column_names, df_uf_totals

def get_items(df: pd.DataFrame) -> pd.DataFrame:
    """Collect the item values of a sheet into a single-row DataFrame.

    The values are read from column 5, starting at row 18 and running to
    the end of the sheet. They are paired positionally with a fixed list of
    item labels, so the sheet must supply exactly one value per label.

    Args:
        df: Raw sheet contents (this notebook reads sheets with
            ``header=None``).

    Returns:
        A one-row DataFrame with one column per item label.
    """
    item_labels = [
        "1 - TRANSPORTE",
        "2 - TRAZADO Y DISEÑO GEOMÉTRICO",
        "2.1 - INFORMACIÓN GEOGRÁFICA",
        "2.2 TRAZADO Y DISEÑO GEOMÉTRICO",
        "2.3 - SEGURIDAD VIAL",
        "2.4 - SISTEMAS INTELIGENTES",
        "3 - GEOLOGÍA",
        "3.1 - GEOLOGÍA",
        "3.2 - HIDROGEOLOGÍA",
        "4 - SUELOS",
        "5 - TALUDES",
        "6 - PAVIMENTO",
        "7 - SOCAVACIÓN",
        "8 - ESTRUCTURAS",
        "9 - TÚNELES",
        "10 - URBANISMO Y PAISAJISMO",
        "11 - PREDIAL",
        "12 - IMPACTO AMBIENTAL",
        "13 - CANTIDADES",
        "14 - EVALUACIÓN SOCIOECONÓMICA",
        "OTROS - MANEJO DE REDES",
    ]

    # Row 17 is skipped (presumably the header of the item table -- TODO
    # confirm); the values start on the row after it.
    item_values = df.iloc[18:, 5].to_list()

    # Labels are passed as a nested list, matching how the other frames in
    # this notebook are built.
    return pd.DataFrame([item_values], columns=[item_labels])


In [3]:
def assemble_sheet(df: pd.DataFrame) -> pd.DataFrame:
    """Flatten one project sheet into one row per functional unit.

    Each output row is the side-by-side concatenation of:

    * the project header (``get_head``),
    * the attributes of one functional unit plus its name in the
      ``'NOMBRE UF'`` column (``get_uf``),
    * the item values of the sheet (``get_items``).

    The item values are currently copied unchanged onto every functional
    unit's row; splitting them across units is left as a future change.

    Args:
        df: Raw sheet contents (this notebook reads sheets with
            ``header=None``).

    Returns:
        A DataFrame with one row per functional-unit column of the sheet,
        re-indexed from 0.

    Raises:
        ValueError: If the sheet's functional-unit block has no unit
            columns, so no row can be built.
    """
    df_head = get_head(df)
    # The totals are returned by get_uf but not used yet (see the note on
    # the item values below).
    df_uf, column_names, _df_uf_totals = get_uf(df)
    df_items = get_items(df)

    # Column 0 of df_uf holds the attribute labels; units start at column 1.
    n_columns = df_uf.shape[1]
    if n_columns < 2:
        # pd.concat would raise a ValueError on the empty row list anyway;
        # fail early with a message that names the actual problem.
        raise ValueError("Sheet has no functional-unit columns to assemble")

    # Identical for every functional unit, so computed once outside the loop.
    df_items_for_functional_unit = df_items / 1  # Future consideration: divide by df_uf_totals

    rows = []
    # Create a row for each functional unit
    for i in range(1, n_columns):
        # Attribute values of the current functional unit (everything below
        # the first row, which holds the unit's name).
        df_uf_x = pd.DataFrame([df_uf.iloc[1:, i].to_list()], columns=column_names)
        df_uf_x['NOMBRE UF'] = df_uf.iloc[0, i]

        rows.append(pd.concat([df_head, df_uf_x, df_items_for_functional_unit], axis=1))

    return pd.concat(rows, axis=0, ignore_index=True)


In [4]:
def assemble_project(filename: str) -> pd.DataFrame:
    """Assemble every project sheet of a workbook into one DataFrame.

    Only sheets whose name is numeric (``str.isnumeric``) are treated as
    project sheets; each one is read without a header row and passed to
    ``assemble_sheet``.

    Args:
        filename: Path to the Excel workbook.

    Returns:
        The rows of all project sheets stacked vertically, re-indexed
        from 0.

    Raises:
        ValueError: If the workbook contains no numerically named sheet.
    """
    with pd.ExcelFile(filename, engine="openpyxl") as xls:
        project_names = [name for name in xls.sheet_names if name.isnumeric()]
        if not project_names:
            # pd.concat would raise a ValueError on the empty list anyway;
            # say which workbook is the problem instead.
            raise ValueError(f"No numerically named sheets found in {filename!r}")

        # Read through the already-open workbook handle instead of
        # re-opening the file by path once per sheet. The engine is fixed by
        # the ExcelFile, so it is not passed again.
        df_project = [
            assemble_sheet(pd.read_excel(xls, sheet_name=name, header=None))
            for name in project_names
        ]

    return pd.concat(df_project, axis=0, ignore_index=True)



In [5]:
# Workbook location, given relative to the notebook's working directory.
filename = "../data/raw/BASE DE DATOS PRESUPUESTOS.xlsx"
# Build one combined table from every numerically named sheet of the workbook.
df = assemble_project(filename)

In [6]:
# Display the assembled table as the cell's output.
df

Unnamed: 0,NOMBRE DEL PROYECTO,CÓDIGO DEL PROYECTO,ALCANCE,FASE,ZONA,DEPARTAMENTO,AÑO INICIO,AÑO FIN,LONGITUD DE VÍA (KM),TIPO DE TERRENO,...,6 - PAVIMENTO,7 - SOCAVACIÓN,8 - ESTRUCTURAS,9 - TÚNELES,10 - URBANISMO Y PAISAJISMO,11 - PREDIAL,12 - IMPACTO AMBIENTAL,13 - CANTIDADES,14 - EVALUACIÓN SOCIOECONÓMICA,OTROS - MANEJO DE REDES
0,DISEÑO GEOMÉTRICO TERCER CARRIL BOGOTÁ - GIRARDOT,321501,Mejoramiento,Fase III - Diseños a detalle,Rural,Cundinamarca,2017.0,2017.0,1.0,Montañoso,...,0.0,308441600.0,7280632.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DISEÑO GEOMÉTRICO TERCER CARRIL BOGOTÁ - GIRARDOT,321501,Mejoramiento,Fase III - Diseños a detalle,Rural,Cundinamarca,2017.0,2017.0,1.0,Montañoso,...,0.0,308441600.0,7280632.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DISEÑO GEOMÉTRICO TERCER CARRIL BOGOTÁ - GIRARDOT,321501,Mejoramiento,Fase III - Diseños a detalle,Rural,Cundinamarca,2017.0,2017.0,1.0,Montañoso,...,0.0,308441600.0,7280632.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DISEÑO GEOMÉTRICO TERCER CARRIL BOGOTÁ - GIRARDOT,321501,Mejoramiento,Fase III - Diseños a detalle,Rural,Cundinamarca,2017.0,2017.0,1.0,Montañoso,...,0.0,308441600.0,7280632.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DISEÑO GEOMÉTRICO TERCER CARRIL BOGOTÁ - GIRARDOT,321501,Mejoramiento,Fase III - Diseños a detalle,Rural,Cundinamarca,2017.0,2017.0,1.0,Montañoso,...,0.0,308441600.0,7280632.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,DISEÑO GEOMÉTRICO TERCER CARRIL BOGOTÁ - GIRARDOT,321501,Mejoramiento,Fase III - Diseños a detalle,Rural,Cundinamarca,2017.0,2017.0,1.0,Montañoso,...,0.0,308441600.0,7280632.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,DISEÑO GEOMÉTRICO TERCER CARRIL BOGOTÁ - GIRARDOT,321501,Mejoramiento,Fase III - Diseños a detalle,Rural,Cundinamarca,2017.0,2017.0,1.0,Montañoso,...,0.0,308441600.0,7280632.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,DISEÑO GEOMÉTRICO TERCER CARRIL BOGOTÁ - GIRARDOT,321501,Mejoramiento,Fase III - Diseños a detalle,Rural,Cundinamarca,2017.0,2017.0,1.0,Montañoso,...,0.0,308441600.0,7280632.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,BUGA - BUENAVENTURA,654801,,Fase III - Diseños a detalle,Rural,Valle del Cauca,,,1.0,,...,301868900.0,424590300.0,317056300.0,0.0,0.0,0.0,44662460.0,0.0,0.0,204030100.0
9,QUEREMAL - DANUBIO,581301,Mejoramiento,Fase III - Diseños a detalle,Rural,Valle del Cauca,2022.0,2023.0,1.0,Montañoso,...,17902720.0,26669370.0,200028500.0,0.0,0.0,14281512.0,89739030.0,0.0,13020779.0,0.0
