# fact_messaging_accumulating

### Importación de librerías

In [25]:
import yaml
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timedelta, time

### Conexión a base y bodega de datos

In [26]:
with open('config.yaml') as f:
    config = yaml.safe_load(f)
    configFuente = config['fuente']
    configBodega = config['bodega']

urlFuente = f"{configFuente['driver']}://{configFuente['user']}:{configFuente['password']}@{configFuente['host']}:{configFuente['port']}/{configFuente['db']}"
urlBodega = f"{configBodega['driver']}://{configBodega['user']}:{configBodega['password']}@{configBodega['host']}:{configBodega['port']}/{configBodega['db']}"

src = create_engine(urlFuente)
etl = create_engine(urlBodega)

### Extracción y transformación de datos

In [27]:
mensajeria_estadosservicio = pd.read_sql_table('mensajeria_estadosservicio', src)

mensajeria_estadosservicio.drop(columns=["foto", "observaciones", "es_prueba", "foto_binary"], inplace=True)



def llenar_fecha_y_hora(mensajeria_estadosservicio, biblioteca_de_servicios_y_registros, servicio_id, index, dateDW, timeDW):
    dateDB = mensajeria_estadosservicio.iloc[index, 1].strftime("%Y-%m-%d")
    timeDB = mensajeria_estadosservicio.iloc[index, 2].strftime('%H:%M')
    
    biblioteca_de_servicios_y_registros[servicio_id][dateDW] = dateDB
    biblioteca_de_servicios_y_registros[servicio_id][timeDW] = timeDB
        


def llenar_duracion(biblioteca_de_servicios_y_registros, servicio_id, date1, time1, date2, time2, duration):
    datea = biblioteca_de_servicios_y_registros[servicio_id][date1]
    timea = biblioteca_de_servicios_y_registros[servicio_id][time1]

    dateb = biblioteca_de_servicios_y_registros[servicio_id][date2]
    timeb = biblioteca_de_servicios_y_registros[servicio_id][time2]

    if datea and timea and dateb and timeb:
    
        datetimea = datetime.strptime(f"{datea} {timea}", "%Y-%m-%d %H:%M")
        datetimeb = datetime.strptime(f"{dateb} {timeb}", "%Y-%m-%d %H:%M")

        duration_in_minutes = (datetimeb - datetimea).total_seconds() / 60
    
        biblioteca_de_servicios_y_registros[servicio_id][duration] = duration_in_minutes



biblioteca_de_servicios_y_registros = {}

for index, row in mensajeria_estadosservicio.iterrows():
    servicio_id = mensajeria_estadosservicio.iloc[index, 4]
    estado_id = mensajeria_estadosservicio.iloc[index, 3]

    if servicio_id not in biblioteca_de_servicios_y_registros:
        biblioteca_de_servicios_y_registros[servicio_id] = {'key_service': servicio_id, 
                                                            'key_start_date': None, 
                                                            'key_start_time': None, 
                                                            'key_assignment_date': None, 
                                                            'key_assignment_time': None, 
                                                            'key_pick_up_date': None, 
                                                            'key_pick_up_time': None, 
                                                            'key_delivery_date': None, 
                                                            'key_delivery_time': None, 
                                                            'key_closing_date': None, 
                                                            'key_closing_time': None,
                                                            'start_to_assignment_duration': None,
                                                            'assignment_to_pick_up_duration': None,
                                                            'pick_up_to_delivery_duration': None,
                                                            'delivery_to_closing_duration': None,}
    
    if estado_id == 1:
        llenar_fecha_y_hora(mensajeria_estadosservicio, biblioteca_de_servicios_y_registros, servicio_id, index, 'key_start_date', 'key_start_time')

    elif estado_id == 2:
        llenar_fecha_y_hora(mensajeria_estadosservicio, biblioteca_de_servicios_y_registros, servicio_id, index, 'key_assignment_date', 'key_assignment_time')

    elif estado_id == 4:
        llenar_fecha_y_hora(mensajeria_estadosservicio, biblioteca_de_servicios_y_registros, servicio_id, index, 'key_pick_up_date', 'key_pick_up_time')

    elif estado_id == 5:
        llenar_fecha_y_hora(mensajeria_estadosservicio, biblioteca_de_servicios_y_registros, servicio_id, index, 'key_delivery_date', 'key_delivery_time')

    elif estado_id == 6:
        llenar_fecha_y_hora(mensajeria_estadosservicio, biblioteca_de_servicios_y_registros, servicio_id, index, 'key_closing_date', 'key_closing_time')




for servicio_id in biblioteca_de_servicios_y_registros:

    llenar_duracion(biblioteca_de_servicios_y_registros, servicio_id, 'key_start_date', 'key_start_time', 'key_assignment_date', 'key_assignment_time', 'start_to_assignment_duration')
    llenar_duracion(biblioteca_de_servicios_y_registros, servicio_id, 'key_assignment_date', 'key_assignment_time', 'key_pick_up_date', 'key_pick_up_time', 'assignment_to_pick_up_duration')
    llenar_duracion(biblioteca_de_servicios_y_registros, servicio_id, 'key_pick_up_date', 'key_pick_up_time', 'key_delivery_date', 'key_delivery_time', 'pick_up_to_delivery_duration')
    llenar_duracion(biblioteca_de_servicios_y_registros, servicio_id, 'key_delivery_date', 'key_delivery_time', 'key_closing_date', 'key_closing_time', 'delivery_to_closing_duration')



fact_messaging = pd.DataFrame.from_dict(biblioteca_de_servicios_y_registros, orient='index')



fact_messaging['key_start_date'] = pd.to_datetime(fact_messaging['key_start_date'], errors='coerce')
fact_messaging['key_assignment_date'] = pd.to_datetime(fact_messaging['key_assignment_date'], errors='coerce')
fact_messaging['key_pick_up_date'] = pd.to_datetime(fact_messaging['key_pick_up_date'], errors='coerce')
fact_messaging['key_delivery_date'] = pd.to_datetime(fact_messaging['key_delivery_date'], errors='coerce')
fact_messaging['key_closing_date'] = pd.to_datetime(fact_messaging['key_closing_date'], errors='coerce')

fact_messaging['key_start_time'] = pd.to_datetime(fact_messaging['key_start_time'], format='%H:%M', errors='coerce').dt.time
fact_messaging['key_assignment_time'] = pd.to_datetime(fact_messaging['key_assignment_time'], format='%H:%M', errors='coerce').dt.time
fact_messaging['key_pick_up_time'] = pd.to_datetime(fact_messaging['key_pick_up_time'], format='%H:%M', errors='coerce').dt.time
fact_messaging['key_delivery_time'] = pd.to_datetime(fact_messaging['key_delivery_time'], format='%H:%M', errors='coerce').dt.time
fact_messaging['key_closing_time'] = pd.to_datetime(fact_messaging['key_closing_time'], format='%H:%M', errors='coerce').dt.time



In [28]:
fact_messaging['start_to_assignment_duration'].fillna(round(fact_messaging['start_to_assignment_duration'].mean(), 0), inplace=True)
fact_messaging['assignment_to_pick_up_duration'].fillna(round(fact_messaging['assignment_to_pick_up_duration'].mean(), 0), inplace=True)
fact_messaging['pick_up_to_delivery_duration'].fillna(round(fact_messaging['pick_up_to_delivery_duration'].mean(), 0), inplace=True)
fact_messaging['delivery_to_closing_duration'].fillna(round(fact_messaging['delivery_to_closing_duration'].mean(), 0), inplace=True)



def llenar_fecha_y_hora2(fact_messaging, index, duration_base, date_to_modify, date_base, time_to_modify, time_base):
    duration = fact_messaging.loc[index, duration_base]

    days_d = duration // 1440
    minutes_d = duration % 1440

    if days_d == 0:
        fact_messaging.loc[index, date_to_modify] = fact_messaging.loc[index, date_base]

        time_at_base = fact_messaging.loc[index, time_base]
        
        time_at_modify = timedelta(hours=time_at_base.hour, minutes=time_at_base.minute) + timedelta(minutes=minutes_d)

        if isinstance(time_at_modify, timedelta):
            total_seconds = int(time_at_modify.total_seconds())
            hours = (total_seconds // 3600) % 24 
            minutes = (total_seconds % 3600) // 60
            seconds = total_seconds % 60
            
            time_at_modify = time(hour=hours, minute=minutes, second=seconds)

        fact_messaging.loc[index, time_to_modify] = time_at_modify



for index, row in fact_messaging.iterrows():
    for column, value in row.items():
        if pd.isna(value):
            if column == "key_start_date":
                duration = fact_messaging.loc[index, 'start_to_assignment_duration']

                days_d = duration // 1440
                minutes_d = duration % 1440

                if days_d == 0:
                    fact_messaging.loc[index, 'key_start_date'] = fact_messaging.loc[index, 'key_assignment_date']

                    time_at_assignment = fact_messaging.loc[index, 'key_assignment_time']
                    
                    time_at_start = (timedelta(hours=time_at_assignment.hour, minutes=time_at_assignment.minute) - timedelta(minutes=minutes_d))

                    if isinstance(time_at_start, timedelta):
                        total_seconds = int(time_at_start.total_seconds())
                        hours = (total_seconds // 3600) % 24
                        minutes = (total_seconds % 3600) // 60
                        seconds = total_seconds % 60
                        
                        time_at_start = time(hour=hours, minute=minutes, second=seconds)

                    fact_messaging.loc[index, 'key_start_time'] = time_at_start

                
            if column == "key_assignment_date":
                llenar_fecha_y_hora2(fact_messaging, index, 'start_to_assignment_duration', 'key_assignment_date', 'key_start_date', 'key_assignment_time', 'key_start_time')
            
            if column == "key_assignment_time":
                duration = fact_messaging.loc[index, 'start_to_assignment_duration']
                minutes_d = duration % 1440

                time_at_start = fact_messaging.loc[index, 'key_start_time']
                
                time_at_assignment = (timedelta(hours=time_at_start.hour, minutes=time_at_start.minute) + timedelta(minutes=minutes_d))


                if isinstance(time_at_assignment, timedelta):
                    total_seconds = int(time_at_assignment.total_seconds())
                    hours = (total_seconds // 3600) % 24
                    minutes = (total_seconds % 3600) // 60
                    seconds = total_seconds % 60
                    
                    time_at_assignment = time(hour=hours, minute=minutes, second=seconds)

                fact_messaging.loc[index, 'key_start_time'] = time_at_start

                fact_messaging.loc[index, 'key_assignment_time'] = time_at_assignment
                
            if column == "key_pick_up_date":
                llenar_fecha_y_hora2(fact_messaging, index, 'assignment_to_pick_up_duration', 'key_pick_up_date', 'key_assignment_date', 'key_pick_up_time', 'key_assignment_time') 

            if column == "key_delivery_date":
                llenar_fecha_y_hora2(fact_messaging, index, 'pick_up_to_delivery_duration', 'key_delivery_date', 'key_pick_up_date', 'key_delivery_time', 'key_pick_up_time')
            
            if column == "key_closing_date":
                llenar_fecha_y_hora2(fact_messaging, index, 'delivery_to_closing_duration', 'key_closing_date', 'key_delivery_date', 'key_closing_time', 'key_delivery_time')
            
            

dim_fecha = pd.read_sql_table('dim_fecha', etl)
dim_hora = pd.read_sql_table('dim_hora', etl)


for date_col in ['key_start_date', 'key_assignment_date', 'key_pick_up_date', 'key_delivery_date', 'key_closing_date']:
    fact_messaging = fact_messaging.merge(
        dim_fecha[['fecha', 'key_dim_fecha']], 
        left_on=date_col, 
        right_on='fecha', 
        how='left'
    ).drop(columns=[date_col]).rename(columns={'key_dim_fecha': date_col}).drop(columns=['fecha'])
    fact_messaging[date_col] = pd.to_numeric(fact_messaging[date_col], errors='coerce').astype('Int64')


for time_col in ['key_start_time', 'key_assignment_time', 'key_pick_up_time', 'key_delivery_time', 'key_closing_time']:
    fact_messaging = fact_messaging.merge(
        dim_hora[['hora', 'key_dim_hora']], 
        left_on=time_col, 
        right_on='hora', 
        how='left'
    ).drop(columns=[time_col]).rename(columns={'key_dim_hora': time_col}).drop(columns=['hora'])
    fact_messaging[time_col] = pd.to_numeric(fact_messaging[time_col], errors='coerce').astype('Int64')


fact_messaging["key_fact_messaging_accumulating"] = range(1, len(fact_messaging) + 1)

fact_messaging

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fact_messaging['start_to_assignment_duration'].fillna(round(fact_messaging['start_to_assignment_duration'].mean(), 0), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fact_messaging['assignment_to_pick_up_duration'].fillna(round(fact_messaging['assignment_to_pick_up

Unnamed: 0,key_service,start_to_assignment_duration,assignment_to_pick_up_duration,pick_up_to_delivery_duration,delivery_to_closing_duration,key_start_date,key_assignment_date,key_pick_up_date,key_delivery_date,key_closing_date,key_start_time,key_assignment_time,key_pick_up_time,key_delivery_time,key_closing_time,key_fact_messaging_accumulating
0,226,30.0,46.0,26.0,292.0,393,394,394,394,394,1438,28,74,100,392,1
1,79,4477.0,202.0,38825.0,1.0,365,368,368,395,395,821,978,1180,1125,1126,2
2,613,7.0,18.0,34.0,12.0,402,402,402,402,402,636,643,661,695,707,3
3,376,1078.0,0.0,0.0,292.0,396,397,397,397,397,1253,891,891,891,1183,4
4,7164,69.0,6.0,81.0,292.0,462,462,462,462,462,962,1031,1037,1118,1410,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28425,28464,37.0,13.0,19.0,292.0,609,609,609,609,609,769,806,819,838,1130,28426
28426,28465,10.0,52.0,49.0,292.0,609,609,609,609,609,812,822,874,923,1215,28427
28427,28466,60.0,1.0,104.0,292.0,609,609,609,609,609,844,904,905,1009,1301,28428
28428,28467,49.0,90.0,104.0,292.0,609,609,609,609,609,853,902,992,1096,1388,28429


### Carga de datos

In [29]:
fact_messaging.to_sql("hecho_entrega_mensajeria_acumulada", etl, index=False, if_exists="replace")

430