In [5]:
import os
import xml.etree.ElementTree as ET
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import yaml
from sqlalchemy import create_engine, inspect

def _build_mssql_url(db_config):
    """Build a SQLAlchemy ``mssql+pyodbc`` URL from one section of config.yml.

    The user name and password are percent-encoded so that characters such as
    ``@``, ``:`` or ``/`` inside the credentials cannot be mistaken for URL
    delimiters.

    Args:
        db_config: Mapping with the keys ``user``, ``password``, ``host``,
            ``port``, ``dbname`` and ``drivername``.

    Returns:
        str: The connection URL.
    """
    user = quote_plus(str(db_config['user']))
    password = quote_plus(str(db_config['password']))
    # Spaces in the driver name are written as '+' inside the query string.
    driver = db_config['drivername'].replace(' ', '+')
    return (
        f"mssql+pyodbc://{user}:{password}"
        f"@{db_config['host']},{db_config['port']}/{db_config['dbname']}"
        f"?driver={driver}"
    )


# config.yml is looked up in the current working directory.
config_path = os.path.join(os.getcwd(), "config.yml")

with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

config_oltp = config['OLTP']
config_olap = config['OLAP']

url_oltp = _build_mssql_url(config_oltp)
url_olap = _build_mssql_url(config_olap)

# One engine per database: the source (OLTP) and the target (OLAP).
oltp = create_engine(url_oltp)
olap = create_engine(url_olap)

def cargaSegura(engine, schema, table):
    """Load ``schema.table`` into a DataFrame, skipping unreadable columns.

    The whole table is read first. If that fails, every column is probed with
    a ``SELECT TOP 10`` query and only the columns that can be read are
    loaded.

    Args:
        engine: SQLAlchemy engine (or connection) used for every query.
        schema: Schema that owns the table.
        table: Table name.

    Returns:
        pandas.DataFrame: The full table, or only its readable columns, or an
        empty DataFrame when no column could be read.
    """
    # Fast path: try to load the complete table.
    try:
        return pd.read_sql_table(table_name=table, con=engine, schema=schema)
    except Exception as exc:
        # Deliberate best effort: fall back to column probing, but report it
        # instead of hiding the failure.
        print(f"⚠ No se pudo cargar {schema}.{table} completa ({type(exc).__name__}); probando columna por columna.")

    def _quote(identifier):
        # Double any embedded quote so the identifier stays a single token.
        return '"' + str(identifier).replace('"', '""') + '"'

    qualified_name = f'{_quote(schema)}.{_quote(table)}'

    # Column metadata is only needed on the fallback path.
    columnas = [col["name"] for col in inspect(engine).get_columns(table, schema=schema)]

    # Probe each column on its own to find the ones that cannot be read.
    columnas_ok = []
    columnas_problematicas = []
    for col in columnas:
        try:
            pd.read_sql_query(
                f'SELECT TOP 10 {_quote(col)} FROM {qualified_name}',
                con=engine
            )
        except Exception:
            columnas_problematicas.append(col)
        else:
            columnas_ok.append(col)

    # No readable column at all.
    if not columnas_ok:
        print(f"⚠ La tabla {schema}.{table} no tiene columnas convertibles. Retornando dataframe vacío.")
        return pd.DataFrame()

    if columnas_problematicas:
        print(f"⚠ {schema}.{table}: columnas omitidas {columnas_problematicas}")

    # Load only the readable columns. The column list is built outside the
    # f-string so the code also parses on Python versions before 3.12.
    lista_columnas = ", ".join(_quote(c) for c in columnas_ok)
    query = f'SELECT {lista_columnas} FROM {qualified_name}'

    return pd.read_sql_query(query, con=engine)


def extractHumanResources(conection):
    """Load the HumanResources tables used by the ETL.

    Args:
        conection: Engine/connection forwarded to ``cargaSegura``.

    Returns:
        dict: Table name -> DataFrame loaded for that table.
    """
    nombres = (
        "Shift", "Department", "Employee", "EmployeeDepartmentHistory", "EmployeePayHistory",
    )
    return {nombre: cargaSegura(conection, "HumanResources", nombre) for nombre in nombres}

def extractPerson(conection):
    """Load the Person tables used by the ETL.

    Args:
        conection: Engine/connection forwarded to ``cargaSegura``.

    Returns:
        dict: Table name -> DataFrame loaded for that table.
    """
    nombres = (
        "PersonPhone", "PhoneNumberType", "Address", "AddressType",
        "StateProvince", "BusinessEntity", "BusinessEntityAddress", "BusinessEntityContact",
        "ContactType", "CountryRegion", "EmailAddress", "Password", "Person",
    )
    return {nombre: cargaSegura(conection, "Person", nombre) for nombre in nombres}

def extractProduction(conection):
    """Load the Production tables used by the ETL.

    Args:
        conection: Engine/connection forwarded to ``cargaSegura``.

    Returns:
        dict: Table name -> DataFrame loaded for that table.
    """
    nombres = (
        "Product", "ScrapReason", "ProductCategory", "ProductCostHistory", "ProductDescription",
        "ProductDocument", "ProductInventory", "ProductListPriceHistory", "ProductModel",
        "ProductModelIllustration", "ProductModelProductDescriptionCulture", "BillOfMaterials",
        "ProductPhoto", "ProductProductPhoto", "TransactionHistory", "ProductReview",
        "TransactionHistoryArchive", "ProductSubcategory", "UnitMeasure", "WorkOrder",
        "Culture", "WorkOrderRouting", "Document", "Illustration", "Location",
    )
    return {nombre: cargaSegura(conection, "Production", nombre) for nombre in nombres}

def extractPurchasing(conection):
    """Load the Purchasing tables used by the ETL.

    Args:
        conection: Engine/connection forwarded to ``cargaSegura``.

    Returns:
        dict: Table name -> DataFrame loaded for that table.
    """
    nombres = (
        "ShipMethod", "ProductVendor", "Vendor", "PurchaseOrderDetail", "PurchaseOrderHeader",
    )
    return {nombre: cargaSegura(conection, "Purchasing", nombre) for nombre in nombres}

def extractSales(conection):
    """Load the Sales tables used by the ETL.

    Args:
        conection: Engine/connection forwarded to ``cargaSegura``.

    Returns:
        dict: Table name -> DataFrame loaded for that table.
    """
    nombres = (
        "CountryRegionCurrency", "CreditCard", "Currency", "CurrencyRate", "Customer",
        "PersonCreditCard", "SalesOrderDetail", "SalesOrderHeader",
        "SalesOrderHeaderSalesReason", "SalesPerson",
        "SalesPersonQuotaHistory", "SalesReason", "SalesTaxRate",
        "SalesTerritory", "SalesTerritoryHistory", "ShoppingCartItem",
        "SpecialOffer", "SpecialOfferProduct", "Store",
    )
    return {nombre: cargaSegura(conection, "Sales", nombre) for nombre in nombres}
  
# Extract every source schema from the OLTP engine; each result is a dict
# that maps a table name to its DataFrame.
humanResources =  extractHumanResources(oltp)
person = extractPerson(oltp)
production = extractProduction(oltp)
purchasing = extractPurchasing(oltp)  # Works
sales = extractSales(oltp)  # Works

  columnas = [col["name"] for col in inspector.get_columns(table, schema=schema)]
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  columnas = [col["name"] for col in inspector.get_columns(table, schema=schema)]
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  columnas = [col["name"] for col in inspector.get_columns(table, schema=schema)]
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  columnas = [col["name"] for col in inspector.get_columns(table, schema=schema)]
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.

In [6]:
# Summary statistics for every column of the DataFrame loaded for Sales.Currency.
sales["Currency"].describe(include='all')

Unnamed: 0,CurrencyCode,Name,ModifiedDate
count,105,105,105
unique,105,105,
top,AED,Emirati Dirham,
freq,1,1,
mean,,,2008-04-30 00:00:00
min,,,2008-04-30 00:00:00
25%,,,2008-04-30 00:00:00
50%,,,2008-04-30 00:00:00
75%,,,2008-04-30 00:00:00
max,,,2008-04-30 00:00:00


In [7]:
def generate_unique_ip(keys, base_ip="198.51", start_octet=100):
    """Derive one distinct dotted-quad address per integer key.

    Keys are laid out in consecutive blocks of 253 addresses: the fourth
    octet runs from 2 to 254, and the third octet starts at ``start_octet``
    and grows by one per block.

    Args:
        keys: Iterable of integer keys (the first block starts at key 1).
        base_ip: First two octets, e.g. ``"198.51"``.
        start_octet: Third octet used for the first block of keys.

    Returns:
        list[str]: One address per key, in input order.

    Raises:
        ValueError: If a key maps to a third octet outside 0-255, which
            would not be a valid IPv4 address.
    """
    ips = []
    for k in keys:
        # Each block of 253 keys moves on to the next third octet.
        block, offset = divmod(k - 1, 253)
        third_octet = start_octet + block
        if not 0 <= third_octet <= 255:
            raise ValueError(
                f"key {k} maps to third octet {third_octet}, outside 0-255"
            )
        ips.append(f"{base_ip}.{third_octet}.{offset + 2}")
    return ips

def extraerDemografia(df, xml_col):
    """Expand an XML column into one typed column per child element.

    Each cell of ``df[xml_col]`` is parsed and the direct children of the
    root element become columns named after the element's local tag name.
    Cells that are missing or contain malformed XML produce an all-empty row.

    Args:
        df: DataFrame that holds the XML column.
        xml_col: Name of the column with the XML text.

    Returns:
        pandas.DataFrame: One row per input row, carrying the same index as
        ``df`` so it can be joined back to it. Known numeric columns are
        converted with ``pd.to_numeric`` and known date columns with
        ``pd.to_datetime`` (unparseable values become NaN/NaT).
    """
    data = []

    for xml_str in df[xml_col]:
        row = {}
        # NULL cells arrive as None/NaN and cannot be fed to the XML parser.
        if isinstance(xml_str, (str, bytes)):
            try:
                root = ET.fromstring(xml_str)
            except ET.ParseError:
                # Malformed XML: keep an empty row so positions stay aligned.
                root = ()
            # Tags look like "{namespace}Name"; keep the local name, also
            # when the element carries no namespace.
            row = {child.tag.rsplit('}', 1)[-1]: child.text for child in root}
        data.append(row)

    # Reuse the caller's index so the result aligns row by row with ``df``.
    df_parsed = pd.DataFrame(data, index=df.index)

    # Known numeric columns
    numeric_cols = [
        'TotalPurchaseYTD', 'TotalChildren', 'NumberChildrenAtHome',
        'NumberCarsOwned', 'HomeOwnerFlag'
    ]

    for col in numeric_cols:
        if col in df_parsed.columns:
            df_parsed[col] = pd.to_numeric(df_parsed[col], errors='coerce')

    # Known date columns
    date_cols = ['BirthDate', 'DateFirstPurchase']

    for col in date_cols:
        if col in df_parsed.columns:
            # Strip the trailing 'Z' so the value matches the date format.
            df_parsed[col] = df_parsed[col].str.replace('Z','', regex=False)
            df_parsed[col] = pd.to_datetime(df_parsed[col], errors='coerce', format='%Y-%m-%d')

    return df_parsed
  
def transformDimCustomer(person, sales):
    """Build the customer dimension from the Person and Sales extracts.

    Individual customers (``PersonType == 'IN'``) are enriched with the
    fields parsed from their ``Demographics`` XML, Spanish/French
    translations of education and occupation, and their customer, address,
    phone and e-mail records.

    Args:
        person: dict of DataFrames with the keys ``Person``,
            ``BusinessEntityAddress``, ``Address``, ``PersonPhone`` and
            ``EmailAddress``.
        sales: dict of DataFrames with the key ``Customer``.

    Returns:
        pandas.DataFrame: One row per customer/address/phone/e-mail
        combination, with a surrogate ``CustomerKey`` starting at 11000 and
        the source account number exposed as ``CustomerAlternateKey``.
        Education/occupation values without a translation yield NaN in the
        translated columns.
    """
    # PersonType 'IN' = individual customer.
    # The index is reset so the parsed demographics line up with their source
    # rows: pd.concat(axis=1) aligns on index labels, not on row position.
    dimCustomer = (
        person["Person"][person["Person"]["PersonType"] == 'IN']
        .drop(columns=[
            'PersonType', 'EmailPromotion', 'AdditionalContactInfo', 'ModifiedDate', 'rowguid'
        ])
        .reset_index(drop=True)
    )

    demografia = extraerDemografia(dimCustomer, "Demographics").drop(columns=[
        'TotalPurchaseYTD'
    ])
    demografia = demografia.rename(columns={
        'Education': 'EnglishEducation',
        'Occupation': 'EnglishOccupation',
    })

    # Spanish and French translations
    education_map = {
        "Bachelors": {"Spanish": "Licenciatura", "French": "Bac + 4"},
        "Graduate Degree": {"Spanish": "Estudios de postgrado", "French": "Bac + 3"},
        "High School": {"Spanish": "Educación secundaria", "French": "Bac + 2"},
        "Partial College": {"Spanish": "Estudios universitarios (en curso)", "French": "Baccalauréat"},
        "Partial High School": {"Spanish": "Educación secundaria (en curso)", "French": "Niveau bac"}
    }
    occupation_map = {
        "Clerical": {"Spanish": "Administrativo", "French": "Employé"},
        "Management": {"Spanish": "Gestión", "French": "Direction"},
        "Manual": {"Spanish": "Obrero", "French": "Ouvrier"},
        "Professional": {"Spanish": "Profesional", "French": "Cadre"},
        "Skilled Manual": {"Spanish": "Obrero especializado", "French": "Technicien"}
    }
    demografia["EnglishEducation"] = demografia["EnglishEducation"].str.strip()
    demografia["EnglishOccupation"] = demografia["EnglishOccupation"].str.strip()

    # Mapping through plain dicts leaves NaN for missing/unknown values
    # instead of raising KeyError in the middle of the transformation.
    for language in ("Spanish", "French"):
        education_lookup = {english: names[language] for english, names in education_map.items()}
        occupation_lookup = {english: names[language] for english, names in occupation_map.items()}
        demografia[f"{language}Education"] = demografia["EnglishEducation"].map(education_lookup)
        demografia[f"{language}Occupation"] = demografia["EnglishOccupation"].map(occupation_lookup)

    # Keep the original column order of the translated fields.
    demografia = demografia[
        [c for c in demografia.columns
         if c not in ("SpanishEducation", "FrenchEducation", "SpanishOccupation", "FrenchOccupation")]
        + ["SpanishEducation", "FrenchEducation", "SpanishOccupation", "FrenchOccupation"]
    ]

    dimCustomer = pd.concat([dimCustomer, demografia], axis=1)

    businessEntityAddress = person["BusinessEntityAddress"]
    direccion = person["Address"].drop(columns=['rowguid'])
    customer = sales["Customer"].drop(columns=['rowguid'])
    phone = person["PersonPhone"].drop(columns=['ModifiedDate', 'PhoneNumberTypeID'])
    email = person["EmailAddress"].drop(columns=['EmailAddressID', 'rowguid', 'ModifiedDate'])

    dimCustomer = dimCustomer.merge(customer[customer['PersonID'].notna()], left_on='BusinessEntityID', right_on='PersonID', how='inner').drop(columns=['PersonID'])
    dimCustomer = dimCustomer.merge(businessEntityAddress, on='BusinessEntityID', how='left')
    dimCustomer = dimCustomer.merge(direccion, on='AddressID', how='left')
    dimCustomer = dimCustomer.merge(phone, on='BusinessEntityID', how='left')
    dimCustomer = dimCustomer.merge(email, on='BusinessEntityID', how='left')

    dimCustomer['CustomerKey'] = range(11000, 11000 + len(dimCustomer))
    # AccountNumber already arrived with the Customer merge above, so it is
    # moved to the end under its dimension name instead of merging Customer a
    # second time (which would multiply rows if a PersonID ever repeated).
    dimCustomer['CustomerAlternateKey'] = dimCustomer.pop('AccountNumber')

    # ModifiedDate_x/_y come from the Customer and BusinessEntityAddress
    # merges; the unsuffixed ModifiedDate comes from Address.
    dimCustomer = dimCustomer.drop(columns=[
        'BusinessEntityID', 'Demographics', 'CustomerID', 'StoreID', 'TerritoryID',
        'ModifiedDate_x', 'AddressTypeID',
        'rowguid', 'ModifiedDate_y',
        'ModifiedDate',
    ])

    return dimCustomer

def transformDimCurrency(currency):
    """Build the currency dimension from the source currency table.

    Args:
        currency: DataFrame with the columns ``CurrencyCode`` and ``Name``.

    Returns:
        pandas.DataFrame: Columns ``CurrencyKey`` (1-based sequence),
        ``CurrencyAlternateKey`` (source code) and ``CurrencyName``.
    """
    dimCurrency = currency[["CurrencyCode", "Name"]].rename(columns={
        "CurrencyCode": "CurrencyAlternateKey",
        "Name": "CurrencyName",
    })
    # The surrogate key goes first, numbered from 1 in source row order.
    dimCurrency.insert(0, "CurrencyKey", range(1, len(dimCurrency) + 1))
    return dimCurrency

# Build the currency and customer dimensions from the extracted tables.
dimCurrency = transformDimCurrency(sales["Currency"])
dimCustomer = transformDimCustomer(person, sales)

In [8]:
def transformDimReseller(customer, salesOrderHeader, personPhone, personAddress, personBusinessEntityAddress, demographics, dimGeography, stateProvince):
    """Build the reseller dimension from store customers and related tables.

    Args:
        customer: DataFrame with ``CustomerID``, ``PersonID``, ``StoreID``
            and ``AccountNumber``.
        salesOrderHeader: DataFrame with ``CustomerID`` and ``OrderDate``.
        personPhone: DataFrame with ``BusinessEntityID`` and ``PhoneNumber``.
        personAddress: DataFrame with ``AddressID``, ``AddressLine1``,
            ``AddressLine2``, ``PostalCode``, ``City`` and ``StateProvinceID``.
        personBusinessEntityAddress: DataFrame with ``BusinessEntityID`` and
            ``AddressID``.
        demographics: DataFrame with ``BusinessEntityID``, ``ResellerName``,
            ``BusinessType``, ``NumberEmployees``, ``AnnualSales``,
            ``BankName``, ``AnnualRevenue``, ``YearOpened`` and
            ``ProductLine``.
        dimGeography: DataFrame with ``GeographyKey``, ``PostalCode``,
            ``City``, ``StateProvinceCode`` and ``CountryRegionCode``.
        stateProvince: DataFrame with ``StateProvinceID``,
            ``StateProvinceCode`` and ``CountryRegionCode``.

    Returns:
        DataFrame with the columns listed in ``column_order``, de-duplicated
        on ``ResellerKey``, plus one trailing "unknown" row with key 0.
    """
    dimReseller = pd.DataFrame(columns=[
        "ResellerKey", "ResellerAlternateKey", 
         "OrderFrequency", "OrderMonth", "FirstOrderYear", "LastOrderYear", "IDStore"
    ])

    # Customers that have both a person and a store. Used only to obtain the
    # CustomerID that is later matched against salesOrderHeader.

    customersNoNulos = customer[
        customer["PersonID"].notna() & customer["StoreID"].notna()
    ].copy()  

    # Rename CustomerID to CustomerStoreID
    customersNoNulos = customersNoNulos.rename(columns={"CustomerID": "CustomerStoreID"})

    # From here on, `customer` holds every customer row that has a store.

    customer = customer[customer["StoreID"].notna()]


    dimReseller["ResellerKey"] = customer["CustomerID"]
    dimReseller["ResellerAlternateKey"] = customer["AccountNumber"]
    dimReseller["IDStore"] = customer["StoreID"]


    # Attributes that come from the store demographics
    dimReseller = dimReseller.merge(
        demographics[["BusinessEntityID", "ResellerName", "BusinessType", "NumberEmployees", "AnnualSales", "BankName", "AnnualRevenue", "YearOpened", "ProductLine"]],
        left_on="IDStore",
        right_on="BusinessEntityID",
        how="left"
    ).drop(columns=["BusinessEntityID"])

    # Phone
    # NOTE(review): the join key relies on the assumption stated below that a
    # store's contact person has BusinessEntityID == StoreID - 1 — confirm.
    dimReseller = dimReseller.merge(
        personPhone[["BusinessEntityID", "PhoneNumber"]],
        left_on=dimReseller["IDStore"] - 1, # PersonID is StoreID - 1
        right_on="BusinessEntityID",
        how="left"
    ).drop(columns=["BusinessEntityID"]) \
     .rename(columns={"PhoneNumber": "Phone"})
    
    # Address
    dimReseller = dimReseller.merge(
        personBusinessEntityAddress[["BusinessEntityID", "AddressID"]],
        left_on=dimReseller["IDStore"],
        right_on="BusinessEntityID",
        how="left"
    ).drop(columns=["BusinessEntityID"])

    dimReseller = dimReseller.merge(
        personAddress[["AddressID", "AddressLine1", "AddressLine2", "PostalCode",  "City", "StateProvinceID"]],
        on="AddressID",
        how="left"
    )

    # GeographyKey: resolved in two steps. First bring in the state/province
    # and country codes, then match dimGeography on the composite key
    # (PostalCode, City, StateProvinceCode, CountryRegionCode).

    dimReseller = dimReseller.merge(
        stateProvince[["StateProvinceID", "StateProvinceCode", "CountryRegionCode"]],
        on="StateProvinceID",
        how="left"
    )

    dimReseller = dimReseller.merge(
        dimGeography[["GeographyKey", "PostalCode", "City", "StateProvinceCode", "CountryRegionCode"]],
        on=["PostalCode", "City", "StateProvinceCode", "CountryRegionCode"],
        how="left"
    )

    # Business type: translate the source code into its description
    # (codes missing from the dict become NaN).
    codeBusiness = {"BM": "Value Added Reseller", "BS": "Specialty Bike Shop", "OS": "Warehouse"}
    dimReseller["BusinessType"] = dimReseller["BusinessType"].map(codeBusiness)

    # Orders: attach the store's CustomerStoreID, then one row per order.
    dimReseller = dimReseller.merge(
        customersNoNulos[["CustomerStoreID", "StoreID"]],
        left_on=dimReseller["IDStore"],
        right_on="StoreID",
        how="left"
    )
    

    dimReseller = dimReseller.merge(
        salesOrderHeader[["CustomerID", "OrderDate"]],
        left_on=dimReseller["CustomerStoreID"],
        right_on="CustomerID",
        how="left"
    )

    # Per-store order statistics. OrderMonth is computed per order row, so
    # the value kept for a reseller is the one on the row that survives the
    # drop_duplicates call further down.
    order_counts = dimReseller.groupby("CustomerStoreID")["OrderDate"].count()
    dimReseller["OrderFrequency"] = dimReseller["CustomerStoreID"].map(order_counts)
    dimReseller["OrderMonth"] = dimReseller["OrderDate"].dt.month
    dimReseller["FirstOrderYear"] = dimReseller.groupby("CustomerStoreID")["OrderDate"].transform("min").dt.year
    dimReseller["LastOrderYear"]  = dimReseller.groupby("CustomerStoreID")["OrderDate"].transform("max").dt.year

    # Frequency: turn the order count into a code. The first matching
    # condition wins: >= 20 -> "A", >= 10 -> "Q", anything else -> "S".
    
    conditions = [
    dimReseller["OrderFrequency"] >= 20,
    dimReseller["OrderFrequency"] >= 10
    ]

    values = ["A", "Q"]

    dimReseller["OrderFrequency"] = np.select(conditions, values, default="S" )


    # Cast the integer-valued columns to the nullable Int64 dtype
    cols_int = ["NumberEmployees", "YearOpened",  "OrderMonth", "FirstOrderYear", "LastOrderYear"]

    for c in cols_int:
        dimReseller[c] = dimReseller[c].astype("Int64")
    
    column_order = ["ResellerKey", "GeographyKey", "ResellerAlternateKey", "Phone", "BusinessType", "ResellerName", 
                    "NumberEmployees", "OrderFrequency", "OrderMonth", "FirstOrderYear", "LastOrderYear", 
                    "ProductLine", "AddressLine1", "AddressLine2", "AnnualSales", "BankName", 
                    "AnnualRevenue", "YearOpened"]
    # Drop the columns that are not part of column_order
    for col in list(dimReseller.columns):
        if col not in column_order:
            dimReseller = dimReseller.drop(columns=[col])
            
    dimReseller = dimReseller[column_order]
    # The order merge produced one row per order; keep one row per reseller.
    dimReseller = dimReseller.drop_duplicates(subset=["ResellerKey"])
    
    # Placeholder row for an unknown reseller
    unknown_reseller = pd.DataFrame({
        "ResellerKey": [0],
        "GeographyKey": [0],  # Make sure DimGeography also has GeographyKey=0
        "ResellerAlternateKey": ["UNKNOWN"],
        "Phone": ["Unknown"],
        "BusinessType": ["Unknown"],
        "ResellerName": ["Unknown"],
        "NumberEmployees": [0],
        "OrderFrequency": ["U"],  # U = Unknown
        "OrderMonth": [0],
        "FirstOrderYear": [0],
        "LastOrderYear": [0],
        "ProductLine": ["Unknown"],
        "AddressLine1": ["Unknown"],
        "AddressLine2": ["Unknown"],
        "AnnualSales": [0.0],
        "BankName": ["Unknown"],
        "AnnualRevenue": [0.0],
        "YearOpened": [0]
    })

    # Append the placeholder row to the dimension
    dimReseller = pd.concat([dimReseller, unknown_reseller], ignore_index=True)

    return dimReseller

def extractStoreDemographics(engine):
    """Read Sales.Store together with fields shredded from its survey XML.

    The query returns the store id, name and sales person id, plus the
    StoreSurvey values YearOpened, AnnualSales, AnnualRevenue,
    NumberEmployees, BankName, BusinessType and Specialty (aliased as
    ProductLine).

    Args:
        engine: SQLAlchemy engine or connection to run the query on.

    Returns:
        pandas.DataFrame: One row per row of Sales.Store.
    """
    store_survey_sql = """
    WITH XMLNAMESPACES (
        'http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/StoreSurvey' AS ss
    )
    SELECT 
        s.BusinessEntityID AS BusinessEntityID,
        s.Name AS ResellerName,
        s.SalesPersonID AS StorePersonID,

        s.Demographics.value('(ss:StoreSurvey/ss:YearOpened)[1]', 'int') AS YearOpened,
        s.Demographics.value('(ss:StoreSurvey/ss:AnnualSales)[1]', 'money') AS AnnualSales,
        s.Demographics.value('(ss:StoreSurvey/ss:AnnualRevenue)[1]', 'money') AS AnnualRevenue,
        s.Demographics.value('(ss:StoreSurvey/ss:NumberEmployees)[1]', 'int') AS NumberEmployees,
        s.Demographics.value('(ss:StoreSurvey/ss:BankName)[1]', 'nvarchar(100)') AS BankName,
        s.Demographics.value('(ss:StoreSurvey/ss:BusinessType)[1]', 'nvarchar(20)') AS BusinessType,
        s.Demographics.value('(ss:StoreSurvey/ss:Specialty)[1]', 'nvarchar(50)') AS ProductLine

    FROM Sales.Store s;
    """

    store_demographics = pd.read_sql_query(store_survey_sql, con=engine)
    return store_demographics

def transformDimGeography(sales, person):
    """Build the geography dimension from territories, provinces and addresses.

    Args:
        sales: dict of DataFrames with the key ``SalesTerritory``.
        person: dict of DataFrames with the keys ``StateProvince`` and
            ``Address``.

    Returns:
        pandas.DataFrame: One row per distinct address location, with country
        names in three languages, a 1-based ``GeographyKey``, a generated
        ``IpAddressLocator`` and a trailing "Unknown" row with key 0.
    """
    english_by_code = {
        "US" : "United States", "CA" : "Canada", "FR" : "France", "DE" : "Germany",
        "AU" : "Australia", "GB" : "United Kingdom"
    }
    translated_names = {
        "Australia": {"Spanish": "Australia", "French": "Australie"},
        "Canada": {"Spanish": "Canada", "French": "Canada"},
        "Germany": {"Spanish": "Alemania", "French": "Allemagne"},
        "France": {"Spanish": "Francia", "French": "France"},
        "United Kingdom": {"Spanish": "Reino Unido", "French": "Royaume-Uni"},
        "United States": {"Spanish": "Estados Unidos", "French": "États-Unis"},
    }

    # Territory level: keep only the identifying columns.
    territory = (
        sales["SalesTerritory"]
        .drop(columns=[
            'SalesYTD', 'CostYTD', 'CostLastYear', 'rowguid', 'ModifiedDate', 'SalesLastYear',
            'Group', 'Name'
        ])
        .drop_duplicates()
    )
    # Country names; an unmapped code or name raises KeyError.
    territory["EnglishCountryRegionName"] = territory["CountryRegionCode"].map(
        lambda code: english_by_code[code]
    )
    territory["SpanishCountryRegionName"] = territory["EnglishCountryRegionName"].map(
        lambda name: translated_names[name]["Spanish"]
    )
    territory["FrenchCountryRegionName"] = territory["EnglishCountryRegionName"].map(
        lambda name: translated_names[name]["French"]
    )

    # Province and address levels, reduced to their distinct combinations.
    state_province = (
        person["StateProvince"]
        .drop(columns=['CountryRegionCode', 'IsOnlyStateProvinceFlag', 'rowguid', 'ModifiedDate'])
        .drop_duplicates()
    )
    address_location = (
        person["Address"]
        .drop(columns=['AddressID', 'AddressLine1', 'AddressLine2', 'rowguid', 'ModifiedDate'])
        .drop_duplicates()
    )

    # Right joins keep every province, then every address location.
    geography = (
        territory
        .merge(state_province, on='TerritoryID', how='right')
        .rename(columns={'Name':'StateProvinceName'})
        .merge(address_location, on='StateProvinceID', how='right')
        .rename(columns={'TerritoryID':'SalesTerritoryKey'})
    )
    geography["GeographyKey"] = range(1, len(geography) + 1)
    geography["IpAddressLocator"] = generate_unique_ip(geography["GeographyKey"])
    geography = geography.drop(columns=['StateProvinceID'])

    # Placeholder row for an unknown geography.
    unknown_row = pd.DataFrame({
        "GeographyKey": [0],
        "City": ["Unknown"],
        "StateProvinceCode": ["UNK"],
        "StateProvinceName": ["Unknown"],
        "CountryRegionCode": ["UNK"],
        "EnglishCountryRegionName": ["Unknown"],
        "SpanishCountryRegionName": ["Unknown"],
        "FrenchCountryRegionName": ["Unknown"],
        "PostalCode": ["Unknown"],
        "SalesTerritoryKey": [0],  # Make sure DimSalesTerritory has SalesTerritoryKey=0
        "IpAddressLocator": ["Unknown"]
    })

    return pd.concat([geography, unknown_row], ignore_index=True)
  
# Build the geography dimension first: the reseller dimension looks up its
# GeographyKey in it.
dimGeography = transformDimGeography(sales,person)

# Store demographics are read straight from the OLTP engine; dimGeography is
# passed as a copy.
dimReseller = transformDimReseller(
    sales["Customer"], 
    sales["SalesOrderHeader"], 
    person["PersonPhone"], 
    person["Address"], 
    person["BusinessEntityAddress"], 
    extractStoreDemographics(oltp),
    dimGeography.copy(),
    person["StateProvince"]
)

In [None]:
def transformFactResellerSales(product, salesOrderDetail, salesOrderHeader, dimCurrency, currencyRate, dimReseller, dimEmployee=None):
    """Build the reseller sales fact table at order-line grain.

    Args:
        product: DataFrame with ``ProductID`` and ``StandardCost``.
        salesOrderDetail: DataFrame with one row per order line
            (``SalesOrderID``, ``ProductID``, ``SpecialOfferID``,
            ``OrderQty``, ``UnitPrice``, ``UnitPriceDiscount``,
            ``LineTotal``, ``CarrierTrackingNumber``). It is not modified.
        salesOrderHeader: DataFrame with one row per order. It must contain
            ``SalesPersonID`` when ``dimEmployee`` is given.
        dimCurrency: DataFrame with ``CurrencyAlternateKey`` and
            ``CurrencyKey``.
        currencyRate: DataFrame with ``CurrencyRateID`` and
            ``ToCurrencyCode``.
        dimReseller: DataFrame with ``ResellerKey``.
        dimEmployee: Optional DataFrame with ``EmployeeKey`` and
            ``EmployeeNationalIDAlternateKey``. When omitted,
            ``EmployeeKey`` is left missing for every row.

    Returns:
        pandas.DataFrame: One row per order line with the columns listed in
        ``column_order``. Orders without a matching currency or reseller get
        key 0 in ``CurrencyKey`` / ``ResellerKey``.
    """
    salesOrderDetail = salesOrderDetail.copy()
    # Number the lines of each order 1..n in their source order.
    salesOrderDetail["SalesOrderLineNumber"] = (
        salesOrderDetail.groupby("SalesOrderID").cumcount() + 1
    )

    # Start building the fact table from the order lines.
    factResellerSales = salesOrderDetail[["ProductID", "SalesOrderID", "SpecialOfferID",
                                          "SalesOrderLineNumber", "OrderQty", "UnitPrice",
                                          "UnitPriceDiscount", "LineTotal", "CarrierTrackingNumber"]].rename(
        columns={"ProductID": "ProductKey"}
    )

    # Order-level attributes. SalesPersonID is needed only for the employee
    # lookup below, which used to fail because the column was never selected.
    header_columns = ["SalesOrderID", "SalesOrderNumber", "RevisionNumber", "OrderDate",
                      "DueDate", "ShipDate", "CustomerID", "TerritoryID",
                      "Freight", "CurrencyRateID", "TaxAmt"]
    if dimEmployee is not None:
        header_columns.append("SalesPersonID")

    factResellerSales = factResellerSales.merge(
        salesOrderHeader[header_columns],
        on="SalesOrderID",
        how="left"
    ).rename(columns={
        "SpecialOfferID": "PromotionKey",
        "OrderQty": "OrderQuantity",
        "UnitPriceDiscount": "UnitPriceDiscountPct",
        "TerritoryID": "SalesTerritoryKey",
        "LineTotal": "SalesAmount"
    }).drop(columns=["SalesOrderID"])

    # Reseller key: the order's CustomerID matched against ResellerKey.
    factResellerSales = factResellerSales.merge(
        dimReseller[["ResellerKey"]],
        left_on="CustomerID",
        right_on="ResellerKey",
        how="left"
    ).drop(columns=["CustomerID"])

    # Standard cost of the product on each line.
    factResellerSales = factResellerSales.merge(
        product[["ProductID", "StandardCost"]],
        left_on="ProductKey",
        right_on="ProductID",
        how="left"
    ).rename(columns={"StandardCost": "ProductStandardCost"}) \
    .drop(columns=["ProductID"])

    # Currency key: rate id -> target currency code -> CurrencyKey.
    factResellerSales = factResellerSales.merge(
        currencyRate[["CurrencyRateID", "ToCurrencyCode"]],
        on="CurrencyRateID",
        how="left"
    ).drop(columns=["CurrencyRateID"]).merge(
        dimCurrency[["CurrencyAlternateKey", "CurrencyKey"]],
        left_on="ToCurrencyCode",
        right_on="CurrencyAlternateKey",
        how="left"
    ).drop(columns=["CurrencyAlternateKey", "ToCurrencyCode"])

    if dimEmployee is None:
        # No employee dimension supplied: keep the column, with no key set.
        factResellerSales["EmployeeKey"] = pd.Series(
            pd.NA, index=factResellerSales.index, dtype="Int64"
        )
    else:
        # NOTE(review): SalesPersonID is matched against
        # EmployeeNationalIDAlternateKey, as the original code intended —
        # confirm that dimEmployee stores the same identifier in that column.
        factResellerSales = factResellerSales.merge(
            dimEmployee[["EmployeeKey", "EmployeeNationalIDAlternateKey"]],
            left_on="SalesPersonID",
            right_on="EmployeeNationalIDAlternateKey",
            how="left"
        ).drop(columns=["EmployeeNationalIDAlternateKey"])

    def transforma_date(date):
        # yyyymmdd integer key; missing dates stay missing.
        if pd.isna(date):
            return None
        return int(date.strftime("%Y%m%d"))

    factResellerSales["OrderDateKey"] = factResellerSales["OrderDate"].apply(transforma_date).astype("Int64")
    factResellerSales["DueDateKey"] = factResellerSales["DueDate"].apply(transforma_date).astype("Int64")
    factResellerSales["ShipDateKey"] = factResellerSales["ShipDate"].apply(transforma_date).astype("Int64")

    factResellerSales["ExtendedAmount"] = factResellerSales["UnitPrice"] * factResellerSales["OrderQuantity"]
    factResellerSales["DiscountAmount"] = factResellerSales["ExtendedAmount"] * factResellerSales["UnitPriceDiscountPct"]
    factResellerSales["TotalProductCost"] = factResellerSales["ProductStandardCost"] * factResellerSales["OrderQuantity"]

    # Rows without a match fall back to key 0.
    factResellerSales["CurrencyKey"] = factResellerSales["CurrencyKey"].fillna(0).astype(int)
    factResellerSales["ResellerKey"] = factResellerSales["ResellerKey"].fillna(0).astype(int)

    column_order = ["ProductKey", "OrderDateKey", "DueDateKey", "ShipDateKey", "ResellerKey", "EmployeeKey", "PromotionKey", "CurrencyKey",
        "SalesTerritoryKey", "SalesOrderNumber", "SalesOrderLineNumber", "RevisionNumber", "OrderQuantity",
        "UnitPrice", "ExtendedAmount", "UnitPriceDiscountPct", "DiscountAmount", "ProductStandardCost", "TotalProductCost",
        "SalesAmount", "TaxAmt", "Freight", "CarrierTrackingNumber", "OrderDate", "DueDate", "ShipDate"]

    factResellerSales = factResellerSales[column_order]

    return factResellerSales

In [14]:
# Build the reseller sales fact table; the dimensions are passed as copies.
# NOTE(review): transformFactResellerSales also declares a seventh parameter,
# dimEmployee, which this call does not pass.
factResellerSales = transformFactResellerSales(        
    production["Product"], 
    sales["SalesOrderDetail"], 
    sales["SalesOrderHeader"], 
    dimCurrency.copy(), 
    sales["CurrencyRate"], 
    dimReseller.copy()
)

In [15]:
# Every column up to Freight must be NOT NULL.
# Columns that have had NaN values: CustomerKey, CurrencyKey, TaxAmt.
print(factResellerSales.columns)
# Count of missing values per column.
factResellerSales.isna().sum()

Index(['ProductKey', 'OrderDateKey', 'DueDateKey', 'ShipDateKey',
       'PromotionKey', 'CurrencyKey', 'SalesTerritoryKey', 'SalesOrderNumber',
       'SalesOrderLineNumber', 'RevisionNumber', 'OrderQuantity', 'UnitPrice',
       'ExtendedAmount', 'UnitPriceDiscountPct', 'DiscountAmount',
       'ProductStandardCost', 'TotalProductCost', 'SalesAmount', 'TaxAmt',
       'Freight', 'CarrierTrackingNumber', 'OrderDate', 'DueDate', 'ShipDate'],
      dtype='object')


ProductKey               0
OrderDateKey             0
DueDateKey               0
ShipDateKey              0
PromotionKey             0
CurrencyKey              0
SalesTerritoryKey        0
SalesOrderNumber         0
SalesOrderLineNumber     0
RevisionNumber           0
OrderQuantity            0
UnitPrice                0
ExtendedAmount           0
UnitPriceDiscountPct     0
DiscountAmount           0
ProductStandardCost      0
TotalProductCost         0
SalesAmount              0
TaxAmt                   0
Freight                  0
CarrierTrackingNumber    0
OrderDate                0
DueDate                  0
ShipDate                 0
dtype: int64

In [16]:
# Rows where CustomerKey is NaN / not NaN.
# NOTE(review): factInternetSales is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel — confirm where it
# is meant to come from.
nan_customers = factInternetSales[factInternetSales["CustomerKey"].isna()]
no_nan_customers = factInternetSales[factInternetSales["CustomerKey"].notna()]

print(f"Total filas con CustomerKey NaN: {len(nan_customers)}")
print(f"Total filas sin CustomerKey NaN: {len(no_nan_customers)}")

# Number of distinct CustomerID values in salesOrderHeader
num_unique_customerID = sales["SalesOrderHeader"]["CustomerID"].nunique()
print(f"CustomerID únicos en salesOrderHeader: {num_unique_customerID}")

# Number of rows in the customer dimension
num_rows_customer = len(dimCustomer)
print(f"Filas totales en customer: {num_rows_customer}")

NameError: name 'factInternetSales' is not defined