**Enunciado:** Realizar una herramienta ETL que permita obtener el datamart ventas por internet (Internet Sales) y ventas por revendedores (Reseller Sales) a partir de la base de datos operacional. Documentar las dimensiones que participan y las tablas de hechos (50%). (Octubre 22)

* ¿Cuáles son las dimensiones que participan?

* ¿Cuál es el nivel de granularidad de los datos?

* ¿Cuáles son los indicadores (Medidas) de cada tabla de hechos?

In [1]:
import pandas as pd
from sqlalchemy import create_engine, inspect
import yaml
import os
import numpy as np

In [2]:
config_path = os.path.join(os.getcwd(), "config.yml")

with open(config_path, 'r') as f:
    config = yaml.safe_load(f)
    config_oltp = config['OLTP']
    config_olap = config['OLAP']

url_oltp = (f"mssql+pyodbc://{config_oltp['user']}:{config_oltp['password']}@{config_oltp['host']},{config_oltp['port']}/{config_oltp['dbname']}"
          f"?driver={config_oltp['drivername'].replace(' ', '+')}")

url_olap = (f"mssql+pyodbc://{config_olap['user']}:{config_olap['password']}@{config_olap['host']},{config_olap['port']}/{config_olap['dbname']}"
           f"?driver={config_olap['drivername'].replace(' ', '+')}")
oltp = create_engine(url_oltp)
olap = create_engine(url_olap)

Leer las tablas

In [3]:
def cargaSegura(engine, schema, table):
    inspector = inspect(engine)

    # Obtener columnas
    columnas = [col["name"] for col in inspector.get_columns(table, schema=schema)]
    columnas_problematicas = []

    # Intentar cargar tabla completa
    try:
        return pd.read_sql_table(table_name=table, con=engine, schema=schema)
    except Exception:
        pass

    # Detectar columnas problemáticas
    for col in columnas:
        try:
            pd.read_sql_query(
                f'SELECT TOP 10 "{col}" FROM "{schema}"."{table}"',
                con=engine
            )
        except Exception:
            columnas_problematicas.append(col)


    # Columnas buenas
    columnas_ok = [col for col in columnas if col not in columnas_problematicas]

    # Si no hay columnas válidas
    if not columnas_ok:
        print(f"⚠ La tabla {schema}.{table} no tiene columnas convertibles. Retornando dataframe vacío.")
        return pd.DataFrame()

    # Cargar solo columnas válidas
    query = (
        f'SELECT {", ".join([f"""\"{c}\"""" for c in columnas_ok])} '
        f'FROM "{schema}"."{table}"'
    )

    df = pd.read_sql_query(query, con=engine)
    return df


def extractHumanResources(conection):
    tablas = [
        "Shift", "Department", "Employee", "EmployeeDepartmentHistory", "EmployeePayHistory"
    ]
    humanResources = {}
    for tabla in tablas:
        df = cargaSegura(conection, "HumanResources", tabla)
        humanResources[tabla] = df
        
    return humanResources

def extractPerson(conection):
    tablas = [
        "PersonPhone", "PhoneNumberType", "Address", "AddressType",
        "StateProvince", "BusinessEntity", "BusinessEntityAddress", "BusinessEntityContact",
        "ContactType", "CountryRegion", "EmailAddress", "Password", "Person"
    ]
    person = {}
    for tabla in tablas:
        df = cargaSegura(conection, "Person", tabla)
        person[tabla] = df
        
    return person

def extractProduction(conection):
    tablas = [
        "Product", "ScrapReason", "ProductCategory", "ProductCostHistory", "ProductDescription",
        "ProductDocument", "ProductInventory", "ProductListPriceHistory", "ProductModel",
        "ProductModelIllustration", "ProductModelProductDescriptionCulture", "BillOfMaterials",
        "ProductPhoto", "ProductProductPhoto", "TransactionHistory", "ProductReview",
        "TransactionHistoryArchive", "ProductSubcategory", "UnitMeasure", "WorkOrder",
        "Culture", "WorkOrderRouting", "Document", "Illustration", "Location"
    ]
    production = {}
    for tabla in tablas:
        df = cargaSegura(conection, "Production", tabla)
        production[tabla] = df
        
    return production

def extractPurchasing(conection):
    tablas = [
        "ShipMethod", "ProductVendor", "Vendor", "PurchaseOrderDetail", "PurchaseOrderHeader"
    ]
    purchasing = {}
    for tabla in tablas:
        df = cargaSegura(conection, "Purchasing", tabla)
        purchasing[tabla] = df
        
    return purchasing

def extractSales(conection):
    tablas = [
        "CountryRegionCurrency", "CreditCard", "Currency", "CurrencyRate", "Customer",
        "PersonCreditCard", "SalesOrderDetail", "SalesOrderHeader",
        "SalesOrderHeaderSalesReason", "SalesPerson",
        "SalesPersonQuotaHistory", "SalesReason", "SalesTaxRate",
        "SalesTerritory", "SalesTerritoryHistory", "ShoppingCartItem",
        "SpecialOffer", "SpecialOfferProduct", "Store"
    ]
    sales = {}
    for tabla in tablas:
        df = cargaSegura(conection, "Sales", tabla)
        sales[tabla] = df
        
    return sales

In [4]:
def extractEmployeeHierarchy(engine):
    query = """
    SELECT 
        e.BusinessEntityID AS EmployeeID,
        e.NationalIDNumber AS EmployeeNationalIDAlternateKey,
        e.OrganizationNode.ToString() AS OrgNode,
        m.BusinessEntityID AS ParentEmployeeKey,
        m.NationalIDNumber AS ParentEmployeeNationalIDAlternateKey
    FROM HumanResources.Employee e
    LEFT JOIN HumanResources.Employee m
        ON e.OrganizationNode.GetAncestor(1) = m.OrganizationNode;
    """
    return pd.read_sql_query(query, con=engine)

In [5]:
humanResources =  extractHumanResources(oltp)
person = extractPerson(oltp)
production = extractProduction(oltp)
purchasing = extractPurchasing(oltp) #Funciona
sales = extractSales(oltp) #Funciona

  columnas = [col["name"] for col in inspector.get_columns(table, schema=schema)]
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  columnas = [col["name"] for col in inspector.get_columns(table, schema=schema)]
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  columnas = [col["name"] for col in inspector.get_columns(table, schema=schema)]
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  columnas = [col["name"] for col in inspector.get_columns(table, schema=schema)]
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.reflect(bind=self.con, only=[table_name], views=True)
  self.meta.

In [6]:
sales["Currency"].describe(include='all')

Unnamed: 0,CurrencyCode,Name,ModifiedDate
count,105,105,105
unique,105,105,
top,AED,Emirati Dirham,
freq,1,1,
mean,,,2008-04-30 00:00:00
min,,,2008-04-30 00:00:00
25%,,,2008-04-30 00:00:00
50%,,,2008-04-30 00:00:00
75%,,,2008-04-30 00:00:00
max,,,2008-04-30 00:00:00


In [7]:
def transformDimCurrency(currency):
    dimCurrency = pd.DataFrame(columns=[
        "CurrencyKey", "CurrencyAlternateKey", "CurrencyName"
    ])
    
    dimCurrency["CurrencyAlternateKey"] = currency["CurrencyCode"] 
    dimCurrency["CurrencyName"] = currency["Name"] 
    dimCurrency["CurrencyKey"] = range(1, len(dimCurrency) + 1)
    
    return dimCurrency

In [8]:
dimCurrency = transformDimCurrency(sales["Currency"])
dimCurrency

Unnamed: 0,CurrencyKey,CurrencyAlternateKey,CurrencyName
0,1,AED,Emirati Dirham
1,2,AFA,Afghani
2,3,ALL,Lek
3,4,AMD,Armenian Dram
4,5,ANG,Netherlands Antillian Guilder
...,...,...,...
100,101,VEB,Bolivar
101,102,VND,Dong
102,103,XOF,CFA Franc BCEAO
103,104,ZAR,Rand


In [9]:
hierarchy = extractEmployeeHierarchy(oltp)
hierarchy.head()

Unnamed: 0,EmployeeID,EmployeeNationalIDAlternateKey,OrgNode,ParentEmployeeKey,ParentEmployeeNationalIDAlternateKey
0,1,295847284,,,
1,2,245797967,/1/,,
2,3,509647174,/1/1/,2.0,245797967.0
3,4,112457891,/1/1/1/,3.0,509647174.0
4,5,695256908,/1/1/2/,3.0,509647174.0


In [10]:
def transformDimEmployee(employee, employeePayHistory, employeeDepartmentHistory, department, salesPerson, person, emailAddress, personPhone, hierarchy):
    dimEmployee = pd.DataFrame(columns=[
        "EmployeeKey", "EmployeeNationalIDAlternateKey", "Title", "HireDate", "BirthDate", "LoginID",
        "MaritalStatus", "SalariedFlag", "Gender",
        "VacationHours", "SickLeaveHours", "CurrentFlag", "SalesPersonFlag", "Status"
    ])

    employeePayHistory = (
        employeePayHistory.sort_values("RateChangeDate")
        .groupby("BusinessEntityID")
        .tail(1)
    )
    
    dimEmployee["EmployeeKey"] = employee["BusinessEntityID"]
    dimEmployee["EmployeeNationalIDAlternateKey"] = employee["NationalIDNumber"]
    dimEmployee["Title"] = employee["JobTitle"]
    dimEmployee["HireDate"] = employee["HireDate"]
    dimEmployee["BirthDate"] = employee["BirthDate"]
    dimEmployee["LoginID"] = employee["LoginID"]
    dimEmployee["MaritalStatus"] = employee["MaritalStatus"]
    dimEmployee["SalariedFlag"] = employee["SalariedFlag"].astype(int)
    dimEmployee["Gender"] = employee["Gender"]
    dimEmployee["VacationHours"] = employee["VacationHours"]
    dimEmployee["SickLeaveHours"] = employee["SickLeaveHours"]
    dimEmployee["CurrentFlag"] = employee["CurrentFlag"].astype(int)

    dimEmployee = dimEmployee.merge(
        hierarchy[["EmployeeID", "ParentEmployeeKey", "ParentEmployeeNationalIDAlternateKey"]],
        left_on="EmployeeKey",
        right_on="EmployeeID",
        how="left"
    ).drop(columns=["EmployeeID"])

    dimEmployee = dimEmployee.merge(
        salesPerson[["BusinessEntityID", "TerritoryID"]],
        left_on="EmployeeKey",
        right_on="BusinessEntityID",
        how="left"
    ).drop(columns=["BusinessEntityID"]) \
     .rename(columns={"TerritoryID": "SalesTerritoryKey"})
    
    dimEmployee["SalesTerritoryKey"] = dimEmployee["SalesTerritoryKey"].fillna(11)

    dimEmployee = dimEmployee.merge(
        person[["BusinessEntityID", "FirstName", "LastName", "MiddleName", "NameStyle"]],
        left_on="EmployeeKey",
        right_on="BusinessEntityID",
        how="left"
    ).drop(columns=["BusinessEntityID"])

    dimEmployee["NameStyle"] = dimEmployee["NameStyle"].astype(int)

    dimEmployee = dimEmployee.merge(
        emailAddress[["BusinessEntityID", "EmailAddress"]],
        left_on="EmployeeKey",
        right_on="BusinessEntityID",
        how="left"
    ).drop(columns=["BusinessEntityID"])

    dimEmployee = dimEmployee.merge( 
        personPhone[["BusinessEntityID", "PhoneNumber"]], 
        left_on="EmployeeKey", 
        right_on="BusinessEntityID", 
        how="left" 
    ).drop(columns=["BusinessEntityID"]) \
     .rename(columns={"PhoneNumber": "Phone"})
    
    dimEmployee = dimEmployee.merge(
        employeePayHistory[["BusinessEntityID", "PayFrequency", "Rate"]],
        left_on="EmployeeKey",
        right_on="BusinessEntityID",
        how="left"
    ).drop(columns=["BusinessEntityID"]) \
     .rename(columns={"Rate": "BaseRate"})

    
    dimEmployee = dimEmployee.merge(
        employeeDepartmentHistory[["BusinessEntityID", "DepartmentID", "StartDate", "EndDate"]],
        left_on="EmployeeKey",
        right_on="BusinessEntityID",
        how="left"
    ).merge(
        department[["DepartmentID", "Name"]],
        left_on="DepartmentID",
        right_on="DepartmentID",
        how="left"
    ).rename(columns={"Name": "DepartmentName"}).drop(columns=["BusinessEntityID", "DepartmentID"])


    dimEmployee["SalesPersonFlag"] = np.where(
        (dimEmployee["DepartmentName"].str.contains("Sales", na=False)) &
        (dimEmployee["Title"] != "Vice President of Engineering"),
        1,
        0
    )

    dimEmployee["Status"] = np.where(
        dimEmployee["EndDate"].isna(),
        "Current",
        None
    )

    column_order = [
        "EmployeeKey", "ParentEmployeeKey", "EmployeeNationalIDAlternateKey", "ParentEmployeeNationalIDAlternateKey", 
        "SalesTerritoryKey", "FirstName", "LastName", "MiddleName", "NameStyle", "Title", "HireDate", "BirthDate", 
        "LoginID", "EmailAddress", "Phone", "MaritalStatus", "SalariedFlag", "Gender", "PayFrequency", "BaseRate", 
        "VacationHours", "SickLeaveHours", "CurrentFlag", "SalesPersonFlag", "DepartmentName", "StartDate", "EndDate","Status"
    ]

    dimEmployee = dimEmployee[column_order]
    dimEmployee["EmployeeKey"] = range(1, len(dimEmployee) + 1)
    lookup = dimEmployee.set_index("EmployeeNationalIDAlternateKey")["EmployeeKey"].to_dict()
    dimEmployee["ParentEmployeeKey"] = dimEmployee["ParentEmployeeNationalIDAlternateKey"].map(lookup)

    ceo_key = dimEmployee.loc[
        dimEmployee["Title"] == "Chief Executive Officer", "EmployeeKey"
    ].iloc[0]
    dimEmployee.loc[
        dimEmployee["ParentEmployeeKey"].isna() & (dimEmployee["EmployeeKey"] != ceo_key),
        ["ParentEmployeeKey", "ParentEmployeeNationalIDAlternateKey"]
    ] = [
        ceo_key,
        dimEmployee.loc[dimEmployee["EmployeeKey"] == ceo_key, "EmployeeNationalIDAlternateKey"].iloc[0]
    ]
    
    return dimEmployee

In [11]:
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

In [12]:
dim = transformDimEmployee(
    humanResources["Employee"],
    humanResources["EmployeePayHistory"],
    humanResources["EmployeeDepartmentHistory"],
    humanResources["Department"],
    sales["SalesPerson"],
    person["Person"],
    person["EmailAddress"],
    person["PersonPhone"],
    hierarchy
)

In [13]:
import xml.etree.ElementTree as ET
def extraerDemografia(df, xml_col):
    data = []
    
    for xml_str in df[xml_col]:
        try:
            root = ET.fromstring(xml_str)
            row = {child.tag.split('}')[1]: child.text for child in root}
            data.append(row)
        except ET.ParseError:
            # En caso de que haya XML mal formado
            data.append({})
    
    df_parsed = pd.DataFrame(data)
    
    # Columnas numéricas conocidas
    numeric_cols = [
        'TotalPurchaseYTD', 'TotalChildren', 'NumberChildrenAtHome',
        'NumberCarsOwned', 'HomeOwnerFlag'
    ]
    
    for col in numeric_cols:
        if col in df_parsed.columns:
            df_parsed[col] = pd.to_numeric(df_parsed[col], errors='coerce')
    
    # Columnas de fecha conocidas
    date_cols = ['BirthDate', 'DateFirstPurchase']
    
    for col in date_cols:
        if col in df_parsed.columns:
            df_parsed[col] = df_parsed[col].str.replace('Z','', regex=False)  # quitar la Z
            df_parsed[col] = pd.to_datetime(df_parsed[col], errors='coerce', format='%Y-%m-%d')
    
    return df_parsed

In [57]:
def transformDimCustomer(person, sales):
    #Tipos IN = Individual Customer
    dimCustomer = person["Person"][person["Person"]["PersonType"] == 'IN'].copy()
    dimCustomer = dimCustomer.drop(columns=[
        'PersonType', 'EmailPromotion', 'AdditionalContactInfo', 'ModifiedDate', 'rowguid'
    ])
    
    demografia = extraerDemografia(dimCustomer,"Demographics").drop(columns=[
        'TotalPurchaseYTD'
    ])
    demografia = demografia.rename(columns={
        'Education': 'EnglishEducation',
        'Occupation': 'EnglishOccupation',
    })
    
    #Añadir español y francés
    education_map = {
        "Bachelors": {"Spanish": "Licenciatura", "French": "Bac + 4"},
        "Graduate Degree": {"Spanish": "Estudios de postgrado", "French": "Bac + 3"},
        "High School": {"Spanish": "Educación secundaria", "French": "Bac + 2"},
        "Partial College": {"Spanish": "Estudios universitarios (en curso)", "French": "Baccalauréat"},
        "Partial High School": {"Spanish": "Educación secundaria (en curso)", "French": "Niveau bac"}
    }
    occupation_map = {
        "Clerical": {"Spanish": "Administrativo", "French": "Employé"},
        "Management": {"Spanish": "Gestión", "French": "Direction"},
        "Manual": {"Spanish": "Obrero", "French": "Ouvrier"},
        "Professional": {"Spanish": "Profesional", "French": "Cadre"},
        "Skilled Manual": {"Spanish": "Obrero especializado", "French": "Technicien"}
    }
    demografia["EnglishEducation"] = demografia["EnglishEducation"].str.strip()
    demografia["EnglishOccupation"] = demografia["EnglishOccupation"].str.strip()

    demografia["SpanishEducation"] = demografia["EnglishEducation"].map(lambda x: education_map[x]["Spanish"])
    demografia["FrenchEducation"] = demografia["EnglishEducation"].map(lambda x: education_map[x]["French"])
    
    demografia["SpanishOccupation"] = demografia["EnglishOccupation"].map(lambda x: occupation_map[x]["Spanish"])
    demografia["FrenchOccupation"] = demografia["EnglishOccupation"].map(lambda x: occupation_map[x]["French"])
    
    dimCustomer = pd.concat([dimCustomer, demografia], axis=1)
    
    businessEntityAddress = person["BusinessEntityAddress"]
    direccion = person["Address"].drop(columns=['rowguid'])
    customer = sales["Customer"].drop(columns=['rowguid'])
    phone = person["PersonPhone"].drop(columns=['ModifiedDate', 'PhoneNumberTypeID'])
    email = person["EmailAddress"].drop(columns=['EmailAddressID', 'rowguid', 'ModifiedDate'])
    
    dimCustomer = dimCustomer.merge(customer[customer['PersonID'].notna()], left_on='BusinessEntityID', right_on='PersonID', how='inner').drop(columns=['PersonID'])
    dimCustomer = dimCustomer.merge(businessEntityAddress, on='BusinessEntityID', how='left')
    dimCustomer = dimCustomer.merge(direccion, on='AddressID', how='left')
    dimCustomer = dimCustomer.merge(phone, on='BusinessEntityID', how='left')
    dimCustomer = dimCustomer.merge(email, on='BusinessEntityID', how='left')

    dimCustomer['CustomerKey'] = range(11000, 11000 + len(dimCustomer))
    dimCustomer = dimCustomer.merge(
        customer[customer['PersonID'].notna()][['PersonID', 'AccountNumber']],
        left_on='BusinessEntityID',
        right_on='PersonID',
        how='left'
    ).rename(columns={'AccountNumber_y': 'CustomerAlternateKey'})

    
    dimCustomer = dimCustomer.drop(columns=['BusinessEntityID', 'Demographics', 'CustomerID', 'StoreID', 'TerritoryID', 
       'ModifiedDate_x', 'AddressID', 'AddressTypeID', 'PersonID',
       'rowguid', 'ModifiedDate_y', 'City', 'AccountNumber_x',
       'StateProvinceID', 'ModifiedDate',       
    ])
    
    return dimCustomer

In [58]:
dimCustomer = transformDimCustomer(person, sales)

In [59]:
dimCustomer

Unnamed: 0,NameStyle,Title,FirstName,MiddleName,LastName,Suffix,DateFirstPurchase,BirthDate,MaritalStatus,YearlyIncome,Gender,TotalChildren,NumberChildrenAtHome,EnglishEducation,EnglishOccupation,HomeOwnerFlag,NumberCarsOwned,CommuteDistance,SpanishEducation,FrenchEducation,SpanishOccupation,FrenchOccupation,AddressLine1,AddressLine2,PostalCode,PhoneNumber,EmailAddress,CustomerKey,CustomerAlternateKey
0,False,Mr.,David,R.,Robinett,,2001-09-21,1965-12-04,S,25001-50000,F,2.0,2.0,Partial College,Clerical,1.0,1.0,0-1 Miles,Estudios universitarios (en curso),Baccalauréat,Administrativo,Employé,Pappelallee 6667,,42651,238-555-0100,david22@adventure-works.com,11000,AW00011377
1,False,Ms.,Rebecca,A.,Robinson,,2002-06-14,1972-06-01,M,greater than 100000,F,4.0,5.0,High School,Management,1.0,4.0,0-1 Miles,Educación secundaria,Bac + 2,Gestión,Direction,1861 Chinquapin Ct,,3198,648-555-0100,rebecca3@adventure-works.com,11001,AW00011913
2,False,Ms.,Dorothy,B.,Robinson,,2003-08-20,1959-08-08,M,50001-75000,F,4.0,0.0,Graduate Degree,Skilled Manual,1.0,0.0,0-1 Miles,Estudios de postgrado,Bac + 3,Obrero especializado,Technicien,4693 Mills Dr.,,3220,423-555-0100,dorothy3@adventure-works.com,11002,AW00011952
3,False,Ms.,Carol Ann,F.,Rockne,,2003-10-08,1955-11-20,M,50001-75000,F,2.0,0.0,High School,Professional,0.0,2.0,5-10 Miles,Educación secundaria,Bac + 2,Profesional,Cadre,1312 Skycrest Drive,,LA1 1LN,439-555-0100,carolann0@adventure-works.com,11003,AW00020164
4,False,Mr.,Scott,M.,Rodgers,,2003-04-26,1958-01-23,M,greater than 100000,F,1.0,1.0,Graduate Degree,Management,1.0,4.0,1-2 Miles,Estudios de postgrado,Bac + 3,Gestión,Direction,9860 Brookview Drive,,4169,989-555-0100,scott10@adventure-works.com,11004,AW00020211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,False,,Crystal,,Guo,,NaT,NaT,,,,,,,,,,,,,,,988 Mt. Everest Court,,BD1 4SJ,1 (11) 500 555-0171,crystal18@adventure-works.com,29503,AW00019379
18504,False,,Isabella,F,Richardson,,NaT,NaT,,,,,,,,,,,,,,,7413 Alpine Drive,,90505,910-555-0166,isabella91@adventure-works.com,29504,AW00013933
18505,False,,Crystal,S,He,,NaT,NaT,,,,,,,,,,,,,,,4764 East Avenue,,98312,813-555-0148,crystal19@adventure-works.com,29505,AW00024634
18506,False,,Crystal,,Zheng,,NaT,NaT,,,,,,,,,,,,,,,"34334, rue Jean Mermoz",,78000,1 (11) 500 555-0171,crystal20@adventure-works.com,29506,AW00021127


In [67]:
def transformFactInternetSales(product, salesOrderDetail, salesOrderHeader, customer, dimCustomer, dimCurrency, currencyRate, stateProvince, salesTaxRate):
  
  # Start with salesOrderDetail instead of empty DataFrame
  salesOrderDetail = salesOrderDetail.copy()
  salesOrderDetail["SalesOrderLineNumber"] = (
    salesOrderDetail.groupby("SalesOrderID").cumcount() + 1
  )

  # Start building factInternetSales from salesOrderDetail
  factInternetSales = salesOrderDetail[["ProductID", "SalesOrderID", "SpecialOfferID", 
                                         "SalesOrderLineNumber", "OrderQty", "UnitPrice", 
                                         "UnitPriceDiscount", "LineTotal", "CarrierTrackingNumber"]].rename(
    columns={"ProductID": "ProductKey"}
  )

  # Now merge with salesOrderHeader
  factInternetSales = factInternetSales.merge(
        salesOrderHeader[["SalesOrderID", "SalesOrderNumber", "RevisionNumber", "OrderDate", 
                         "DueDate", "ShipDate", "CustomerID", "TerritoryID", 
                         "Freight", "CurrencyRateID"]],
        on="SalesOrderID",
        how="left"
  ).rename(columns={
      "SpecialOfferID": "PromotionKey", 
      "OrderQty": "OrderQuantity", 
      "UnitPriceDiscount": "UnitPriceDiscountPct", 
      "TerritoryID": "SalesTerritoryKey", 
      "LineTotal": "SalesAmount"
  }).drop(columns=["SalesOrderID"])
  
  # Rest of your code remains the same...
  factInternetSales = factInternetSales.merge(
        customer[["CustomerID", "AccountNumber"]],
        on="CustomerID",
        how="left"
    ).drop(columns=["CustomerID"]).merge(
        dimCustomer[["CustomerAlternateKey", "CustomerKey"]],
        left_on="AccountNumber",
        right_on="CustomerAlternateKey",
        how="left"
    ).drop(columns=["CustomerAlternateKey", "AccountNumber"])
  
  factInternetSales = factInternetSales.merge(
        product[["ProductID", "StandardCost"]],
        left_on="ProductKey",
        right_on="ProductID",
        how="left"
    ).rename(columns={"StandardCost": "ProductStandardCost"}) \
     .drop(columns=["ProductID"])
  
  factInternetSales = factInternetSales.merge(
    currencyRate[["CurrencyRateID", "ToCurrencyCode"]],
    on="CurrencyRateID",
    how="left"
  ).drop(columns=["CurrencyRateID"]).merge(
    dimCurrency[["CurrencyAlternateKey", "CurrencyKey"]],
    left_on="ToCurrencyCode",
    right_on="CurrencyAlternateKey",
    how="left"
  ).drop(columns=["CurrencyAlternateKey", "ToCurrencyCode"])

  factInternetSales = factInternetSales.merge(
     stateProvince[["StateProvinceID", "TerritoryID"]],
     left_on="SalesTerritoryKey",
     right_on="TerritoryID",
     how="left"
  ).drop(columns=["TerritoryID"]).merge(
     salesTaxRate[["StateProvinceID", "TaxRate"]],
     on="StateProvinceID",
     how="left"
  ).drop(columns=["StateProvinceID"])

  def transforma_date(date):
    if pd.isna(date):
        return None
    return int(date.strftime("%Y%m%d"))
  
  factInternetSales["OrderDateKey"] = factInternetSales["OrderDate"].apply(transforma_date).astype("Int64")
  factInternetSales["DueDateKey"] = factInternetSales["DueDate"].apply(transforma_date).astype("Int64")
  factInternetSales["ShipDateKey"] = factInternetSales["ShipDate"].apply(transforma_date).astype("Int64")
  
  factInternetSales["ExtendedAmount"] = factInternetSales["UnitPrice"] * factInternetSales["OrderQuantity"]
  factInternetSales["DiscountAmount"] = factInternetSales["ExtendedAmount"] * factInternetSales["UnitPriceDiscountPct"]
  factInternetSales["TotalProductCost"] = factInternetSales["ProductStandardCost"] * factInternetSales["OrderQuantity"]
  factInternetSales["TaxAmt"] = (factInternetSales["ExtendedAmount"] - factInternetSales["DiscountAmount"]) * (factInternetSales["TaxRate"] / 100)

  factInternetSales = factInternetSales.drop(columns=["TaxRate"])
  

  column_order = ["ProductKey", "OrderDateKey", "DueDateKey", "ShipDateKey", "CustomerKey", "PromotionKey", "CurrencyKey",
    "SalesTerritoryKey", "SalesOrderNumber", "SalesOrderLineNumber", "RevisionNumber", "OrderQuantity", 
    "UnitPrice", "ExtendedAmount", "UnitPriceDiscountPct", "DiscountAmount", "ProductStandardCost", "TotalProductCost",
    "SalesAmount", "TaxAmt", "Freight", "CarrierTrackingNumber", "OrderDate", "DueDate", "ShipDate"]
  
  factInternetSales = factInternetSales[column_order]
  
  return factInternetSales

In [68]:
fact = transformFactInternetSales(
    production["Product"],
    sales["SalesOrderDetail"],
    sales["SalesOrderHeader"],
    sales["Customer"],
    dimCustomer,
    dimCurrency,
    sales["CurrencyRate"],
    person["StateProvince"],
    sales["SalesTaxRate"]
)

fact[fact["SalesOrderNumber"] == "SO52813"]
#dimCustomer[dimCustomer["CustomerKey"] == 11146]

Unnamed: 0,ProductKey,OrderDateKey,DueDateKey,ShipDateKey,CustomerKey,PromotionKey,CurrencyKey,SalesTerritoryKey,SalesOrderNumber,SalesOrderLineNumber,RevisionNumber,OrderQuantity,UnitPrice,ExtendedAmount,UnitPriceDiscountPct,DiscountAmount,ProductStandardCost,TotalProductCost,SalesAmount,TaxAmt,Freight,CarrierTrackingNumber,OrderDate,DueDate,ShipDate
679132,922,20130718,20130730,20130725,11146.0,1,41.0,10,SO52813,1,8,1,3.99,3.99,0.0,0.0,1.4923,1.4923,3.99,0.69825,2.7615,,2013-07-18,2013-07-30,2013-07-25
679133,931,20130718,20130730,20130725,11146.0,1,41.0,10,SO52813,2,8,1,21.49,21.49,0.0,0.0,8.0373,8.0373,21.49,3.76075,2.7615,,2013-07-18,2013-07-30,2013-07-25
679134,711,20130718,20130730,20130725,11146.0,1,41.0,10,SO52813,3,8,1,34.99,34.99,0.0,0.0,13.0863,13.0863,34.99,6.12325,2.7615,,2013-07-18,2013-07-30,2013-07-25
679135,713,20130718,20130730,20130725,11146.0,1,41.0,10,SO52813,4,8,1,49.99,49.99,0.0,0.0,38.4923,38.4923,49.99,8.74825,2.7615,,2013-07-18,2013-07-30,2013-07-25
