In [1]:
# Import required libraries
import pandas as pd
import numpy as np

In [2]:
# Load all tables from the Excel file
excel_file = '../datasets/star_schema_dataset.xlsx'

In [3]:
# Read fact and dimension tables
fact_shipment = pd.read_excel(excel_file, sheet_name='fact_shipment')
dim_customer = pd.read_excel(excel_file, sheet_name='dim_customer')
dim_delivery_address = pd.read_excel(excel_file, sheet_name='dim_delivery_address')
dim_pickup_address = pd.read_excel(excel_file, sheet_name='dim_pickup_address')
dim_date = pd.read_excel(excel_file, sheet_name='dim_date')
dim_service = pd.read_excel(excel_file, sheet_name='dim_service')
dim_carrier = pd.read_excel(excel_file, sheet_name='dim_carrier')
dim_country = pd.read_excel(excel_file, sheet_name='dim_country')

In [4]:
# Print dimensions of each table for verification
print("Initial table dimensions:")
print(f"Fact Shipment: {fact_shipment.shape}")
print(f"Customer: {dim_customer.shape}")
print(f"Delivery Address: {dim_delivery_address.shape}")
print(f"Pickup Address: {dim_pickup_address.shape}")
print(f"Date: {dim_date.shape}")
print(f"Service: {dim_service.shape}")
print(f"Carrier: {dim_carrier.shape}")
print(f"Country: {dim_country.shape}\n")

Initial table dimensions:
Fact Shipment: (656802, 21)
Customer: (7935, 10)
Delivery Address: (712272, 6)
Pickup Address: (712272, 6)
Date: (617, 5)
Service: (2119, 7)
Carrier: (237, 4)
Country: (200, 5)



In [5]:
# Start with customer dimension
print("Merging customer dimension...")
df = fact_shipment.merge(
    dim_customer,
    on='customer_id',
    how='left',
    suffixes=('', '_customer')
)
print(f"Shape after customer merge: {df.shape}\n")

Merging customer dimension...
Shape after customer merge: (656802, 30)



In [6]:
# Merge delivery address and its country
print("Merging delivery address and its country...")
df = df.merge(
    dim_delivery_address,
    on='delivery_address_id',
    how='left',
    suffixes=('', '_delivery')
).merge(
    dim_country,
    left_on='country_id',
    right_on='country_id',
    how='left',
    suffixes=('_delivery', '_delivery_country')
)
print(f"Shape after delivery address merges: {df.shape}\n")

Merging delivery address and its country...
Shape after delivery address merges: (656802, 39)



In [7]:
# Merge pickup address and its country
print("Merging pickup address and its country...")
df = df.merge(
    dim_pickup_address,
    on='pickup_address_id',
    how='left',
    suffixes=('', '_pickup')
).merge(
    dim_country,
    left_on='country_id',
    right_on='country_id',
    how='left',
    suffixes=('_pickup', '_pickup_country')
)
print(f"Shape after pickup address merges: {df.shape}\n")

Merging pickup address and its country...
Shape after pickup address merges: (656802, 48)



In [8]:
# Merge service and carrier dimensions
print("Merging service and carrier dimensions...")
df = df.merge(
    dim_service,
    on='service_id',
    how='left',
    suffixes=('', '_service')
).merge(
    dim_carrier,
    on='carrier_id',
    how='left',
    suffixes=('', '_carrier')
)
print(f"Shape after service and carrier merges: {df.shape}\n")

Merging service and carrier dimensions...
Shape after service and carrier merges: (656802, 57)



In [9]:
# Merge date dimensions
print("Merging date dimensions...")
for date_type in ['created_date', 'pickup_date', 'delivery_date', 'real_pickup_date', 'real_delivery_date']:
    date_id_col = f'{date_type}_id'
    if date_id_col in df.columns:
        df = df.merge(
            dim_date,
            left_on=date_id_col,
            right_on='date_id',
            how='left',
            suffixes=('', f'_{date_type}')
        )
        # Rename date columns to avoid confusion
        df = df.rename(columns={
            'year': f'year_{date_type}',
            'month': f'month_{date_type}',
            'quarter': f'quarter_{date_type}',
            'full_date': f'full_date_{date_type}'
        })
print(f"Shape after date merges: {df.shape}\n")

Merging date dimensions...
Shape after date merges: (656802, 82)



In [10]:
print(df.columns)

Index(['shipment_id', 'customer_price', 'expected_carrier_price',
       'final_carrier_price', 'weight', 'shipment_type', 'insurance_type',
       'customer_id', 'pickup_address_id', 'delivery_address_id', 'service_id',
       'domain_name', 'booking_state', 'lms_plus', 'exworks_id', 'margin',
       'created_date_id', 'pickup_date_id', 'real_pickup_date_id',
       'delivery_date_id', 'real_delivery_date_id', 'created_date',
       'domain_name_customer', 'main_industry', 'all_industries',
       'root_branch_id', 'segmentation', 'sequence_number', 'structure_number',
       'is_master', 'created_date_delivery', 'domain_name_delivery',
       'country_id', 'postal_code', 'city', 'name_pickup',
       'iso_country_code_pickup', 'continent_pickup', 'EU_pickup',
       'created_date_pickup', 'domain_name_pickup', 'country_id_pickup',
       'postal_code_pickup', 'city_pickup', 'name_pickup_country',
       'iso_country_code_pickup_country', 'continent_pickup_country',
       'EU_pickup_

In [11]:
# Remove only true ID columns that aren't needed for analysis
columns_to_drop = [
    'pickup_address_id', 'delivery_address_id', 
    'service_id', 'carrier_id', 'country_id', 'country_id_pickup', 'date_id',
    'exworks_id', 'created_date_id', 'pickup_date_id', 'delivery_date_id',
    'real_pickup_date_id', 'real_delivery_date_id',
    'date_id_pickup_date', 'date_id_delivery_date', 
    'date_id_real_pickup_date', 'date_id_real_delivery_date'
]

# Drop ID columns
df = df.drop(columns=columns_to_drop, errors='ignore')

In [12]:
print(df.columns)

Index(['shipment_id', 'customer_price', 'expected_carrier_price',
       'final_carrier_price', 'weight', 'shipment_type', 'insurance_type',
       'customer_id', 'domain_name', 'booking_state', 'lms_plus', 'margin',
       'created_date', 'domain_name_customer', 'main_industry',
       'all_industries', 'root_branch_id', 'segmentation', 'sequence_number',
       'structure_number', 'is_master', 'created_date_delivery',
       'domain_name_delivery', 'postal_code', 'city', 'name_pickup',
       'iso_country_code_pickup', 'continent_pickup', 'EU_pickup',
       'created_date_pickup', 'domain_name_pickup', 'postal_code_pickup',
       'city_pickup', 'name_pickup_country', 'iso_country_code_pickup_country',
       'continent_pickup_country', 'EU_pickup_country', 'created_date_service',
       'name', 'service_type', 'transport_type', 'domain_name_service',
       'name_carrier', 'created_date_carrier', 'domain_name_carrier',
       'full_date_created_date', 'year_created_date', 'month_cre

In [13]:
# Rename columns for clarity while preserving distinct information
column_renames = {
    'postal_code': 'delivery_postal_code',
    'postal_code_pickup': 'pickup_postal_code',
    'city': 'delivery_city',
    'city_pickup': 'pickup_city',
    'name': 'service_name',
    'name_carrier': 'carrier_name'
}

df = df.rename(columns=column_renames)

In [14]:
print(df.columns)

Index(['shipment_id', 'customer_price', 'expected_carrier_price',
       'final_carrier_price', 'weight', 'shipment_type', 'insurance_type',
       'customer_id', 'domain_name', 'booking_state', 'lms_plus', 'margin',
       'created_date', 'domain_name_customer', 'main_industry',
       'all_industries', 'root_branch_id', 'segmentation', 'sequence_number',
       'structure_number', 'is_master', 'created_date_delivery',
       'domain_name_delivery', 'delivery_postal_code', 'delivery_city',
       'name_pickup', 'iso_country_code_pickup', 'continent_pickup',
       'EU_pickup', 'created_date_pickup', 'domain_name_pickup',
       'pickup_postal_code', 'pickup_city', 'name_pickup_country',
       'iso_country_code_pickup_country', 'continent_pickup_country',
       'EU_pickup_country', 'created_date_service', 'service_name',
       'service_type', 'transport_type', 'domain_name_service', 'carrier_name',
       'created_date_carrier', 'domain_name_carrier', 'full_date_created_date',
     

In [15]:
# Remove any remaining duplicate columns (if any exist)
df = df.loc[:, ~df.columns.duplicated()]

In [16]:
print("Final columns after cleanup:")
print(df.columns)

Final columns after cleanup:
Index(['shipment_id', 'customer_price', 'expected_carrier_price',
       'final_carrier_price', 'weight', 'shipment_type', 'insurance_type',
       'customer_id', 'domain_name', 'booking_state', 'lms_plus', 'margin',
       'created_date', 'domain_name_customer', 'main_industry',
       'all_industries', 'root_branch_id', 'segmentation', 'sequence_number',
       'structure_number', 'is_master', 'created_date_delivery',
       'domain_name_delivery', 'delivery_postal_code', 'delivery_city',
       'name_pickup', 'iso_country_code_pickup', 'continent_pickup',
       'EU_pickup', 'created_date_pickup', 'domain_name_pickup',
       'pickup_postal_code', 'pickup_city', 'name_pickup_country',
       'iso_country_code_pickup_country', 'continent_pickup_country',
       'EU_pickup_country', 'created_date_service', 'service_name',
       'service_type', 'transport_type', 'domain_name_service', 'carrier_name',
       'created_date_carrier', 'domain_name_carrier', 'f

In [17]:
# Save the flattened dataset with proper formatting
output_file = '../datasets/flattened_dataset.csv'

# Save to CSV
df.to_csv(output_file, index=False)