In [4]:
# Import required libraries
import pandas as pd
import numpy as np

In [5]:
# Load all tables from the Excel file
excel_file = '../datasets/star_schema_dataset.xlsx'

In [6]:
# Read fact and dimension tables
fact_shipment = pd.read_excel(excel_file, sheet_name='fact_shipment')
dim_customer = pd.read_excel(excel_file, sheet_name='dim_customer')
dim_delivery_address = pd.read_excel(excel_file, sheet_name='dim_delivery_address')
dim_pickup_address = pd.read_excel(excel_file, sheet_name='dim_pickup_address')
dim_date = pd.read_excel(excel_file, sheet_name='dim_date')
dim_service = pd.read_excel(excel_file, sheet_name='dim_service')
dim_carrier = pd.read_excel(excel_file, sheet_name='dim_carrier')
dim_country = pd.read_excel(excel_file, sheet_name='dim_country')

### Initial briefing

In [7]:
# Print dimensions of each table for verification
print("Initial table dimensions:")
print(f"Fact Shipment: {fact_shipment.shape}")
print(f"Customer: {dim_customer.shape}")
print(f"Delivery Address: {dim_delivery_address.shape}")
print(f"Pickup Address: {dim_pickup_address.shape}")
print(f"Date: {dim_date.shape}")
print(f"Service: {dim_service.shape}")
print(f"Carrier: {dim_carrier.shape}")
print(f"Country: {dim_country.shape}\n")

Initial table dimensions:
Fact Shipment: (656802, 21)
Customer: (7935, 9)
Delivery Address: (712272, 6)
Pickup Address: (712272, 6)
Date: (617, 5)
Service: (2119, 7)
Carrier: (237, 4)
Country: (200, 5)



In [8]:
fact_shipment.columns

Index(['shipment_id', 'customer_price', 'expected_carrier_price',
       'final_carrier_price', 'weight', 'shipment_type', 'insurance_type',
       'customer_id', 'pickup_address_id', 'delivery_address_id', 'service_id',
       'domain_name', 'booking_state', 'lms_plus', 'exworks_id', 'margin',
       'created_date_id', 'pickup_date_id', 'real_pickup_date_id',
       'delivery_date_id', 'real_delivery_date_id'],
      dtype='object')

### Some preprocessing

Since the name column will have to be renamed each time for clarity, we do it to the original dimension of country.

In [9]:
dim_country = dim_country.rename(columns={'name': 'name_country'})

In [10]:
dim_country.columns

Index(['country_id', 'name_country', 'iso_country_code', 'continent', 'EU'], dtype='object')

### Shipment and customer merging

In [11]:
print(f"Shipment columns: {fact_shipment.columns}")
print(f"customer columns: {dim_customer.columns}")


Shipment columns: Index(['shipment_id', 'customer_price', 'expected_carrier_price',
       'final_carrier_price', 'weight', 'shipment_type', 'insurance_type',
       'customer_id', 'pickup_address_id', 'delivery_address_id', 'service_id',
       'domain_name', 'booking_state', 'lms_plus', 'exworks_id', 'margin',
       'created_date_id', 'pickup_date_id', 'real_pickup_date_id',
       'delivery_date_id', 'real_delivery_date_id'],
      dtype='object')
customer columns: Index(['customer_id', 'created_date', 'domain_name', 'main_industry_name',
       'industry_sector_name', 'segmentation', 'sequence_number',
       'structure_number', 'is_master'],
      dtype='object')


In [12]:
print("Merging customer dimension...")

# Create a dictionary to rename all customer columns except customer_id
customer_rename = {
    col: f"{col}_customer" 
    for col in dim_customer.columns 
    if col != 'customer_id'
}

# Rename customer columns
dim_customer_renamed = dim_customer.copy()
dim_customer_renamed = dim_customer_renamed.rename(columns=customer_rename)

# Merge with fact_shipment
df = fact_shipment.merge(
    dim_customer_renamed,
    on='customer_id',
    how='left'
)

print(f"Shape after customer merge: {df.shape}\n")

Merging customer dimension...
Shape after customer merge: (656802, 29)



In [13]:
df.columns

Index(['shipment_id', 'customer_price', 'expected_carrier_price',
       'final_carrier_price', 'weight', 'shipment_type', 'insurance_type',
       'customer_id', 'pickup_address_id', 'delivery_address_id', 'service_id',
       'domain_name', 'booking_state', 'lms_plus', 'exworks_id', 'margin',
       'created_date_id', 'pickup_date_id', 'real_pickup_date_id',
       'delivery_date_id', 'real_delivery_date_id', 'created_date_customer',
       'domain_name_customer', 'main_industry_name_customer',
       'industry_sector_name_customer', 'segmentation_customer',
       'sequence_number_customer', 'structure_number_customer',
       'is_master_customer'],
      dtype='object')

### Delivery, country and shipment mergin

In [14]:
print(f"Country columns: {dim_country.columns}")
print(f"Delivery columns: {dim_delivery_address.columns}")


Country columns: Index(['country_id', 'name_country', 'iso_country_code', 'continent', 'EU'], dtype='object')
Delivery columns: Index(['delivery_address_id', 'created_date', 'domain_name', 'country_id',
       'postal_code', 'city'],
      dtype='object')


In [15]:
print("Merging delivery address and its country...")

# Then merge delivery address with renamed country
delivery_with_country = dim_delivery_address.merge(
    dim_country,
    on='country_id',
    how='left'
)

delivery_with_country = delivery_with_country.drop('country_id', axis=1)

Merging delivery address and its country...


In [16]:
delivery_with_country.columns

Index(['delivery_address_id', 'created_date', 'domain_name', 'postal_code',
       'city', 'name_country', 'iso_country_code', 'continent', 'EU'],
      dtype='object')

In [17]:
# Create dictionary to rename all columns except delivery_address_id
delivery_rename = {
    col: f"{col}_delivery" 
    for col in delivery_with_country.columns 
    if col != 'delivery_address_id'
}

# Rename the columns
delivery_with_country = delivery_with_country.rename(columns=delivery_rename)
print(delivery_with_country.columns)


Index(['delivery_address_id', 'created_date_delivery', 'domain_name_delivery',
       'postal_code_delivery', 'city_delivery', 'name_country_delivery',
       'iso_country_code_delivery', 'continent_delivery', 'EU_delivery'],
      dtype='object')


In [18]:
# Then merge the result with the main dataframe
df = df.merge(
    delivery_with_country,
    on='delivery_address_id',
    how='left'
)

df = df.drop('delivery_address_id', axis=1)
print(f"Shape after delivery address merges: {df.shape}\n")

Shape after delivery address merges: (656802, 36)



### Pickup, country and df merge

In [19]:
print(f"Country columns: {dim_country.columns}")
print(f"Pickup columns: {dim_pickup_address.columns}")

Country columns: Index(['country_id', 'name_country', 'iso_country_code', 'continent', 'EU'], dtype='object')
Pickup columns: Index(['pickup_address_id', 'created_date', 'domain_name', 'country_id',
       'postal_code', 'city'],
      dtype='object')


In [20]:
print("Merging pickup address and its country...")

# Then merge pickup address with renamed country
pickup_with_country = dim_pickup_address.merge(
    dim_country,
    on='country_id',
    how='left'
)

pickup_with_country = pickup_with_country.drop('country_id', axis=1)

Merging pickup address and its country...


In [21]:
pickup_with_country.columns

Index(['pickup_address_id', 'created_date', 'domain_name', 'postal_code',
       'city', 'name_country', 'iso_country_code', 'continent', 'EU'],
      dtype='object')

In [22]:
# Create dictionary to rename all columns except delivery_address_id
pickup_rename = {
    col: f"{col}_pickup" 
    for col in pickup_with_country.columns 
    if col != 'pickup_address_id'
}

# Rename the columns
pickup_with_country = pickup_with_country.rename(columns=pickup_rename)


In [23]:
print(pickup_with_country.columns)

Index(['pickup_address_id', 'created_date_pickup', 'domain_name_pickup',
       'postal_code_pickup', 'city_pickup', 'name_country_pickup',
       'iso_country_code_pickup', 'continent_pickup', 'EU_pickup'],
      dtype='object')


In [24]:
# Then merge the result with the main dataframe
df = df.merge(
    pickup_with_country,
    on='pickup_address_id',
    how='left'
)

df = df.drop('pickup_address_id', axis=1)
print(f"Shape after delivery address merges: {df.shape}\n")

Shape after delivery address merges: (656802, 43)



In [25]:
df.columns

Index(['shipment_id', 'customer_price', 'expected_carrier_price',
       'final_carrier_price', 'weight', 'shipment_type', 'insurance_type',
       'customer_id', 'service_id', 'domain_name', 'booking_state', 'lms_plus',
       'exworks_id', 'margin', 'created_date_id', 'pickup_date_id',
       'real_pickup_date_id', 'delivery_date_id', 'real_delivery_date_id',
       'created_date_customer', 'domain_name_customer',
       'main_industry_name_customer', 'industry_sector_name_customer',
       'segmentation_customer', 'sequence_number_customer',
       'structure_number_customer', 'is_master_customer',
       'created_date_delivery', 'domain_name_delivery', 'postal_code_delivery',
       'city_delivery', 'name_country_delivery', 'iso_country_code_delivery',
       'continent_delivery', 'EU_delivery', 'created_date_pickup',
       'domain_name_pickup', 'postal_code_pickup', 'city_pickup',
       'name_country_pickup', 'iso_country_code_pickup', 'continent_pickup',
       'EU_pickup'],
  

### Service and carrier dimensions merging

In [26]:
print(f"Service columns: {dim_service.columns}")
print(f"carrier columns: {dim_carrier.columns}")

Service columns: Index(['service_id', 'created_date', 'name', 'service_type', 'transport_type',
       'carrier_id', 'domain_name'],
      dtype='object')
carrier columns: Index(['carrier_id', 'name', 'created_date', 'domain_name'], dtype='object')


In [27]:
carrier_rename = {
   col: f"{col}_carrier" 
   for col in dim_carrier.columns 
   if col != 'carrier_id'
}

In [28]:
print(carrier_rename)

{'name': 'name_carrier', 'created_date': 'created_date_carrier', 'domain_name': 'domain_name_carrier'}


In [29]:
dim_carrier = dim_carrier.rename(columns=carrier_rename)

In [30]:
service_rename = {
   col: f"{col}_service" 
   for col in dim_service.columns 
   if col not in ['service_id', 'service_type', 'transport_type', 'carrier_id']
}

In [31]:
print(service_rename)

{'created_date': 'created_date_service', 'name': 'name_service', 'domain_name': 'domain_name_service'}


In [32]:
dim_service = dim_service.rename(columns=service_rename)

In [33]:
service_carrier = dim_service.merge(
   dim_carrier,
   on='carrier_id',
   how='left'
)

In [34]:
print(service_carrier.columns)

Index(['service_id', 'created_date_service', 'name_service', 'service_type',
       'transport_type', 'carrier_id', 'domain_name_service', 'name_carrier',
       'created_date_carrier', 'domain_name_carrier'],
      dtype='object')


In [35]:
service_carrier = service_carrier.drop('carrier_id', axis=1)

In [36]:
df = df.merge(
   service_carrier,
   on='service_id',
   how='left'
)

df = df.drop('service_id', axis=1)
print(f"Shape after service and carrier merges: {df.shape}\n")

Shape after service and carrier merges: (656802, 50)



### Date dimensions merging

In [37]:
print(dim_date.columns)

Index(['full_date', 'year', 'month', 'quarter', 'date_id'], dtype='object')


In [38]:
# Merge date dimensions
print("Merging date dimensions...")
for date_type in ['created_date', 'pickup_date', 'delivery_date', 'real_pickup_date', 'real_delivery_date']:
    date_id_col = f'{date_type}_id'
    if date_id_col in df.columns:
        df = df.merge(
            dim_date,
            left_on=date_id_col,
            right_on='date_id',
            how='left',
            suffixes=('', f'_{date_type}')
        )
        # Rename date columns to avoid confusion
        df = df.rename(columns={
            'year': f'year_{date_type}',
            'month': f'month_{date_type}',
            'quarter': f'quarter_{date_type}',
            'full_date': f'full_date_{date_type}'
        })
print(f"Shape after date merges: {df.shape}\n")

Merging date dimensions...
Shape after date merges: (656802, 75)



In [39]:
print(df.columns)

Index(['shipment_id', 'customer_price', 'expected_carrier_price',
       'final_carrier_price', 'weight', 'shipment_type', 'insurance_type',
       'customer_id', 'domain_name', 'booking_state', 'lms_plus', 'exworks_id',
       'margin', 'created_date_id', 'pickup_date_id', 'real_pickup_date_id',
       'delivery_date_id', 'real_delivery_date_id', 'created_date_customer',
       'domain_name_customer', 'main_industry_name_customer',
       'industry_sector_name_customer', 'segmentation_customer',
       'sequence_number_customer', 'structure_number_customer',
       'is_master_customer', 'created_date_delivery', 'domain_name_delivery',
       'postal_code_delivery', 'city_delivery', 'name_country_delivery',
       'iso_country_code_delivery', 'continent_delivery', 'EU_delivery',
       'created_date_pickup', 'domain_name_pickup', 'postal_code_pickup',
       'city_pickup', 'name_country_pickup', 'iso_country_code_pickup',
       'continent_pickup', 'EU_pickup', 'created_date_service', 

In [40]:
# Remove only true ID columns that aren't needed for analysis
columns_to_drop = [
    'date_id',
    'created_date_id', 'pickup_date_id', 'delivery_date_id',
    'real_pickup_date_id', 'real_delivery_date_id',
    'date_id_pickup_date', 'date_id_delivery_date', 
    'date_id_real_pickup_date', 'date_id_real_delivery_date'
]

# Drop ID columns
df = df.drop(columns=columns_to_drop, errors='ignore')

In [41]:
print(df.columns)

Index(['shipment_id', 'customer_price', 'expected_carrier_price',
       'final_carrier_price', 'weight', 'shipment_type', 'insurance_type',
       'customer_id', 'domain_name', 'booking_state', 'lms_plus', 'exworks_id',
       'margin', 'created_date_customer', 'domain_name_customer',
       'main_industry_name_customer', 'industry_sector_name_customer',
       'segmentation_customer', 'sequence_number_customer',
       'structure_number_customer', 'is_master_customer',
       'created_date_delivery', 'domain_name_delivery', 'postal_code_delivery',
       'city_delivery', 'name_country_delivery', 'iso_country_code_delivery',
       'continent_delivery', 'EU_delivery', 'created_date_pickup',
       'domain_name_pickup', 'postal_code_pickup', 'city_pickup',
       'name_country_pickup', 'iso_country_code_pickup', 'continent_pickup',
       'EU_pickup', 'created_date_service', 'name_service', 'service_type',
       'transport_type', 'domain_name_service', 'name_carrier',
       'created_

### Saving the dataset

In [42]:
# Save the flattened dataset with proper formatting
output_file = '../datasets/flattened_dataset.csv'

# Save to CSV
df.to_csv(output_file, index=False)