In [None]:
import pandas as pd

In [None]:
import pip  # kept so that any other cell still referencing `pip` keeps working
import subprocess
import sys

# Install the Excel engines this notebook relies on: openpyxl for reading the
# source workbook and xlsxwriter for the export at the end.
# pip.main() is not a supported API (it was removed from pip's public interface),
# so pip is run as a module of the interpreter that backs this kernel instead.
subprocess.check_call([sys.executable, "-m", "pip", "install", "openpyxl", "xlsxwriter"])

In [None]:
# Load original dataset
# The path is relative to the notebook's working directory.
file_path = "../datasets/2024-08-01_LMS_data_2023.xlsx"
# Open the workbook once; the cells below read individual sheets from this handle.
xls = pd.ExcelFile(file_path)

### Load the different sheets into their respective data frames

In [None]:
# Shipment sheet: the source rows for the fact table
shipment_df = pd.read_excel(xls, sheet_name='shipment')

In [None]:
# Reference sheets: carriers, domains and countries
carrier_df = pd.read_excel(xls, sheet_name='carrier')
domain_df = pd.read_excel(xls, sheet_name='domain')
country_df = pd.read_excel(xls, sheet_name='country')

In [None]:
# Service sheet
service_df = pd.read_excel(xls, sheet_name='service')

In [None]:
# Customer sheet
customer_df = pd.read_excel(xls, sheet_name='customer')

In [None]:
# Pickup and delivery address sheets
pickupaddress_df = pd.read_excel(xls, sheet_name='pickupaddress')
deliveryaddress_df = pd.read_excel(xls, sheet_name='deliveryaddress')

### Preprocessing of shipment

In [None]:
# Convert 'shipment_id' to numeric; non-numeric values become NaN
shipment_df['shipment_id'] = pd.to_numeric(shipment_df['shipment_id'], errors='coerce')

# Drop rows whose 'shipment_id' could not be parsed.
# .copy() makes the cleaned frame independent of shipment_df, so the column
# assignments below (and in later cells) do not write into a slice of the
# original frame and do not trigger pandas' SettingWithCopyWarning.
shipment_df_clean = shipment_df.dropna(subset=['shipment_id']).copy()

# Every remaining id is a valid number, so it can be stored as an integer again
shipment_df_clean['shipment_id'] = shipment_df_clean['shipment_id'].astype(int)

In [None]:
def fix_and_check_date(date_str):
    """Parse a single value into a pandas timestamp, or return ``pd.NaT``.

    Parameters
    ----------
    date_str : object
        The raw cell value to parse (whatever ``pd.to_datetime`` accepts).

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed timestamp, or ``pd.NaT`` when the value cannot be parsed
        or its year lies beyond 2262.
    """
    try:
        date = pd.to_datetime(date_str, errors='raise')  # raises on unparseable input
        if date.year > 2262:  # years past 2262 are treated as invalid
            return pd.NaT
        return date
    except Exception:
        # Best effort: any parsing problem marks the value as invalid.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running apply() can still be interrupted.
        return pd.NaT

# Timestamp columns that are validated and reduced to their date part
date_cols_with_time = ['created_date', 'real_delivery_date', 'real_pickup_date']

# Validate each value with fix_and_check_date, then keep only the date part.
# pd.to_datetime() guarantees a datetime dtype before `.dt` is used: `.apply`
# returns an object-dtype Series when there are no rows, and `.dt` would fail on it.
# assign() builds a new frame instead of writing into a possibly sliced one.
shipment_df_clean = shipment_df_clean.assign(**{
    col: pd.to_datetime(shipment_df_clean[col].apply(fix_and_check_date)).dt.date
    for col in date_cols_with_time
})

# Drop rows where any of these columns is NaT (invalid or out-of-bounds date).
# Reassigning instead of inplace=True avoids mutating a derived frame in place.
shipment_df_clean = shipment_df_clean.dropna(subset=date_cols_with_time)


### Process the shipment data frame

In [None]:
# Attach the domain name to every shipment (left join: shipments without a
# matching domain are kept) and normalise two column names to snake_case
fact_shipment = (
    shipment_df_clean
    .merge(domain_df[['domain_id', 'name']], how='left', on='domain_id')
    .rename(columns={'name': 'domain_name', 'bookingstate': 'booking_state'})
)

In [None]:
# Keep only the columns needed in the fact table (the margin is derived in a later cell).
# .copy() detaches the selection from the merged frame: the following cells assign
# columns on fact_shipment, which would otherwise operate on a slice and raise
# pandas' SettingWithCopyWarning.
fact_shipment = fact_shipment[['shipment_id', 'customer_price', 'expected_carrier_price',
                               'final_carrier_price', 'weight', 'shipment_type',
                               'insurance_type', 'customer_id', 'pickupaddress_id',
                               'deliveryaddress_id', 'service_id', 'domain_name',
                               'pickup_date', 'delivery_date', 'real_pickup_date',
                               'real_delivery_date', 'booking_state', 'lms_plus',
                               'exworks_id', 'created_date']].copy()

In [None]:
# Coerce the two price columns to numbers; values that cannot be parsed become NaN
fact_shipment = fact_shipment.assign(
    customer_price=lambda df: pd.to_numeric(df['customer_price'], errors='coerce'),
    final_carrier_price=lambda df: pd.to_numeric(df['final_carrier_price'], errors='coerce'),
)

# margin = customer price minus final carrier price (NaN whenever either side is missing)
fact_shipment = fact_shipment.assign(
    margin=lambda df: df['customer_price'] - df['final_carrier_price']
)

In [None]:
# Drop the references to the raw and cleaned shipment frames; fact_shipment is used from here on
shipment_df = shipment_df_clean = None

### Process pickup and delivery address data frames

In [None]:
def _build_address_dim(address_df, domains, source_id_col, dim_id_col):
    """Build an address dimension table from a raw address sheet.

    Parameters
    ----------
    address_df : pandas.DataFrame
        Raw address frame; must contain ``source_id_col``, ``domain_id``,
        ``created_date``, ``country_id``, ``postal_code`` and ``city``.
    domains : pandas.DataFrame
        Domain frame with at least ``domain_id`` and ``name``.
    source_id_col : str
        Name of the id column in ``address_df``.
    dim_id_col : str
        Name the id column gets in the returned dimension.

    Returns
    -------
    pandas.DataFrame
        Columns: ``dim_id_col``, ``created_date`` (date only), ``domain_name``,
        ``country_id``, ``postal_code``, ``city``.
    """
    dim = (
        address_df
        .merge(domains[['domain_id', 'name']], on='domain_id', how='left')
        .rename(columns={'name': 'domain_name'})
    )
    dim = dim[[source_id_col, 'created_date', 'domain_name', 'country_id', 'postal_code', 'city']]
    dim = dim.rename(columns={source_id_col: dim_id_col})
    # Keep only the date part of created_date; unparseable values become NaT
    dim['created_date'] = pd.to_datetime(dim['created_date'], errors='coerce').dt.date
    return dim


# Step 1: Create the Pickup Address Dimension
dim_pickup_address = _build_address_dim(
    pickupaddress_df, domain_df, 'pickupaddress_id', 'pickup_address_id'
)

# Step 2: Create the Delivery Address Dimension
dim_delivery_address = _build_address_dim(
    deliveryaddress_df, domain_df, 'deliveryaddress_id', 'delivery_address_id'
)

# Rename the address keys in the shipment fact table so they match the dimensions
fact_shipment = fact_shipment.rename(columns={'pickupaddress_id': 'pickup_address_id',
                                              'deliveryaddress_id': 'delivery_address_id'})

### Process customer data frame

In [None]:
# Branch code sheet and the sheet that links branch codes to customers
branchcode_df = pd.read_excel(xls, sheet_name='branchcode')
branchcode_customer_translation_df = pd.read_excel(xls, sheet_name='branchcode_customer_translation')

In [None]:
# Step 1: Join every customer to its branch codes and the matching branch details
customer_industries = customer_df.merge(
    branchcode_customer_translation_df, how='left', on='customer_id'
).merge(
    branchcode_df[['branchcode_id', 'branch_name', 'root_branch_id']],
    how='left',
    on='branchcode_id',
)

# Step 2: One row per customer, with the list of branch names and the first root branch id
customer_industry_info = (
    customer_industries
    .groupby('customer_id')
    .agg({
        # keep only real names; missing values are not strings and are dropped
        'branch_name': lambda names: [name for name in names if isinstance(name, str)],
        'root_branch_id': 'first',
    })
    .reset_index()
)

# The first branch name is used as the main industry; all names are joined with '|'
customer_industry_info['main_industry'] = customer_industry_info['branch_name'].apply(
    lambda names: names[0] if names else None
)
customer_industry_info['all_industries'] = customer_industry_info['branch_name'].apply(
    lambda names: '|'.join(names) if names else None
)

# Step 3: Create customer dimension with industry information and the domain name
dim_customer = (
    customer_df
    .merge(customer_industry_info[['customer_id', 'main_industry', 'all_industries', 'root_branch_id']],
           on='customer_id', how='left')
    .merge(domain_df[['domain_id', 'name']], on='domain_id', how='left')
    .rename(columns={
        'name': 'domain_name',
        'sequencenumber': 'sequence_number',
        'structurenumber': 'structure_number'
    })
)

# Step 4: Add master status and format
# A customer is flagged as master when its sequence number equals its structure number
dim_customer['is_master'] = dim_customer['sequence_number'] == dim_customer['structure_number']

# .copy() makes the column selection an independent frame, so the created_date
# assignment below does not write into a slice (avoids SettingWithCopyWarning).
dim_customer = dim_customer[[
    'customer_id', 'created_date', 'domain_name',
    'main_industry', 'all_industries', 'root_branch_id',
    'segmentation', 'sequence_number', 'structure_number',
    'is_master'
]].copy()

# Keep only the date part of created_date; unparseable values become NaT
dim_customer['created_date'] = pd.to_datetime(dim_customer['created_date'], errors='coerce').dt.date

### Process dates data frames

In [None]:
# Step 1: Date columns of the shipment table that are replaced by date-dimension keys
date_columns = ['created_date', 'pickup_date', 'real_pickup_date', 'delivery_date', 'real_delivery_date']

# Step 2: Parse every date column once and truncate it to midnight.
# All five columns are normalised (not only the three known to carry a time):
# if any value kept its time component, the de-duplication below would treat
# several timestamps of the same day as different dates, the dimension would
# contain the same full_date under several ids, and the key lookup would no
# longer be one-to-one.
normalized_dates = {
    col: pd.to_datetime(fact_shipment[col], errors='coerce').dt.normalize()
    for col in date_columns
}

# Step 3: Collect the distinct days across all columns in order of first appearance
# (column by column), so date ids are assigned in a stable order.
unique_days = (
    pd.concat([normalized_dates[col] for col in date_columns], ignore_index=True)
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Step 4: Build the date dimension: plain date plus year, month, quarter and a 1-based id
date_dim = pd.DataFrame({
    'full_date': unique_days.dt.date,
    'year': unique_days.dt.year,
    'month': unique_days.dt.month,
    'quarter': unique_days.dt.quarter,
})
date_dim['date_id'] = date_dim.index + 1  # Create incremental date IDs

# Step 5: Add a `<column>_id` key for every date column.
# A lookup Series (day -> date_id) with a unique index keeps the row count of
# fact_shipment unchanged; rows whose date is missing or invalid get NaN.
date_id_by_day = pd.Series(date_dim['date_id'].to_numpy(), index=unique_days.to_numpy())
for col in date_columns:
    fact_shipment[f'{col}_id'] = normalized_dates[col].map(date_id_by_day)

# Step 6: Drop the original date columns as we now have the date IDs in place
fact_shipment = fact_shipment.drop(columns=date_columns)

### Process service dataframe

In [None]:
# Left-join the domain name onto the services; columns present in both frames
# get the '_service' / '_domain' suffix so they stay distinguishable
dim_service = pd.merge(
    service_df,
    domain_df[['domain_id', 'name']],
    how='left',
    on='domain_id',
    suffixes=('_service', '_domain'),
)

In [None]:
# The suffixed domain column 'name_domain' becomes 'domain_name'
dim_service = dim_service.rename({'name_domain': 'domain_name'}, axis='columns')

In [None]:
# The suffixed service column 'name_service' goes back to plain 'name'
dim_service = dim_service.rename({'name_service': 'name'}, axis='columns')

In [None]:
# snake_case names for the two type columns
dim_service = dim_service.rename(
    {'servicetype': 'service_type', 'transporttype': 'transport_type'},
    axis='columns',
)

In [None]:
# Keep relevant columns.
# .copy() makes the selection an independent frame: the next cell assigns
# created_date on it, which would otherwise write into a slice and raise
# pandas' SettingWithCopyWarning.
dim_service = dim_service[['service_id', 'created_date', 'name', 'service_type',
                           'transport_type', 'carrier_id', 'domain_name']].copy()

In [None]:
# Reduce created_date to its date part; values that cannot be parsed become NaT
dim_service = dim_service.assign(
    created_date=lambda df: pd.to_datetime(df['created_date'], errors='coerce').dt.date
)

### Process carrier and country dimensions

In [None]:
# Left-join the domain name onto the carriers; columns present in both frames
# get the '_carrier' / '_domain' suffix so they stay distinguishable
dim_carrier = pd.merge(
    carrier_df,
    domain_df[['domain_id', 'name']],
    how='left',
    on='domain_id',
    suffixes=('_carrier', '_domain'),
)

In [None]:
# Undo the merge suffixes: the carrier's own name stays 'name', the domain's becomes 'domain_name'
dim_carrier = dim_carrier.rename(
    {'name_carrier': 'name', 'name_domain': 'domain_name'},
    axis='columns',
)

In [None]:
# Create the carrier dimension table.
# .copy() makes the selection an independent frame: the next cell assigns
# created_date on it, which would otherwise write into a slice and raise
# pandas' SettingWithCopyWarning.
dim_carrier = dim_carrier[['carrier_id', 'name', 'created_date', 'domain_name']].copy()

In [None]:
# Reduce created_date to its date part; values that cannot be parsed become NaT
dim_carrier = dim_carrier.assign(
    created_date=lambda df: pd.to_datetime(df['created_date'], errors='coerce').dt.date
)

In [None]:
# Country dimension: keep only the columns that are exported
dim_country = country_df.loc[:, ['country_id', 'name', 'isocountrycode', 'continent', 'EU']]

In [None]:
# snake_case name for the ISO code column
dim_country = dim_country.rename({'isocountrycode': 'iso_country_code'}, axis='columns')

### Save new star schema dataset

In [None]:
# Export to a new Excel file representing the star schema with domain names
output_path = '../datasets/star_schema_dataset.xlsx'

# Sheet name -> frame, in the order the sheets are written (fact table first)
star_schema_sheets = {
    'fact_shipment': fact_shipment,
    'dim_customer': dim_customer,
    'dim_delivery_address': dim_delivery_address,
    'dim_pickup_address': dim_pickup_address,
    'dim_date': date_dim,
    'dim_service': dim_service,
    'dim_carrier': dim_carrier,
    'dim_country': dim_country,
}

# The context manager saves and closes the workbook even if writing a sheet fails
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    for sheet_name, frame in star_schema_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Report the path that was actually written (the old message named a different file)
print(f"Star schema transformation with domain names included completed and saved to '{output_path}'")