In [None]:
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
excel_file = '../datasets/star_schema_dataset.xlsx'

# Read the fact and dimension tables into separate pandas DataFrames.
# The workbook is opened once and every sheet is parsed from that single
# handle: handing the path to pd.read_excel() for each sheet would re-open
# and re-parse the same file eight times. The context manager closes the
# handle as soon as the last sheet has been read.
with pd.ExcelFile(excel_file) as workbook:
    fact_shipment = pd.read_excel(workbook, sheet_name='fact_shipment')
    dim_customer = pd.read_excel(workbook, sheet_name='dim_customer')
    dim_delivery_address = pd.read_excel(workbook, sheet_name='dim_delivery_address')
    dim_pickup_address = pd.read_excel(workbook, sheet_name='dim_pickup_address')
    dim_date = pd.read_excel(workbook, sheet_name='dim_date')
    dim_service = pd.read_excel(workbook, sheet_name='dim_service')
    dim_carrier = pd.read_excel(workbook, sheet_name='dim_carrier')
    dim_country = pd.read_excel(workbook, sheet_name='dim_country')

In [None]:
# Sanity check after loading: report the (rows, columns) shape of every table.
loaded_tables = (
    ("Fact Shipment:", fact_shipment),
    ("Customer:", dim_customer),
    ("Delivery Address:", dim_delivery_address),
    ("Pickup Address:", dim_pickup_address),
    ("Date:", dim_date),
    ("Service:", dim_service),
    ("Carrier:", dim_carrier),
    ("Country:", dim_country),
)
for table_label, table in loaded_tables:
    print(table_label, table.shape)

## Carrier Analysis

In [None]:
# Quick look at the carrier dimension: its column index, a ten-row preview,
# and the describe() summary. Each heading and its value are emitted by one
# print call, separated by a newline.
print("Columns in the carrier dimension:", dim_carrier.columns, sep="\n")
print("\nFirst 10 rows of the carrier dimension:", dim_carrier.head(10), sep="\n")
print("\nSummary statistics for the carrier dimension:", dim_carrier.describe(), sep="\n")

In [None]:
# Number of distinct carrier_id values (nunique() skips missing IDs by default).
unique_carrier_count = dim_carrier['carrier_id'].nunique()
print(f"Number of unique carriers: {unique_carrier_count}")

In [None]:
# Distinct carrier IDs per domain, largest domains first.
print("\nCarriers by Domain:")
carrier_by_domain = (
    dim_carrier
    .groupby('domain_name')['carrier_id']
    .nunique()
    .sort_values(ascending=False)
)
print(carrier_by_domain)

In [None]:
# Bar chart of the (at most) ten domains with the most distinct carriers.
# carrier_by_domain is already sorted in descending order, so head(10)
# selects the largest domains.
# The figure and axes are created explicitly and passed to pandas, so the
# plot does not depend on whatever "current figure" pyplot is tracking.
fig, ax = plt.subplots(figsize=(12, 6))
carrier_by_domain.head(10).plot(kind='bar', ax=ax)
ax.set_title('Top Domains by Number of Carriers')
ax.set_xlabel('Domain')
ax.set_ylabel('Number of Carriers')
plt.show()

In [None]:
# Carrier names that are attached to more than one distinct carrier_id.
duplicate_carriers = (
    dim_carrier
    .groupby('name')['carrier_id']
    .nunique()
    .loc[lambda ids_per_name: ids_per_name > 1]
)

if duplicate_carriers.empty:
    print("No carriers with the same name but different IDs found.")
else:
    print("Carriers with the same name but different IDs:")
    print(duplicate_carriers)

In [None]:
# (name, domain_name) pairs that are attached to more than one distinct
# carrier_id. Note: this rebinds duplicate_carriers, replacing the
# name-only result of the previous check.
duplicate_carriers = (
    dim_carrier
    .groupby(['name', 'domain_name'])['carrier_id']
    .nunique()
    .loc[lambda ids_per_pair: ids_per_pair > 1]
)

if duplicate_carriers.empty:
    print("No carriers with the same name and domain but different IDs found.")
else:
    print("Carriers with the same name and domain but different IDs:")
    print(duplicate_carriers)

## Service Analysis

In [None]:
# Quick look at the service dimension: its column index, a ten-row preview,
# and the describe() summary. Each heading and its value are emitted by one
# print call, separated by a newline.
print("Columns in the service dimension:", dim_service.columns, sep="\n")
print("\nFirst 10 rows of the service dimension:", dim_service.head(10), sep="\n")
print("\nSummary statistics for the service dimension:", dim_service.describe(), sep="\n")

In [None]:
# Number of distinct service_id values (nunique() skips missing IDs by default).
unique_service_count = dim_service['service_id'].nunique()
print(f"Number of unique services: {unique_service_count}")

In [None]:
# Frequency table for each categorical service attribute, printed under its
# own heading (service types first, then transport types).
for heading, attribute in (
    ("\nDistribution of Service Types:", 'service_type'),
    ("\nDistribution of Transport Types:", 'transport_type'),
):
    print(heading)
    print(dim_service[attribute].value_counts())

In [None]:
# Distinct service IDs per domain, largest domains first.
service_by_domain = (
    dim_service
    .groupby('domain_name')['service_id']
    .nunique()
    .sort_values(ascending=False)
)
print("\nServices by Domain:", service_by_domain, sep="\n")

In [None]:
# Service names that are attached to more than one distinct service_id.
duplicate_service_names = (
    dim_service
    .groupby('name')['service_id']
    .nunique()
    .loc[lambda ids_per_name: ids_per_name > 1]
)

if duplicate_service_names.empty:
    print("No service names with multiple service IDs found.")
else:
    print("Service names with multiple service IDs:")
    print(duplicate_service_names)

In [None]:
# Check for duplicate columns in dim_service (excluding service_id and created_date).
# The column subset is built once and reused.
service_attributes = dim_service.loc[:, ~dim_service.columns.isin(['service_id', 'created_date'])]

# 1) Columns whose label repeats an earlier label. On its own this almost
#    never fires: pd.read_excel renames repeated headers on load
#    ("name", "name.1", ...), so labels are normally already unique.
repeated_label = service_attributes.columns.duplicated()

# 2) Columns whose values are identical to an earlier column, whatever the
#    label. Columns are fetched by position so a repeated label can never
#    return a DataFrame instead of a Series.
attribute_columns = [
    service_attributes.iloc[:, position]
    for position in range(service_attributes.shape[1])
]
repeated_content = [
    # A table with no rows has no values to compare, so nothing is flagged.
    len(column) > 0 and any(column.equals(earlier) for earlier in attribute_columns[:position])
    for position, column in enumerate(attribute_columns)
]

# A column is reported when either test flags it. Building the mask through a
# bool Series keeps it a boolean array even when there are no columns at all.
duplicate_mask = pd.Series(
    [bool(by_label) or by_content for by_label, by_content in zip(repeated_label, repeated_content)],
    dtype=bool,
).to_numpy()
duplicate_columns = service_attributes.columns[duplicate_mask]

if len(duplicate_columns) > 0:
    print("Duplicate columns in dim_service (excluding service_id and created_date):")
    print(duplicate_columns)
else:
    print("No duplicate columns found in dim_service (excluding service_id and created_date).")

## Customer Analysis

In [None]:
# Quick look at the customer dimension: its column index and the describe()
# summary. Each heading and its value are emitted by one print call,
# separated by a newline.
print("Columns in the customer dimension:", dim_customer.columns, sep="\n")
print("\nSummary statistics for the customer dimension:", dim_customer.describe(), sep="\n")

In [None]:
# Number of distinct customer_id values (nunique() skips missing IDs by default).
unique_customer_count = dim_customer['customer_id'].nunique()
print(f"Number of unique customers: {unique_customer_count}")

In [None]:
# How many customer rows fall into each value of the segmentation column.
print("Customer Segment Distribution:")
segment_counts = dim_customer['segmentation'].value_counts()
print(segment_counts)

In [None]:
# Customer Domain Distribution: distinct customers per domain, largest first.
# Uses nunique() rather than count() so that a customer_id appearing on
# several rows is counted once per domain. This matches the
# "Number of unique customers" figure and the per-domain carrier and service
# breakdowns, which also count distinct IDs.
print("Customer Domain Distribution:")
customers_by_domain = (
    dim_customer
    .groupby('domain_name')['customer_id']
    .nunique()
    .sort_values(ascending=False)
)
print(customers_by_domain)

In [None]:
# How many customer rows carry each value of the is_master flag.
print("Master vs. Sub-Customer Distribution:")
master_flag_counts = dim_customer['is_master'].value_counts()
print(master_flag_counts)