In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the 'customer' sheet of the source workbook into a DataFrame
file_path = "../datasets/2024-08-01_LMS_data_2023.xlsx"
customer_df = pd.read_excel(file_path, sheet_name='customer')

In [None]:
# Basic Counts: total rows, then the number of distinct values in each identifier column
print("=== Basic Counts ===")
print(f"Total customer records: {customer_df.shape[0]}")
for count_label, count_column in [
    ("customer IDs", "customer_id"),
    ("sequence numbers", "sequencenumber"),
    ("structure numbers", "structurenumber"),
]:
    # nunique() ignores NaN by default, so missing values are not counted as a distinct value
    print(f"Unique {count_label}: {customer_df[count_column].nunique()}")

In [None]:
# Rows whose customer_id repeats an earlier row; duplicated() leaves the first occurrence unflagged
is_repeated_id = customer_df['customer_id'].duplicated()
duplicate_ids = customer_df.loc[is_repeated_id]
print(f"\nDuplicate customer IDs: {duplicate_ids.shape[0]}")

In [None]:
# Relationship between customer_id and sequencenumber, measured in both directions:
# distinct sequence numbers per customer, and distinct customers per sequence number.
customer_sequence_mapping = customer_df.groupby('customer_id')['sequencenumber'].nunique()
sequence_customer_mapping = customer_df.groupby('sequencenumber')['customer_id'].nunique()

# Summing a boolean mask counts the groups that map to more than one value
multi_sequence_customers = int(customer_sequence_mapping.gt(1).sum())
multi_customer_sequences = int(sequence_customer_mapping.gt(1).sum())

print("\n=== Customer ID to Sequence Number Relationship ===")
print(f"Customers with multiple sequence numbers: {multi_sequence_customers}")
print(f"Sequence numbers with multiple customer IDs: {multi_customer_sequences}")


In [None]:
# Split rows into masters (sequencenumber equals structurenumber) and subs (the two differ).
# Rows with a missing value on either side compare as "not equal" and therefore land in subs.
is_master_row = customer_df['sequencenumber'].eq(customer_df['structurenumber'])
is_sub_row = customer_df['sequencenumber'].ne(customer_df['structurenumber'])
masters = customer_df.loc[is_master_row]
subs = customer_df.loc[is_sub_row]

print("\n=== Master/Sub Structure Analysis ===")
print(f"Master customers: {masters.shape[0]}")
print(f"Sub companies: {subs.shape[0]}")


In [None]:
# Show one example hierarchy: every row sharing the structurenumber of the first sub row
if not subs.empty:
    sample_structure = subs['structurenumber'].iloc[0]
    hierarchy_columns = ['customer_id', 'sequencenumber', 'structurenumber', 'segmentation']
    in_sample_structure = customer_df['structurenumber'] == sample_structure
    example_hierarchy = customer_df.loc[in_sample_structure, hierarchy_columns]
    print("\nExample of one hierarchy:")
    print(example_hierarchy.sort_values('sequencenumber'))


In [None]:
# Collect the rows that have no sequence number at all
print("NaN sequence numbers:")
has_missing_sequence = customer_df['sequencenumber'].isna()
nan_sequences = customer_df.loc[has_missing_sequence]
print(f"Number of NaN sequence numbers: {nan_sequences.shape[0]}")

In [None]:
# Sequence numbers that appear on more than one customer row.
# NOTE(review): 'count' tallies non-null customer_id rows per sequence number, not distinct
# IDs, so a sequence number repeated for the same customer_id is flagged too — confirm this
# is intended (the earlier relationship check uses nunique()).
customer_rows_per_sequence = customer_df.groupby('sequencenumber')['customer_id'].agg(['count'])
duplicate_sequences = customer_rows_per_sequence[customer_rows_per_sequence['count'] > 1]
print("\nSequence numbers with multiple customer_ids:")
print(f"Count: {duplicate_sequences.shape[0]}")
print("\nSample of sequence numbers with multiple customer_ids:")
print(duplicate_sequences.head())

In [None]:
# Analyze all cases of duplicate sequence numbers
print("\nDetailed analysis of all duplicate sequence numbers:")

# Select every row whose sequence number is flagged in duplicate_sequences with a single
# vectorized filter instead of concatenating one slice per sequence number in a loop.
# Sorting by (sequencenumber, customer_id) reproduces the loop's ordering: groups in
# ascending sequence number, rows inside each group ordered by customer_id.
# Unlike starting from an empty pd.DataFrame(), the result always keeps customer_df's
# columns, so the column selection below still works when no sequence number is shared.
is_shared_sequence = customer_df['sequencenumber'].isin(duplicate_sequences.index)
duplicate_cases = customer_df[is_shared_sequence].sort_values(['sequencenumber', 'customer_id'])

print("\nAll cases where sequence numbers are shared:")
print(duplicate_cases[['customer_id', 'sequencenumber', 'structurenumber', 'segmentation']].to_string())


In [None]:
# Summarize the rows that share a sequence number: master/sub split, segmentation mix,
# and how many distinct structure numbers each shared sequence number maps to.
print("\nPatterns in duplicate cases:")
print("1. Master/Sub company distribution:")
# A row is a master when its sequencenumber equals its structurenumber, otherwise a sub
is_master_case = duplicate_cases['sequencenumber'].eq(duplicate_cases['structurenumber'])
is_sub_case = duplicate_cases['sequencenumber'].ne(duplicate_cases['structurenumber'])
master_count = int(is_master_case.sum())
sub_count = int(is_sub_case.sum())
print(f"Master companies: {master_count}")
print(f"Sub companies: {sub_count}")

print("\n2. Segmentation distribution:")
segmentation_distribution = duplicate_cases['segmentation'].value_counts()
print(segmentation_distribution.sort_index())

print("\n3. Check if structure numbers are also duplicated:")
structure_dupes = duplicate_cases.groupby('sequencenumber')['structurenumber'].nunique()
print("Number of unique structure numbers per sequence number:")
# Histogram: how many sequence numbers map to 1, 2, ... distinct structure numbers
print(structure_dupes.value_counts().sort_index())

In [None]:
def analyze_duplicate_sequence_impact(customer_df):
    """
    Report what would be affected by removing every customer row whose
    sequence number is shared with at least one other row.

    A textual summary is printed to stdout; the row subsets behind the
    summary are returned for further inspection.

    Parameters
    ----------
    customer_df : pandas.DataFrame
        Must provide the columns 'customer_id', 'sequencenumber',
        'structurenumber' and 'segmentation'.

    Returns
    -------
    dict
        'customers_to_drop'   : rows whose sequencenumber occurs more than once.
        'dependent_customers' : rows whose structurenumber equals one of those
                                shared sequence numbers and whose customer_id is
                                not among the rows to drop.
        'masters_to_drop'     : rows to drop where sequencenumber equals
                                structurenumber.
    """
    detail_columns = ['customer_id', 'sequencenumber', 'structurenumber', 'segmentation']

    def show_details(frame):
        # Print the identifying columns of a row subset in full (no truncation)
        print(frame[detail_columns].to_string())

    # value_counts() skips NaN by default, so missing sequence numbers are never treated as shared
    occurrences = customer_df['sequencenumber'].value_counts()
    shared_sequences = occurrences.loc[occurrences.gt(1)].index

    # Every row carrying one of the shared sequence numbers
    shares_sequence = customer_df['sequencenumber'].isin(shared_sequences)
    customers_to_drop = customer_df.loc[shares_sequence]

    print("=== Impact Analysis of Dropping Customers ===\n")
    print(f"Number of customers to potentially drop: {len(customers_to_drop)}")

    # Rows that reference a dropped sequence number as their structure number,
    # excluding any row whose customer_id is itself among the rows to drop
    potential_masters = customers_to_drop['sequencenumber'].unique()
    references_dropped = customer_df['structurenumber'].isin(potential_masters)
    has_dropped_id = customer_df['customer_id'].isin(customers_to_drop['customer_id'])
    dependent_customers = customer_df.loc[references_dropped & ~has_dropped_id]

    print("\nDependency Analysis:")
    print(f"Number of dependent customers found: {len(dependent_customers)}")

    if not dependent_customers.empty:
        print("\nDetailed Dependency Breakdown:")
        print("\nMaster-Sub Relationships at Risk:")
        # One section per referenced master, in order of first appearance
        for master_seq in dependent_customers['structurenumber'].unique():
            master_rows = customers_to_drop.loc[
                customers_to_drop['sequencenumber'].eq(master_seq)
            ]
            dependent_rows = dependent_customers.loc[
                dependent_customers['structurenumber'].eq(master_seq)
            ]

            print(f"\nMaster Sequence Number: {master_seq}")
            print(f"Number of dependent customers: {len(dependent_rows)}")
            print("\nMaster Details:")
            show_details(master_rows)
            print("\nDependent Customers:")
            show_details(dependent_rows)

    # Segmentation mix of the rows that would be removed
    print("\nSegmentation Distribution of Customers to Drop:")
    print(customers_to_drop['segmentation'].value_counts().sort_index())

    # Masters among the rows to drop: sequencenumber equals structurenumber
    is_master = customers_to_drop['sequencenumber'].eq(customers_to_drop['structurenumber'])
    masters_to_drop = customers_to_drop.loc[is_master]
    print(f"\nNumber of master companies among duplicates: {len(masters_to_drop)}")

    return dict(
        customers_to_drop=customers_to_drop,
        dependent_customers=dependent_customers,
        masters_to_drop=masters_to_drop,
    )

# Execute the impact analysis on the loaded customer table; the returned dict of row subsets is kept
impact_analysis = analyze_duplicate_sequence_impact(customer_df=customer_df)