### Evaluation Function

In [342]:
import pandas as pd

In [343]:
import pandas as pd

def evaluate_clusters(df, threshold_percent, weights):
    # Convert all columns to strings
    df = df.astype(str)
    df.columns = ['ID', 'material', 'group']
    
    # Aggregate materials per ID within each group
    aggregated_df = df.groupby(['ID', 'group'])['material'].apply(set).reset_index()
    
    # Calculate union
    union = aggregated_df.groupby('group').agg({'material': lambda x: set().union(*x)}).reset_index()
    
    # Calculate soft intersection based on the threshold presence
    def calc_intersection(group_df):
        total_ids = group_df['ID'].nunique()
        material_counts = group_df['material'].explode().value_counts()
        threshold = total_ids * threshold_percent
        return set(material_counts[material_counts > threshold].index)
    
    intersection = df.groupby('group').apply(calc_intersection).reset_index(name='material')
    
    # Create metrics DataFrame
    metrics = pd.DataFrame()
    metrics['Group'] = union['group']
    metrics['Union'] = union['material']
    metrics['Intersection'] = intersection['material']
    
    # Add 'Evaluation' column
    metrics['Evaluation'] = metrics.apply(lambda row: len(row['Intersection']) / len(row['Union']) if len(row['Union']) > 0 else 0, axis=1)
    
    metrics['Weight'] = weights

    metrics['Weighted Evaluation'] = metrics['Evaluation'] * metrics['Weight']

    metrics.loc[-1] = ['Weighted Average', None, None, metrics['Weighted Evaluation'].sum()/metrics['Weight'].sum(), None, None]  # adding a row
    
    return metrics


### Data

In [344]:
sales = pd.read_csv("ventas.csv")
sales.drop(['calmonth', 'uni_box'], axis = 1, inplace = True)
sales.head()

Unnamed: 0,CustomerId,material
0,499920078,9151
1,499920078,2287
2,499920078,4526
3,499920078,14050
4,499920078,1333


In [345]:
wc = [sales['CustomerId'].nunique()]

In [346]:
clusters = pd.read_csv('Aggregated_Clusters_3.csv')
clusters.drop(clusters.columns[0], axis = 1, inplace = True)
clusters.head()

Unnamed: 0,CustomerId,KMEANS_AGG,KMEANS_AGG_PCA,KMEANS_AGG_MCA,KMEANS_AGG_UMAP
0,499920078,0,1,3,4
1,499920499,0,1,3,4
2,499921473,2,4,1,2
3,499921557,4,2,1,2
4,499921908,4,2,1,2


In [347]:
wa = list(clusters['KMEANS_AGG'].value_counts())
wapca = list(clusters['KMEANS_AGG_PCA'].value_counts())
wamca = list(clusters['KMEANS_AGG_MCA'].value_counts())
waumap = list(clusters['KMEANS_AGG_UMAP'].value_counts())

In [348]:
customers = pd.read_csv('customers_sampled.csv')
customers = customers[['CustomerId', 'sub_canal_comercial']]
customers.head()

Unnamed: 0,CustomerId,sub_canal_comercial
0,499920078,Estanquillos / kioscos
1,499920499,Abarrotes / Almacenes / Bodegas / Víveres
2,499921473,Estanquillos / kioscos
3,499921557,Abarrotes / Almacenes / Bodegas / Víveres
4,499921908,Estanquillos / kioscos


In [349]:
ws = list(customers['sub_canal_comercial'].value_counts())

In [364]:
kmodes = pd.read_csv('KModes_Agg_Cluster.csv')
kmodes.drop(kmodes.columns[0], axis = 1, inplace = True)
kmodes.head()
# #kmodes = kmodes.groupby('CustomerId')['Cluster'].agg(lambda x: x.mode()[0]).reset_index()

# #kmodes.head()
# #kmodes['Cluster'].value_counts()

Unnamed: 0,CustomerId,Cluster
0,499920078,2
1,499920499,2
2,499921473,2
3,499921557,1
4,499921908,1


In [367]:
wmode = list(kmodes['Cluster'].value_counts())

In [351]:
# import pandas as pd

# # Load the data
# kmodes = pd.read_csv('K-Modes_Clusters.csv')
# kmodes.drop(kmodes.columns[0], axis=1, inplace=True)

# # Check for multiple clusters per CustomerId
# multiple_clusters = kmodes.groupby('CustomerId')['Cluster'].nunique() > 1

# # Filter CustomerIds with more than one unique cluster
# ids_with_multiple_clusters = multiple_clusters[multiple_clusters].index.tolist()

# if ids_with_multiple_clusters:
#     print("CustomerIds with different clusters assigned:", len(ids_with_multiple_clusters))
# else:
#     print("No CustomerIds have different clusters assigned.")

In [352]:
# kmodes.info()

### Evaluation

##### Control

In [353]:
control_groups = sales.copy()
control_groups['group'] = '1'
control_metrics = evaluate_clusters(control_groups, 0.50, wc)
control_metrics

Unnamed: 0,Group,Union,Intersection,Evaluation,Weight,Weighted Evaluation
0,1,"{784, 4130, 9087, 9486, 9673, 14296, 14060, 22...","{784, 4130, 9087, 9486, 14060, 5011, 779, 2112...",0.379481,2025.0,768.448702
-1,Weighted Average,,,0.379481,,


##### Aggregated K-Means

In [354]:
AGG_kmeans_groups = sales.merge(clusters[['CustomerId', 'KMEANS_AGG']], on='CustomerId', how='left')
AGG_kmeans_metrics = evaluate_clusters(AGG_kmeans_groups, 0.50, wa)
AGG_kmeans_metrics

Unnamed: 0,Group,Union,Intersection,Evaluation,Weight,Weighted Evaluation
0,0,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{784, 4130, 9087, 9486, 14060, 5011, 779, 2112...",0.435762,731.0,318.541722
1,1,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{4643, 375, 533, 4130, 9087, 101, 9208, 5011, ...",0.172511,561.0,96.778401
2,2,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{784, 4130, 9087, 9486, 14296, 14060, 5011, 22...",0.527851,356.0,187.915119
3,3,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{2485, 1334, 4643, 375, 533, 2251, 4130, 4975,...",0.335128,253.0,84.787349
4,4,"{784, 4130, 9087, 9486, 9673, 14296, 14060, 22...","{784, 4130, 9087, 9486, 14296, 14060, 2230, 50...",0.631081,124.0,78.254054
-1,Weighted Average,,,0.378408,,


##### Aggregated PCA K-Means

In [355]:
AGG_kmeans_pca_groups = sales.merge(clusters[['CustomerId', 'KMEANS_AGG_PCA']], on='CustomerId', how='left')
AGG_kmeans_pca_metrics = evaluate_clusters(AGG_kmeans_pca_groups, 0.50, wapca)
AGG_kmeans_pca_metrics

Unnamed: 0,Group,Union,Intersection,Evaluation,Weight,Weighted Evaluation
0,0,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{2485, 1334, 4643, 375, 533, 2251, 4130, 4975,...",0.335128,732.0,245.313594
1,1,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{784, 4130, 9087, 9486, 14060, 5011, 779, 2112...",0.435762,560.0,244.02649
2,2,"{784, 4130, 9087, 9486, 9673, 14296, 14060, 22...","{784, 4130, 9087, 9486, 14296, 14060, 2230, 50...",0.631081,356.0,224.664865
3,3,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{4643, 375, 533, 4130, 9087, 101, 9208, 5011, ...",0.173913,253.0,44.0
4,4,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{784, 4130, 9087, 9486, 14296, 14060, 5011, 22...",0.527851,124.0,65.453581
-1,Weighted Average,,,0.406646,,


#### Aggregated MCA K-Means

In [356]:
AGG_kmeans_mca_groups = sales.merge(clusters[['CustomerId', 'KMEANS_AGG_MCA']], on='CustomerId', how='left')
AGG_kmeans_mca_metrics = evaluate_clusters(AGG_kmeans_mca_groups, 0.50, wamca)
AGG_kmeans_mca_metrics

Unnamed: 0,Group,Union,Intersection,Evaluation,Weight,Weighted Evaluation
0,0,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 50...","{2485, 1334, 4643, 375, 533, 2251, 4130, 9087,...",0.304539,763.0,232.363104
1,1,"{784, 4130, 9087, 9486, 9673, 14296, 14060, 22...","{784, 4130, 9087, 9486, 14296, 14060, 2230, 77...",0.438303,731.0,320.399743
2,2,"{784, 4130, 9087, 9486, 9673, 14060, 5011, 13,...","{2485, 533, 4130, 9087, 101, 9208, 9872, 2287,...",0.201507,359.0,72.340866
3,3,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{2485, 1334, 4643, 375, 533, 2251, 4130, 4975,...",0.388669,134.0,52.081686
4,4,"{4643, 375, 4130, 14251, 9087, 9486, 303, 2186...","{2485, 9207, 119, 14457, 4130, 4975, 2483, 274...",0.172185,38.0,6.543046
-1,Weighted Average,,,0.337644,,


##### Aggregated UMAP K-Means

In [357]:
AGG_kmeans_umap_groups = sales.merge(clusters[['CustomerId', 'KMEANS_AGG_UMAP']], on='CustomerId', how='left')
AGG_kmeans_umap_metrics = evaluate_clusters(AGG_kmeans_umap_groups, 0.50, waumap)
AGG_kmeans_umap_metrics

Unnamed: 0,Group,Union,Intersection,Evaluation,Weight,Weighted Evaluation
0,0,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 50...","{9207, 9107, 452, 550, 4130, 520, 501, 9087, 1...",0.096346,451.0,43.451827
1,1,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{1334, 4643, 375, 533, 2251, 4130, 4975, 9087,...",0.33015,443.0,146.25648
2,2,"{784, 4130, 9087, 9486, 9673, 14296, 14060, 22...","{784, 4130, 9087, 9486, 14296, 14060, 5011, 22...",0.559126,432.0,241.542416
3,3,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{4643, 375, 533, 4130, 9087, 9486, 101, 9208, ...",0.236805,363.0,85.960057
4,4,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{784, 4130, 9087, 9486, 14060, 5011, 779, 2112...",0.437418,336.0,146.972332
-1,Weighted Average,,,0.327992,,


##### Sub Canal Comercial

In [358]:
default_groups = sales.merge(customers, on='CustomerId', how='left')
default_metrics = evaluate_clusters(default_groups, 0.50, ws)
default_metrics

Unnamed: 0,Group,Union,Intersection,Evaluation,Weight,Weighted Evaluation
0,Abarrotes / Almacenes / Bodegas / Víveres,"{784, 4130, 9087, 9486, 9673, 14296, 14060, 22...","{784, 4130, 9087, 9486, 14060, 5011, 14311, 19...",0.462725,586.0,271.156812
1,Carnicería / Pollería / Pescadería,"{784, 4130, 9087, 9486, 14296, 9673, 2230, 140...","{1334, 4643, 784, 533, 375, 2251, 4130, 780, 4...",0.446691,570.0,254.613971
2,Cerveza y Licores,"{784, 4130, 9087, 9486, 14296, 2230, 14060, 50...","{1334, 4643, 375, 533, 2251, 4130, 4975, 9087,...",0.394895,439.0,173.358859
3,Estanquillos / kioscos,"{784, 4130, 9087, 9486, 9673, 14296, 14060, 22...","{2485, 1334, 4643, 375, 533, 2251, 784, 4130, ...",0.398148,173.0,68.87963
4,Farmacia Independiente,"{4643, 375, 784, 2251, 4130, 14251, 9087, 9486...","{1334, 4643, 375, 533, 784, 4130, 4975, 9087, ...",0.408034,50.0,20.401691
5,Frutas y Verduras,"{784, 4130, 9087, 9486, 14296, 14060, 2230, 13...","{1334, 4643, 375, 533, 2251, 4130, 4975, 9087,...",0.424731,46.0,19.537634
6,Hogar con Venta,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 50...","{4643, 375, 533, 4130, 9087, 9486, 101, 9208, ...",0.226415,44.0,9.962264
7,Mayorista Abarrotero,"{784, 4130, 9087, 9486, 9673, 14060, 2230, 13,...","{1334, 4643, 375, 533, 2251, 784, 409, 4130, 6...",0.547206,32.0,17.510597
8,Minisuper / Minimarket,"{784, 4130, 9486, 9087, 14296, 14060, 2230, 98...","{784, 4130, 9087, 9486, 14060, 2230, 9871, 779...",0.562602,28.0,15.752846
9,Panadería / Pastelería,"{4643, 375, 2251, 4130, 14251, 9087, 9486, 303...","{2485, 1334, 4643, 375, 533, 4130, 4975, 9087,...",0.532225,28.0,14.902287


##### Aggregated K-Modes

In [368]:
AGG_kmodes_groups = sales.merge(kmodes, on='CustomerId', how='left')
AGG_kmeans_metrics = evaluate_clusters(AGG_kmeans_groups, 0.50, wmode)
AGG_kmeans_metrics

Unnamed: 0,Group,Union,Intersection,Evaluation,Weight,Weighted Evaluation
0,0,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{784, 4130, 9087, 9486, 14060, 5011, 779, 2112...",0.435762,1468.0,639.698013
1,1,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{4643, 375, 533, 4130, 9087, 101, 9208, 5011, ...",0.172511,465.0,80.217391
2,2,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{784, 4130, 9087, 9486, 14296, 14060, 5011, 22...",0.527851,52.0,27.448276
3,3,"{784, 4130, 9087, 9486, 14296, 9673, 14060, 22...","{2485, 1334, 4643, 375, 533, 2251, 4130, 4975,...",0.335128,33.0,11.059219
4,4,"{784, 4130, 9087, 9486, 9673, 14296, 14060, 22...","{784, 4130, 9087, 9486, 14296, 14060, 2230, 50...",0.631081,7.0,4.417568
-1,Weighted Average,,,0.376711,,
