In [4]:
import numpy as np
import pandas as pd

def normalize_vector(vector):
    """Rescale ``vector`` so that its entries sum to 1.

    A vector whose entries sum to zero cannot be rescaled, so it is handed
    back unchanged instead of being divided by zero.
    """
    denominator = np.sum(vector)
    return vector if denominator == 0 else vector / denominator

def gini_coefficient(vector):
    """Return the Gini coefficient of a one-dimensional vector.

    The formula is meant for non-negative values. The result is 0 when every
    entry is equal and reaches ``(n - 1) / n`` when a single entry holds all
    of the mass, so a higher value means a more concentrated vector.

    Parameters
    ----------
    vector : array-like
        One-dimensional collection of numbers.

    Returns
    -------
    float
        The Gini coefficient, or ``nan`` when it is undefined because the
        vector is empty or its entries sum to zero.
    """
    values = np.sort(np.asarray(vector, dtype=float))
    n = values.size
    if n == 0:
        # Nothing to measure; indexing the last cumulative sum would fail.
        return np.nan
    cumulative_sum = np.cumsum(values)
    total = cumulative_sum[-1]
    if total == 0:
        # 0 / 0 below would yield nan plus a RuntimeWarning; be explicit.
        return np.nan
    return (n + 1 - 2 * np.sum(cumulative_sum) / total) / n

def entropy(vector):
    """Return the Shannon entropy, in bits, of ``vector`` treated as weights.

    The vector is first normalized with ``normalize_vector``. Entries with
    zero probability are left out of the sum because ``p * log2(p)`` tends to
    0 as ``p`` tends to 0. Skipping them avoids ``log2(0)`` without adding an
    epsilon inside the logarithm, which would bias the result and make a
    perfectly concentrated vector come out slightly negative.

    Parameters
    ----------
    vector : array-like
        One-dimensional collection of weights (e.g. counts).

    Returns
    -------
    float
        The entropy in bits; 0 for a vector with a single non-zero entry and
        for a vector whose entries sum to zero.
    """
    probabilities = np.asarray(normalize_vector(vector), dtype=float)
    positive = probabilities[probabilities > 0]
    # Adding 0.0 turns a possible negative zero into a plain 0.0.
    return -np.sum(positive * np.log2(positive)) + 0.0

def dispersion_measures(vectors):
    """Score every vector with both dispersion measures.

    Returns a pair ``(gini_values, entropy_values)`` of lists, each holding
    one score per input vector in input order. All Gini coefficients are
    computed first, then all entropies.
    """
    ginis = []
    for candidate in vectors:
        ginis.append(gini_coefficient(candidate))

    entropies = []
    for candidate in vectors:
        entropies.append(entropy(candidate))

    return ginis, entropies

def extract_lowest_values(vector, count=3):
    """Return the ``count`` smallest entries of ``vector`` with their positions.

    Parameters
    ----------
    vector : iterable
        Values to rank; they must be mutually comparable.
    count : int, optional
        How many entries to return. Defaults to 3.

    Returns
    -------
    list of tuple
        ``(position, value)`` pairs ordered from the smallest value upwards.
        Equal values keep their original relative order because the sort is
        stable. Fewer than ``count`` pairs are returned when ``vector`` is
        shorter than ``count``.

    Raises
    ------
    ValueError
        If ``count`` is negative (a negative slice bound would silently
        return the wrong entries).
    """
    if count < 0:
        raise ValueError("count must be non-negative")
    ranked = sorted(enumerate(vector), key=lambda pair: pair[1])
    return ranked[:count]

def extract_highest_values(vector, count=3):
    """Return the ``count`` largest entries of ``vector`` with their positions.

    Parameters
    ----------
    vector : iterable
        Values to rank; they must be mutually comparable.
    count : int, optional
        How many entries to return. Defaults to 3.

    Returns
    -------
    list of tuple
        ``(position, value)`` pairs ordered from the largest value downwards.
        Equal values keep their original relative order because the sort is
        stable, also with ``reverse=True``. Fewer than ``count`` pairs are
        returned when ``vector`` is shorter than ``count``.

    Raises
    ------
    ValueError
        If ``count`` is negative (a negative slice bound would silently
        return the wrong entries).
    """
    if count < 0:
        raise ValueError("count must be non-negative")
    ranked = sorted(enumerate(vector), key=lambda pair: pair[1], reverse=True)
    return ranked[:count]



In [48]:
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load this pickle if it comes from a trusted source.
df = pd.read_pickle("embedding_L12_clustered_LLM_bank_47_reclustered.pkl")

# Cross-tabulate the original labels (rows) against the predicted labels
# (columns): each cell counts the samples carrying that pair of labels.
confusion_matrix = pd.crosstab(df['Original_label'], df['Predicted Label'])

# One count vector per original label (matrix rows) and one per predicted
# label (matrix columns); the purity reports are computed from these.
row_vectors = [confusion_matrix.loc[label] for label in confusion_matrix.index]
column_vectors = [confusion_matrix[label] for label in confusion_matrix.columns]

# Last expression of the cell, so the notebook renders the matrix.
confusion_matrix

Predicted Label,ATM Cash Withdrawal Pending Issues,ATM/Cash Withdrawal Fees,Account Deletion Requests,Account Details Change,Activating New Card,Adding Money to American Express Accounts,Apple Pay Top-Up,Auto Top-Up Feature,Bank Account Transfers,Beneficiary Transfer Restrictions,...,Stolen Card Detection & Freeze,Stolen Phone: What to Do?,Top-Up Limits,Transfer Fees,Transfer Time,US Delivery Time,Unreceived Transactions,Unreceived Transferred Money,Unresolved UK Transfer Issues,Visa/Mastercard Acquisition Options
Original_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Refund_not_showing_up,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
activate_my_card,0,0,0,0,152,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
age_limit,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
apple_pay_or_google_pay,0,0,0,0,0,0,123,0,0,0,...,0,0,0,0,0,0,0,0,0,0
atm_support,84,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
virtual_card_not_working,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
visa_or_mastercard,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,122
why_verify_identity,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
wrong_amount_of_cash_received,178,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Heading (Italian): "How pure are the ORIGINAL topics with respect to the
# predicted ones". Each row of the confusion matrix is scored on its own.
print("Quanto sono puri i topic ORIGINALI rispetto ai previsti")
vectors = row_vectors
gini_values, entropy_values = dispersion_measures(vectors)

# Reading the scores: a high Gini coefficient and a low entropy both
# indicate a pure topic.
gini_mean = sum(gini_values)/len(gini_values)
entropy_mean = sum(entropy_values)/len(entropy_values)
print("Gini Mean:", gini_mean)
print("Entropy Mean:", entropy_mean)

# Least pure topics under each measure: lowest Gini, highest entropy.
lowest_values = extract_lowest_values(gini_values)
highest_values = extract_highest_values(entropy_values)

for heading, ranking in (
    ("Worst clusters by Gini", lowest_values),
    ("Worst clusters by Entropy", highest_values),
):
    print(heading)
    for index, value in ranking:
        # Positions refer to rows, so names come from the matrix index.
        print(f"Index: {index}, Value: {value}, Cluster: {confusion_matrix.index[index]}")

Quanto sono puri i topic ORIGINALI rispetto ai previsti
Gini Mean: 0.9713408482654264
Entropy Mean: 0.8900977754993852
Worst clusters by Gini
Index: 62, Value: 0.9138542148250886, Cluster: topping_up_by_card
Index: 66, Value: 0.926575698505523, Cluster: transfer_not_received_by_recipient
Index: 6, Value: 0.9314025805253875, Cluster: balance_not_updated_after_bank_transfer
Worst clusters by Entropy
Index: 62, Value: 2.7693269766821915, Cluster: topping_up_by_card
Index: 66, Value: 2.599321204103741, Cluster: transfer_not_received_by_recipient
Index: 6, Value: 2.4218642757001705, Cluster: balance_not_updated_after_bank_transfer


In [50]:
# Heading (Italian): "How pure are the PREDICTED topics with respect to the
# original ones". Each column of the confusion matrix is scored on its own.
print("Quanto sono puri i topic PREVISTI rispetto agli originali")
vectors = column_vectors
gini_values, entropy_values = dispersion_measures(vectors)

# Reading the scores: a high Gini coefficient and a low entropy both
# indicate a pure topic.
gini_mean = sum(gini_values)/len(gini_values)
entropy_mean = sum(entropy_values)/len(entropy_values)
print("Gini Mean:", gini_mean)
print("Entropy Mean:", entropy_mean)

# Least pure topics under each measure: lowest Gini, highest entropy.
lowest_values = extract_lowest_values(gini_values)
highest_values = extract_highest_values(entropy_values)

for heading, ranking in (
    ("Worst clusters by Gini", lowest_values),
    ("Worst clusters by Entropy", highest_values),
):
    print(heading)
    for index, value in ranking:
        # Positions refer to columns, so names come from the matrix columns.
        print(f"Index: {index}, Value: {value}, Cluster: {confusion_matrix.columns[index]}")

Quanto sono puri i topic PREVISTI rispetto agli originali
Gini Mean: 0.969202063380193
Entropy Mean: 1.1623742670654449
Worst clusters by Gini
Index: 59, Value: 0.8936018110330037, Cluster: Unreceived Transactions
Index: 43, Value: 0.9039587863117274, Cluster: Ordering New Cards
Index: 41, Value: 0.9197737746124843, Cluster: Mortgage Payment Error
Worst clusters by Entropy
Index: 59, Value: 3.38858880161513, Cluster: Unreceived Transactions
Index: 43, Value: 3.222123234589623, Cluster: Ordering New Cards
Index: 41, Value: 2.99084182654298, Cluster: Mortgage Payment Error
