In [2]:
import pandas as pd
import pickle
import multiprocessing

In [3]:
# Load the banking conversation dataset; the relative path is resolved against the
# kernel's working directory. Later cells read the "text" column from this frame.
df = pd.read_csv("banking_conv.csv")

In [4]:
# Display the loaded frame as a quick sanity check of its columns and size.
df

Unnamed: 0,text,category
0,I am still waiting on my card?,card_arrival
1,What can I do if my card still hasn't arrived ...,card_arrival
2,I have been waiting over a week. Is the card s...,card_arrival
3,Can I track my card while it is in the process...,card_arrival
4,"How do I know if I will get my card, or if it ...",card_arrival
...,...,...
9998,You provide support in what countries?,country_support
9999,What countries are you supporting?,country_support
10000,What countries are getting support?,country_support
10001,Are cards available in the EU?,country_support


In [1]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired

## Representation model with LLM

In [6]:
from ctransformers import AutoModelForCausalLM, AutoConfig
from transformers import AutoTokenizer, pipeline

# Hub repository holding the quantized GGUF weights, and the specific file to load from it.
LLM_name = "TheBloke/zephyr-7B-alpha-GGUF"
LLM_file = "zephyr-7b-alpha.Q4_K_M.gguf"

config = AutoConfig.from_pretrained(LLM_name)
# Cap the number of generated tokens and set the context window size on the ctransformers config.
# NOTE(review): max_new_tokens is 32 here but the pipeline below passes max_new_tokens=50 —
# confirm which limit is intended and keep the two in sync.
config.config.max_new_tokens = 32
config.config.context_length = 4096


# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
# hf=True is passed so the returned model can be handed to the transformers pipeline below.
model = AutoModelForCausalLM.from_pretrained(
    LLM_name,
    model_file = LLM_file,
    model_type = "mistral",
    gpu_layers=0,
    config = config,
    hf=True
)

# The tokenizer is loaded from the original (non-GGUF) Zephyr repository.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")#,config = config)


# Text-generation pipeline that is later wrapped by BERTopic's TextGeneration representation.
generator = pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    max_new_tokens=50,
    repetition_penalty=1.1
)

# Labeling prompt written with Zephyr's <|system|>/<|user|>/<|assistant|> chat markers.
# [DOCUMENTS] and [KEYWORDS] are placeholders; presumably the representation model
# substitutes each topic's documents and keywords for them — verify against the BERTopic docs.
prompt = """<|system|>You are a helpful, respectful and honest assistant for labeling topics..</s>
<|user|>
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.</s>
<|assistant|>"""

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
from bertopic.representation import TextGeneration

# Topic Modeling
#
# Pipeline: sentence embeddings -> UMAP reduction -> HDBSCAN clustering -> c-TF-IDF keywords
# -> one LLM-generated label per topic (stored under the "Zephyr" aspect).

SEED = 42            # fixes UMAP's random initialisation so topics are repeatable across runs
SAMPLE_FRAC = None   # e.g. 0.3 to prototype on a random subset; None uses every row

# Sample from a local view instead of overwriting `df`, so other cells still see the full frame.
texts = df["text"] if SAMPLE_FRAC is None else df["text"].sample(frac=SAMPLE_FRAC, random_state=SEED)
# Keep only genuine strings; any other value (e.g. NaN from an empty CSV field) is skipped.
docs = [item for item in texts if isinstance(item, str)]
if not docs:
    raise ValueError('No string documents found in df["text"]; nothing to cluster.')

# Used both as UMAP's neighbourhood size and as HDBSCAN's minimum cluster size.
min_cluster_size = 20

# Without random_state UMAP is stochastic: every run yields different clusters and therefore
# different LLM labels. Seeding trades some speed (UMAP runs single-threaded) for reproducibility.
umap_model = UMAP(n_neighbors=min_cluster_size, n_components=5, min_dist=0.0, metric='cosine',
                  random_state=SEED)
# prediction_data=True keeps the extra data HDBSCAN needs to assign new points after fitting.
hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean',
                        cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer()
ctfidf_model = ClassTfidfTransformer()

# Extra topic representation: the LLM pipeline labels each topic using the labeling prompt.
zephyr = TextGeneration(generator, prompt=prompt)
representation_model = {"Zephyr": zephyr}

topic_model = BERTopic(embedding_model="all-MiniLM-L12-v2",
                       umap_model=umap_model, hdbscan_model=hdbscan_model,
                       vectorizer_model=vectorizer_model,
                       ctfidf_model=ctfidf_model,
                       representation_model=representation_model,
                       verbose=True)
topics, probs = topic_model.fit_transform(docs)
# One row per document with its assigned topic, keywords and the "Zephyr" label.
dfs_topic = topic_model.get_document_info(docs)

def embed(text):
    """Embed ``text`` with the notebook-level ``topic_model``.

    Returns the first entry of what ``topic_model._extract_embeddings`` yields for ``text``.
    """
    vectors = topic_model._extract_embeddings(text)
    return vectors[0]

def clean_llm_text(text):
    """Collapse an iterable of LLM output fragments into one clean label.

    Fragments that are empty or whitespace-only are dropped; every remaining
    fragment has surrounding double quotes, spaces and newlines stripped, and
    the results are joined with single spaces.
    """
    cleaned = []
    for fragment in text:
        if not fragment.strip():
            # Blank padding entries carry no label text.
            continue
        cleaned.append(fragment.strip('" \n'))
    return ' '.join(cleaned)
    
# Embed every document in a single batched call. Applying a per-row embedding function
# would invoke the embedding model once per document, which is far slower on
# thousands of rows and yields the same vectors.
doc_vectors = topic_model._extract_embeddings(dfs_topic['Document'].tolist())
# list(...) splits the 2-D result into one vector per row, stored as object cells.
dfs_topic['embedding'] = list(doc_vectors)
# The "Zephyr" cells are lists of text fragments; collapse each into a single label string.
dfs_topic['Zephyr'] = dfs_topic['Zephyr'].apply(clean_llm_text)

2024-02-23 15:47:21,992 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2024-02-23 15:48:48,271 - BERTopic - Embedding - Completed ✓
2024-02-23 15:48:48,272 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-23 15:49:17,451 - BERTopic - Dimensionality - Completed ✓
2024-02-23 15:49:17,453 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-23 15:49:17,752 - BERTopic - Cluster - Completed ✓
2024-02-23 15:49:17,761 - BERTopic - Representation - Extracting topics from clusters using representation models.
  7%|█████▎                                                                         | 11/165 [05:44<1:17:50, 30.33s/it]

In [27]:
# Peek at the first two rows of the per-document topic table.
dfs_topic.head(2)

Unnamed: 0,Document,Topic,Name,Representation,Zephyr,Representative_Docs,Top_n_words,Probability,Representative_document,embedding
0,There is a payment that I made that hasn't gon...,2,2_payment_my_card_declined,"[payment, my, card, declined, debit, pending, ...",Payment Issues with My Card,"[Why is the payment on my card still pending?,...",payment - my - card - declined - debit - pendi...,0.850679,False,"[-0.027261687, -0.008652397, 0.050006807, 0.02..."
1,Yesterday I had my phone stolen. Please advis...,13,13_phone_passcode_app_stolen,"[phone, passcode, app, stolen, my, do, lost, p...",Stolen Phone: App Usage & Recovery,"[What do I do if I lost my phone?, What do I d...",phone - passcode - app - stolen - my - do - lo...,0.874444,False,"[-0.029332826, 0.055583503, 0.031581942, -0.11..."


In [28]:
# Show the per-topic summary table from the fitted topic model.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Zephyr,Representative_Docs
0,-1,302,-1_top_up_to_beneficiary,"[top, up, to, beneficiary, my, is, can, where,...","[\n""Google/Apple Pay Top Up Issues"", , , , , ,...",[Why can't I get my top up to work in Apple Pa...
1,0,412,0_card_new_my_do,"[card, new, my, do, it, get, to, can, activate...","[\n""Activating and Linking New Cards"", , , , ,...",[How long does it take for me to get my new ca...
2,1,361,1_transfer_it_long_the,"[transfer, it, long, the, to, account, money, ...","[\n""Transfer Timeframe"", , , , , , , , , ]",[I made a transfer from France a couple days a...
3,2,278,2_payment_my_card_declined,"[payment, my, card, declined, debit, pending, ...","[\nPayment Issues with My Card, , , , , , , , , ]","[Why is the payment on my card still pending?,..."
4,3,207,3_charged_fee_extra_charge,"[charged, fee, extra, charge, for, why, an, on...","[\n""Card Payment Fees"", , , , , , , , , ]",[Why have I been charged a fee for card paymen...
5,4,176,4_atm_cash_withdrawal_the,"[atm, cash, withdrawal, the, amount, withdraw,...","[\n""ATM Cash Withdrawal Issues"", , , , , , , ,...","[Why can't I withdraw cash from this ATM?, My ..."
6,5,161,5_currencies_currency_exchange_you,"[currencies, currency, exchange, you, countrie...","[\n""Currency Exchange Options"", , , , , , , , , ]","[Which currencies will you exchange?, need to ..."
7,6,140,6_top_up_auto_card,"[top, up, auto, card, can, do, what, the, is, to]","[\n""Card Top-Up Instructions"", , , , , , , , , ]","[How do I use my card to top up?, How do I top..."
8,7,106,7_pin_change_unblock_my,"[pin, change, unblock, my, can, too, do, to, m...","[\n""PIN Change Instructions"", , , , , , , , , ]","[What do I have to do to change my pin?, Can I..."
9,8,106,8_rate_exchange_wrong_the,"[rate, exchange, wrong, the, applied, was, inc...","[\n""Incorrect exchange rates"", , , , , , , , , ]","[the exchange rate for my withdrawal is wrong,..."


In [30]:
# Persist the per-document topic table so later cells can reuse it without refitting.
topics_pickle_path = 'embedding_L12_clustered_LLM_banking_new.pkl'
dfs_topic.to_pickle(topics_pickle_path)

# Read the file straight back and list the distinct LLM labels as a round-trip check.
reloaded_topics = pd.read_pickle(topics_pickle_path)
reloaded_topics["Zephyr"].unique()

array(['Payment Issues with My Card',
       'Stolen Phone: App Usage & Recovery', 'PIN Change Instructions',
       'Card Payment Fees', 'Transfer Timeframe', 'Top Up Pending Issues',
       'Google/Apple Pay Top Up Issues', 'Account Details Change',
       'Activating and Linking New Cards',
       'Adding funds to account via payment methods',
       'ATM Cash Withdrawal Issues', 'Disposable Virtual Cards',
       'Refund Inquiries', 'Card Top-Up Instructions',
       'Currency Exchange Options', 'Identity Verification Guidelines',
       'Incorrect exchange rates', 'Cancel Transaction (Mistake)',
       'ATM Cash Withdrawal Fee Charge'], dtype=object)

In [6]:
# Reload the per-document topic table written by the to_pickle call earlier in this notebook.
# Only unpickle files you produced yourself: loading a pickle can execute arbitrary code.
embedding_llm = pd.read_pickle("embedding_L12_clustered_LLM_banking_new.pkl")

In [12]:
# Display the reloaded topic table to confirm it round-tripped intact.
embedding_llm

Unnamed: 0,Document,Topic,Name,Representation,Zephyr,Representative_Docs,Top_n_words,Probability,Representative_document,embedding
0,There is a payment that I made that hasn't gon...,2,2_payment_my_card_declined,"[payment, my, card, declined, debit, pending, ...",Payment Issues with My Card,"[Why is the payment on my card still pending?,...",payment - my - card - declined - debit - pendi...,0.850679,False,"[-0.027261687, -0.008652397, 0.050006807, 0.02..."
1,Yesterday I had my phone stolen. Please advis...,13,13_phone_passcode_app_stolen,"[phone, passcode, app, stolen, my, do, lost, p...",Stolen Phone: App Usage & Recovery,"[What do I do if I lost my phone?, What do I d...",phone - passcode - app - stolen - my - do - lo...,0.874444,False,"[-0.029332826, 0.055583503, 0.031581942, -0.11..."
2,My account is locked because I used the wrong ...,7,7_pin_change_unblock_my,"[pin, change, unblock, my, can, too, do, to, m...",PIN Change Instructions,"[What do I have to do to change my pin?, Can I...",pin - change - unblock - my - can - too - do -...,0.778512,False,"[-0.03897612, 0.0229855, 0.03359894, 0.0223477..."
3,How come a fee was charged when I tranferred m...,3,3_charged_fee_extra_charge,"[charged, fee, extra, charge, for, why, an, on...",Card Payment Fees,[Why have I been charged a fee for card paymen...,charged - fee - extra - charge - for - why - a...,0.691894,False,"[0.017397415, 0.050200198, -0.05818079, 0.0484..."
4,Transfers from the US how long will the wait be?,1,1_transfer_it_long_the,"[transfer, it, long, the, to, account, money, ...",Transfer Timeframe,[I made a transfer from France a couple days a...,transfer - it - long - the - to - account - mo...,1.000000,False,"[-0.078121305, -0.11529321, -0.053448346, -0.0..."
...,...,...,...,...,...,...,...,...,...,...
2996,My payment shows that it is pending will I be ...,2,2_payment_my_card_declined,"[payment, my, card, declined, debit, pending, ...",Payment Issues with My Card,"[Why is the payment on my card still pending?,...",payment - my - card - declined - debit - pendi...,0.738392,False,"[0.0021469349, -0.010928848, 0.018210975, 0.06..."
2997,Does a US transfer take long,1,1_transfer_it_long_the,"[transfer, it, long, the, to, account, money, ...",Transfer Timeframe,[I made a transfer from France a couple days a...,transfer - it - long - the - to - account - mo...,1.000000,False,"[-0.08347549, -0.08342955, -0.06266544, -0.030..."
2998,Am I able to get both a Visa and MasterCard fr...,0,0_card_new_my_do,"[card, new, my, do, it, get, to, can, activate...",Activating and Linking New Cards,[How long does it take for me to get my new ca...,card - new - my - do - it - get - to - can - a...,0.568783,False,"[0.07526371, 0.010289823, 0.047005657, 0.05421..."
2999,my virtual card has not came yet!,9,9_virtual_disposable_cards_card,"[virtual, disposable, cards, card, can, many, ...",Disposable Virtual Cards,"[Is there a disposable virtual card?, Can I ge...",virtual - disposable - cards - card - can - ma...,0.934542,False,"[0.026032805, -0.010709709, 0.11159341, 0.0042..."


In [26]:
# Attach the original columns of `df` (including the ground-truth category) to every
# clustered document by joining on the document text.
df = df.rename(columns={"text": "Document"})
# Give both join keys the same dtype so they compare consistently in the merge.
df['Document'] = df['Document'].astype('string')
embedding_llm['Document'] = embedding_llm['Document'].astype('string')
print(embedding_llm.Document.dtype)

# The join key is free text, so it is not guaranteed unique in `df`. A text that appears
# k times in `df` would match each clustered row with that text k times and silently
# multiply rows. Keep one lookup row per text (the first occurrence wins if the same text
# carries different categories) so each clustered document appears exactly once.
category_lookup = df.drop_duplicates(subset='Document')
merged_df = pd.merge(
    category_lookup,
    embedding_llm,
    on='Document',
    suffixes=('_left', '_right'),
    validate='one_to_many',  # fail loudly if the lookup side ever stops being unique
)
merged_df.to_pickle("embedding_L12_clustered_LLM_banking_classes_new.pkl")


string


In [27]:
# Reload the merged table (topic info plus original columns) written by the merge cell.
embedding_llm_classes = pd.read_pickle("embedding_L12_clustered_LLM_banking_classes_new.pkl")