In [1]:
import pandas as pd
import pickle
import multiprocessing

In [2]:
def open_dataset(path):
    df = pd.read_excel(path)
    return df

def clean_dataset(df):
    df = df[['Text']].dropna().drop_duplicates()
    df['Text'] = df['Text'].str.strip()
    rows_to_drop = df[df['Text']==''].index
    df.drop(rows_to_drop, inplace=True)
    return df

df_raw = open_dataset("./Chat_Team_CaseStudy FINAL.xlsx")

In [3]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired

In [73]:
df = clean_dataset(df_raw)#.sample(frac=0.2)
docs = [item for item in df["Text"] if isinstance(item, str)]

umap_model = UMAP(n_neighbors=130, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = KeyBERTInspired()


topic_model = BERTopic(embedding_model="all-MiniLM-L12-v2",umap_model=umap_model,hdbscan_model=hdbscan_model,vectorizer_model=vectorizer_model,ctfidf_model=ctfidf_model,representation_model=representation_model)
topics, probs = topic_model.fit_transform(docs)
dfs_topic = topic_model.get_document_info(docs)

def embed(text):
    return topic_model._extract_embeddings(text)[0]

dfs_topic['embedding'] = dfs_topic['Document'].apply(embed)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
dfs_topic.head(10)

In [None]:
topic_model.get_topic_info()

In [75]:
#dfs_topic.to_pickle('embedding_L12_clustered.pkl')

## Representation model with LLM

In [5]:
from ctransformers import AutoModelForCausalLM, AutoConfig
from transformers import AutoTokenizer, pipeline

LLM_name = "TheBloke/zephyr-7B-alpha-GGUF"
LLM_file = "zephyr-7b-alpha.Q4_K_M.gguf"

config = AutoConfig.from_pretrained(LLM_name)
# Explicitly set the max_seq_len
config.config.max_new_tokens = 32
config.config.context_length = 4096


# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
model = AutoModelForCausalLM.from_pretrained(
    LLM_name,
    model_file = LLM_file,
    model_type = "mistral",
    gpu_layers=0,
    config = config,
    hf=True
)

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")#,config = config)


# Pipeline
generator = pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    max_new_tokens=50,
    repetition_penalty=1.1
)

prompt = """<|system|>You are a helpful, respectful and honest assistant for labeling topics..</s>
<|user|>
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.</s>
<|assistant|>"""

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [32]:
from bertopic.representation import TextGeneration

# Topic Modeling

df = clean_dataset(df_raw).sample(frac=0.3)
docs = [item for item in df["Text"] if isinstance(item, str)]

min_cluster_size = 50

umap_model = UMAP(n_neighbors=min_cluster_size, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer()
ctfidf_model = ClassTfidfTransformer()

zephyr = TextGeneration(generator, prompt=prompt)
representation_model = {"Zephyr": zephyr}

topic_model = BERTopic(embedding_model="all-MiniLM-L12-v2",umap_model=umap_model,hdbscan_model=hdbscan_model,vectorizer_model=vectorizer_model,ctfidf_model=ctfidf_model,representation_model=representation_model, verbose=True)
topics, probs = topic_model.fit_transform(docs)
dfs_topic = topic_model.get_document_info(docs)

def embed(text):
    return topic_model._extract_embeddings(text)[0]

def clean_llm_text(text):
    return ' '.join([item.strip('" \n') for item in text if item.strip()])
    
dfs_topic['embedding'] = dfs_topic['Document'].apply(embed)
dfs_topic['Zephyr'] = dfs_topic['Zephyr'].apply(clean_llm_text)

2024-02-21 21:50:07,730 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/171 [00:00<?, ?it/s]

2024-02-21 21:50:18,805 - BERTopic - Embedding - Completed ✓
2024-02-21 21:50:18,805 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-21 21:50:27,545 - BERTopic - Dimensionality - Completed ✓
2024-02-21 21:50:27,545 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-21 21:50:27,651 - BERTopic - Cluster - Completed ✓
2024-02-21 21:50:27,653 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 16/16 [07:20<00:00, 27.52s/it]
2024-02-21 21:57:48,161 - BERTopic - Representation - Completed ✓


In [33]:
dfs_topic.head(2)

Unnamed: 0,Document,Topic,Name,Representation,Zephyr,Representative_Docs,Top_n_words,Probability,Representative_document,embedding
0,When I go to manage my reservation it shows Th...,-1,-1_to_the_and_for,"[to, the, and, for, on, my, in, is, flight, it]","Flight nightmares: Qatar Airways, GOL, TAP, an...","[End of March 2012 I made a trip to Manaus, Br...",to - the - and - for - on - my - in - is - fli...,0.0,False,"[0.081806935, -0.012164544, -0.033476178, -0.0..."
1,I sign into cname--.net to get into my emails....,3,3_cname_my_to_email,"[cname, my, to, email, for, the, can, get, is,...",CNAME Account Management Inquiries,[Is there any way I can still get my cname----...,cname - my - to - email - for - the - can - ge...,0.739251,False,"[-0.019243529, -0.0052302536, 0.013686666, 0.0..."


In [34]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Zephyr,Representative_Docs
0,-1,1951,-1_to_the_and_for,"[to, the, and, for, on, my, in, is, flight, it]","[\nFlight nightmares: Qatar Airways, GOL, TAP,...","[End of March 2012 I made a trip to Manaus, Br..."
1,0,442,0_my_email_password_account,"[my, email, password, account, sign, to, and, ...","[\n""Email/Account Login Issues"", , , , , , , ,...",[Why can I not sign in with name user name and...
2,1,435,1_train_from_to_station,"[train, from, to, station, the, ca, is, trip, ...","[\n""California Train Travel Inquiries"", , , , ...","[If I have a ticket for a 9 am train , can I u..."
3,2,426,2_to_the_and_from,"[to, the, and, from, in, with, we, of, for, on]","[\nAir Travel Experiences: Budget Airlines, Mi...",[DeI recently (June and July2010) travelled to...
4,3,259,3_cname_my_to_email,"[cname, my, to, email, for, the, can, get, is,...","[\n""CNAME Account Management Inquiries"", , , ,...",[Is there any way I can still get my cname----...
5,4,250,4_seats_seat_the_to,"[seats, seat, the, to, on, and, in, for, are, is]","[\nSeat Assignments and Refusal to Switch, , ,...","[Hi, Just wondered if specific seats can be bo..."
6,5,245,5_miles_mileage_my_plan,"[miles, mileage, my, plan, account, number, to...","[\nMileage Plan Account Management, , , , , , ...",[How long does it take for the miles to be cre...
7,6,241,6_bill_pay_my_payment,"[bill, pay, my, payment, it, auto, to, account...","[\n""Bill Payment Issues"", , , , , , , , , ]","[want to pay my bill online,, I'm trying to pa..."
8,7,233,7_luggage_bag_baggage_the,"[luggage, bag, baggage, the, to, in, and, on, ...",[\nLost Luggage and Hand Luggage Rules for Air...,"[Sorry, yet another question about hand luggag..."
9,8,220,8_internet_my_service_to,"[internet, my, service, to, is, phone, do, mod...","[\n""Internet Service Outages"", , , , , , , , , ]","[I can't get my internet., My internet is out,..."


In [4]:
#dfs_topic.to_pickle('embedding_L12_clustered_LLM.pkl')

pd.read_pickle("embedding_L12_clustered_LLM.pkl")["Zephyr"].unique()

array(['Flight nightmares: Qatar Airways, GOL, TAP, and Ebookers complaints',
       'CNAME Account Management Inquiries',
       'Air Travel Experiences: Budget Airlines, Missed Connections, Premium Economy, Customer Service, and Comfortable Seats',
       'Email/Account Login Issues',
       'Lost Luggage and Hand Luggage Rules for Airlines',
       'TV Troubleshooting: Cable Connection Issues & Online Streaming',
       'Bill Payment Issues', 'California Train Travel Inquiries',
       'Seat Assignments and Refusal to Switch',
       'Internet Service Outages',
       'Plane Reservations with Layovers and Delay Concerns',
       'Unreceived E-Tickets for Reservations',
       'Mileage Plan Account Management', 'Phone Number Inquiry',
       'Name Changes and Travel Documentation',
       'Flight Change Compensation Inquiries'], dtype=object)