# Library

In [1]:
import pandas as pd
import numpy as np
import gzip
import sys
sys.path.append('../ThreadAnalysis')
import PreProcessingText as ppt
from collections import Counter, defaultdict
import seaborn as sns
from wordcloud import WordCloud
import networkx as nx
import matplotlib.pyplot as plt
import squarify
from transformers import pipeline
from tqdm import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired, PartOfSpeech
from sklearn.feature_extraction.text import CountVectorizer
from keybert import KeyBERT
from umap import UMAP
from sklearn.decomposition import PCA
import hdbscan
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import csv
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.cluster import KMeans
from scipy.spatial import distance
from scipy.cluster import hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from matplotlib.colors import ListedColormap
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
from itertools import combinations

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dommy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dommy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dommy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


##

# Preprocess text to reduce time computing of SentenceTransformer

In [2]:
df = pd.read_csv('../merged_data.csv')
df = df.dropna(subset=['content'])
df = df.drop_duplicates(subset=['content'], keep='first')
df.shape[0]

270239

In [13]:
df['content'] = df['content'].apply(ppt.clean_sentences)

In [19]:
df.to_csv('content_clean_sentences.csv', index=False)
df_unique = pd.DataFrame(df['content'].unique())
df_unique.to_csv('PreProcessFiles/only_content_clean_sentences.csv', index=False, header=False)

In [20]:
df['content'] = df['content'].apply(ppt.preprocess_title)
df['content'] = df['content'].apply(ppt.remove_single_characters)
df.to_csv('PreProcessFiles/cleaned_data_name_content.csv', index=False)
df['content'].value_counts()

content
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

# Modelling

In [2]:
df = pd.read_csv('PreProcessFiles/cleaned_data_name_content.csv')
df = df.dropna(subset=['content'])
df = df.drop_duplicates(subset=['content'], keep='first')
df.shape[0]

261799

In [23]:
model = SentenceTransformer('all-MiniLM-L6-v2')
tc1 = ppt.TextClustering(df, 'content')
tc1.encode_corpus(model, batch_size=64, to_tensor=False)

2024-07-02 18:59:33,598 - PreProcessingText - INFO - Encoding the corpus. This might take a while.
Batches: 100%|██████████| 4079/4079 [2:52:54<00:00,  2.54s/it]  


array([[-0.05092696, -0.02883196,  0.03425763, ...,  0.00909158,
        -0.03866448, -0.04109244],
       [ 0.00060822,  0.06147481,  0.01782767, ..., -0.04401757,
        -0.02660749, -0.08558437],
       [-0.06397372, -0.040488  , -0.0253207 , ..., -0.05395703,
        -0.03535156,  0.05759883],
       ...,
       [-0.01766225, -0.01403396, -0.03574272, ..., -0.0159754 ,
         0.09160835,  0.05029548],
       [-0.13904728, -0.02408719, -0.02283162, ...,  0.04082799,
        -0.00130486,  0.04338759],
       [ 0.01306497,  0.04508202, -0.00048264, ..., -0.06037715,
         0.01438037, -0.04874933]], dtype=float32)

In [26]:
np.savez_compressed('PreProcessFiles/content_preprocessed_embeddings.npz', tc1.corpus_embeddings)
with gzip.open('PreProcessFiles/content_preprocessed_corpus.txt.gz', 'wt') as f:
    for word in tc1.corpus:
        f.write(word + '\n')

In [3]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [4]:
with np.load('PreProcessFiles/content_preprocessed_embeddings.npz') as data:
    embeddings = data['arr_0']

with gzip.open('PreProcessFiles/content_preprocessed_corpus.txt.gz', 'rt') as f:
    corpus = f.read().split('\n')

corpus.pop()

''

In [6]:
len(corpus), embeddings.shape

(260996, (260996, 384))

In [11]:
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=120, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(
    top_n_words=10, 
    n_gram_range=(1, 2),
    umap_model=umap_model, 
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model, 
    ctfidf_model=ctfidf_model, 
    representation_model=[mmr, kw],
    embedding_model=model,
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(corpus, embeddings)

2024-07-03 00:34:47,761 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-03 00:46:44,243 - BERTopic - Dimensionality - Completed ✓
2024-07-03 00:46:44,289 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-03 00:58:03,943 - BERTopic - Cluster - Completed ✓
2024-07-03 00:58:04,290 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-03 01:00:26,745 - BERTopic - Representation - Completed ✓


In [15]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(corpus, vectorizer_model=vectorizer_model)

In [12]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,126693,-1_scam_scammer_scamming_scammed,"[scam, scammer, scamming, scammed, read, legit...",[hello starter around year long enough spend g...
1,0,13906,0_parcel_postal_delivery_postage,"[parcel, postal, delivery, postage, shipment, ...",[would love get everyone opinion situation goo...
2,1,12754,1_qualitydescription_processed_cart_carts,"[qualitydescription, processed, cart, carts, p...",[dobymick blue dream hp gram general informati...
3,2,5150,2_deposit_depositing_transaction_withdraw,"[deposit, depositing, transaction, withdraw, p...",[generated btc deposit address sent btc said a...
4,3,4581,3_tailsdataunlocked_tails_tor_linux,"[tailsdataunlocked, tails, tor, linux, daemon,...",[forensic analyst attempt grab copy memory mac...
...,...,...,...,...,...
170,169,128,169_qps_qp_qpsp_hp,"[qps, qp, qpsp, hp, halfp, qwiso, qld, price, ...","[please put hp qp listing love try stuff, wond..."
171,170,127,170_oculus_oculusfoundation_oculusmadeitrain_o...,"[oculus, oculusfoundation, oculusmadeitrain, o...",[proud announce another oculus innovation also...
172,171,127,171_title_titletitle_titletitletitle_titleeeee...,"[title, titletitle, titletitletitle, titleeeee...","[title, title post properly title submission r..."
173,172,125,172_felon_gangster_drug_cartel,"[felon, gangster, drug, cartel, incarcerating,...",[ahh ok well start let start freshest im year ...


In [18]:
len(corpus)

260996

In [26]:
umap_embeddings = topic_model.umap_model.fit_transform(embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")

topic_words = topic_model.get_topics()
topics_ll = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]

silhouette_score: 0.5382845401763916
Davies_bouldin_score: 0.4535453977050335
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\dommy\miniconda3\envs\gestione\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\dommy\AppData\Local\Temp\ipykernel_11456\1650485521.py", line 19, in <module>
    texts=[doc.split() for doc in tc1.corpus],
                                  ^^^
NameError: name 'tc1' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\dommy\miniconda3\envs\gestione\Lib\site-packages\IPython\core\interactiveshell.py", line 2168, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dommy\miniconda3\envs\gestione\Lib\site-packages\IPython\core\ultratb.py", line 1454, in structured_traceback
    return FormattedTB.structured_traceback(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\

In [28]:
topn = 10
topic_list = []
for topic in topics_ll:
    topic_list.append([word for word, _ in topic[:topn]])

coherence_model = CoherenceModel(
    topics=topic_list,
    texts=[doc.split() for doc in corpus],
    dictionary=corpora.Dictionary([doc.split() for doc in corpus]),
    coherence='c_v'
)

print(f"Coherence Model: {coherence_model.get_coherence()}")

Coherence Model: 0.6992133725013738


In [29]:
new_topics = topic_model.reduce_outliers(corpus, topics, strategy="embeddings", embeddings=embeddings, threshold=0.45)
topic_model.update_topics(corpus, topics=new_topics)
topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,86296,-1_nt_like_vendor_get,"[nt, like, vendor, get, would, order, time, li...",[hello starter around year long enough spend g...
1,0,14183,0_order_pack_package_tracking,"[order, pack, package, tracking, day, shipped,...",[would love get everyone opinion situation goo...
2,1,12797,1_cart_strain_weed_bud,"[cart, strain, weed, bud, thc, price, quality,...",[dobymick blue dream hp gram general informati...
3,2,5252,2_deposit_ticket_address_deposited,"[deposit, ticket, address, deposited, btc, wal...",[generated btc deposit address sent btc said a...
4,3,4629,3_tor_vpn_tails_tail,"[tor, vpn, tails, tail, browser, use, network,...",[forensic analyst attempt grab copy memory mac...
...,...,...,...,...,...
170,169,140,169_qp_pound_lb_qps,"[qp, pound, lb, qps, hp, price, pound listing,...","[please put hp qp listing love try stuff, wond..."
171,170,129,170_oculus_cum_service_link oculus,"[oculus, cum, service, link oculus, oculuscum,...",[proud announce another oculus innovation also...
172,171,152,171_title_title say_title title_title post,"[title, title say, title title, title post, ti...","[title, title post properly title submission r..."
173,172,222,172_gun_drug_people_life,"[gun, drug, people, life, law, war, want, get,...",[ahh ok well start let start freshest im year ...


In [24]:
for topic_id in set(topics):
    print(f"Topic {topic_id}:")
    print(topic_model.get_topic(topic_id))

Topic 0:
[('order', 0.012230398910700494), ('pack', 0.010926149694411306), ('package', 0.009717355630684089), ('tracking', 0.00872143841139465), ('day', 0.008697903367101512), ('shipped', 0.007970952038091048), ('mail', 0.007591028342236136), ('ordered', 0.007209466890303761), ('week', 0.006676247641594445), ('delivery', 0.005835522175458815)]
Topic 1:
[('cart', 0.008569278374769541), ('strain', 0.007863303815897884), ('weed', 0.007785989330533793), ('bud', 0.006611821007435437), ('thc', 0.00550308415138524), ('price', 0.005489287491820493), ('quality', 0.005303084516353841), ('product', 0.005274480488836624), ('cannabis', 0.005171767864728047), ('shipping', 0.0049530983328878835)]
Topic 2:
[('deposit', 0.03949472793916253), ('ticket', 0.01712816725946162), ('address', 0.017034079574471877), ('deposited', 0.014848693146027754), ('btc', 0.01469166282116177), ('wallet', 0.013707725568431842), ('withdraw', 0.012806762676899822), ('withdrawal', 0.011433933301536562), ('support', 0.00936055

In [30]:
topic_model.visualize_topics()

In [32]:
umap_embeddings = topic_model.umap_model.fit_transform(embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")


topic_words = topic_model.get_topics()
topics_ll = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]

topn = 10
topic_list = []
for topic in topics_ll:
    topic_list.append([word for word, _ in topic[:topn]])

coherence_model = CoherenceModel(
    topics=topic_list,
    texts=[doc.split() for doc in corpus],
    dictionary=corpora.Dictionary([doc.split() for doc in corpus]),
    coherence='c_v'
)

print(f"Coherence Model: {coherence_model.get_coherence()}")

silhouette_score: 0.28787171840667725
Davies_bouldin_score: 1.880820756428959
Coherence Model: 0.6981655063757025


In [33]:
def calculate_dos(topic_words, top_n=10):
    """
    Calculate the average overlap score for all pairs of topics.
    :param topic_words: The topic words
    :param top_n: The number of words to consider for each topic
    :return: The average overlap score
    """
    overlap = 0
    num_combinations = 0
    for topic1, topic2 in combinations(topic_words.values(), 2):
        words1 = set([word for word, _ in topic1[:top_n]])
        words2 = set([word for word, _ in topic2[:top_n]])
        overlap += len(words1.intersection(words2))
        num_combinations += 1
    return overlap / num_combinations

In [34]:
dos_score = calculate_dos(topic_model.get_topics())
print(f"Distinta Overlap Score: {dos_score}")

Distinta Overlap Score: 0.12564860426929392


In [None]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [189]:
zero_shot_topics = pd.read_csv('../../../intent_crime.csv')['intent'].tolist()
dict_zero_shots_25 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.25)

Assigning labels to topics:   0%|          | 0/68 [00:00<?, ?it/s]

Assigning labels to topics: 100%|██████████| 68/68 [42:46<00:00, 37.74s/it] 


In [232]:
dict_zero_shots_25 = pd.read_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_150_20n/zero_shot_025.csv').set_index('Topic')['Labels'].to_dict()

In [None]:
dict_zero_shots_25[1] = 'ask help - ask help post'
dict_zero_shots_25[2] = 'order'
dict_zero_shots_25[10] = 'crosspost vendor'
dict_zero_shots_25[13] = 'dream market - dread'
dict_zero_shots_25[14] = 'ask help - moderator'
dict_zero_shots_25[15] = 'cocaine vendor - cocaine'
dict_zero_shots_25[20] = 'bunk bar'
dict_zero_shots_25[28] = 'wsm vendor - wsm market'
dict_zero_shots_25[39] = 'cryptonia market - dread'
dict_zero_shots_25[41] = 'escrow service'
dict_zero_shots_25[48] = 'event happened'
dict_zero_shots_25[51] = 'dream - dream vendor - dread'
dict_zero_shots_25[52] = 'order cancelled'
dict_zero_shots_25[57] = 'opsec questions'
dict_zero_shots_25[58] = 'link'
dict_zero_shots_25[59] = 'mirror - mirror link - working mirror'
dict_zero_shots_25[64] = 'capcha'

In [None]:
pd.DataFrame(list(dict_zero_shots_25.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_150_20n/zero_shot_025.csv', index=False)

In [235]:
topic_model.set_topic_labels(dict_zero_shots_25)

In [236]:
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, 
                          min_dist=0.0, metric='cosine').fit_transform(embeddings)
topic_model.visualize_documents(corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True, custom_labels=True)

In [35]:
topic_model.visualize_barchart(top_n_topics=250, custom_labels=True, n_words=10)

In [36]:
topic_model.visualize_hierarchy(custom_labels=True)

In [37]:
topic_model.visualize_heatmap(custom_labels=True)

In [239]:
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=100)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, width=1250, height=700, custom_labels=True)

15it [00:22,  1.52s/it]


In [240]:
df['name_thread'] = df['name_thread'].str.lower().dropna()
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)
created_on = df['created_on'].tolist()
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')
print(results_final.shape)
results_final.head()

(46629, 10)


Unnamed: 0,Document,Embedding,Topic,Probability,Created_on,Count,Name,CustomName,Representation,Representative_Docs
0,naturalmeds apollon,"[-0.11246682, -0.03748099, 0.04490077, 0.01724...",56,"[1.1964202652021368e-307, 6.589650224023648e-3...",2020-01-09,242,56_apollon_apollon market_market_apollon apollon,market - apollon - apollon market,"[apollon, apollon market, market, apollon apol...","[back apollon, apollon, apollon market]"
1,redemption btc giveaway coke sub dread member ...,"[-0.043700494, -0.032600075, 0.0051953266, 0.0...",24,"[0.006596099708564405, 0.003763170646085399, 0...",2020-01-09,544,24_coke_coke vendor_best coke_uk coke,cocaine,"[coke, coke vendor, best coke, uk coke, uk, go...","[fire coke, coke vendor , coke vendor]"
2,flubromazolam sample giveaway,"[-0.04101017, 0.007629349, -0.07528322, -0.033...",62,"[8.335712654832696e-308, 7.430095287264378e-30...",2019-11-06,290,62_alprazolam_powder_alprazolam powder_flualpr...,powder - alprazolam - alprazolam powder,"[alprazolam, powder, alprazolam powder, flualp...",[usa domestic alprazolam powder mxe apvp inbom...
3,cigarette tobacco replica,"[-0.07527819, 0.13146353, -0.07912154, -0.0353...",0,"[0.040079176553298505, 0.007976173889767435, 0...",2020-01-09,5021,0_weed_cannabis_cart_thc,marijuana,"[weed, cannabis, cart, thc, review, hash, shat...",[product vendor review ml lemon kush wax vape ...
4,requiring image image review,"[-0.014864997, 0.08211257, 0.004136639, 0.0027...",16,"[0.006753134223967767, 0.007309909359113744, 0...",2019-11-06,998,16_review_vendor review_review vendor_vendor,reviews vendor,"[review, vendor review, review vendor, vendor,...",[xpost danknation vendor review sunaero multis...


In [241]:
results_final.to_parquet('ResultsBERTopic/BERTopic_all-MiniLM-L6-v2_150_20n.parquet')

In [None]:
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_150_20n", serialization="pickle", save_ctfidf=True, save_embedding_model=model)

In [243]:
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_150_20n_safetensors", serialization="safetensors", save_ctfidf=True, save_embedding_model=model)

In [69]:
topic_model = BERTopic.load("Models/topic_model_all-MiniLM-L6-v2_150_20n")

In [244]:
sentence = ['recently closed Samsara market']
tp, pr = topic_model.transform(sentence)

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.00it/s]
2024-06-30 21:54:03,236 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-06-30 21:54:08,543 - BERTopic - Dimensionality - Completed ✓
2024-06-30 21:54:08,544 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-06-30 21:54:08,576 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2024-06-30 21:54:08,699 - BERTopic - Probabilities - Completed ✓
2024-06-30 21:54:08,701 - BERTopic - Cluster - Completed ✓


In [248]:
top_indices = np.argsort(pr[0])[::-1][:5]
top_topics = [(topic_model.get_topic(i), pr[0][i], topic_model.custom_labels_[i+1]) for i in top_indices]
df_finals = pd.DataFrame(top_topics, columns=['Topic', 'Probability', 'Label'])
df_finals['Words'] = df_finals['Topic'].apply(lambda topic: [word for word, prob in topic])
df_finals['Sentence'] = sentence * len(df_finals)
df_finals

Unnamed: 0,Topic,Probability,Label,Words,Sentence
0,"[(cryptonia, 0.270593329016721), (cryptonia ma...",0.002991,cryptonia market - dread,"[cryptonia, cryptonia market, cryptonia crypto...",recently closed Samsara market
1,"[(empire, 0.1408905571475779), (empire market,...",0.002262,empire market,"[empire, empire market, empire empire, market,...",recently closed Samsara market
2,"[(wallstreet, 0.17075397467717646), (wall, 0.1...",0.002127,wallstreet,"[wallstreet, wall, wall street, street, wall s...",recently closed Samsara market
3,"[(scammer, 0.09029491748260987), (scam, 0.0879...",0.002075,fraud - scammer,"[scammer, scam, exit, scamming, scammed, exit ...",recently closed Samsara market
4,"[(crosspost, 0.14470709618860572), (giveaway, ...",0.002045,crosspost vendor,"[crosspost, giveaway, review crosspost, crossp...",recently closed Samsara market


In [109]:
topic_model.get_topic(tp[0])

[('anyone', 0.009415205712082564),
 ('update', 0.007916840604830654),
 ('address', 0.006939392479966835),
 ('new', 0.006268540212828576),
 ('get', 0.006179372827051399),
 ('vacation', 0.0059803996923821),
 ('has', 0.0058347636749867746),
 ('need', 0.005401699096715211),
 ('drop', 0.005394598005495695),
 ('listing', 0.005367223382048893)]