# Basic

In [5]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import joblib
from typing import List, Dict


# Comprehensiveness Test

In [2]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

import re

def clean_text(text):
    # Remove escape sequences
    text = re.sub(r'\\[a-zA-Z]', ' ', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', ' ', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

# Example usage:
text = "Hello, this is an example text with an email address john@example.com. \nPlease remove punctuation and escape sequences, including \\n."
cleaned_text = clean_text(text)
print(cleaned_text)


from tqdm import tqdm

docs = []
for text in tqdm(newsgroups_train["data"]):
  docs.append(clean_text(text))

Hello this is an example text with an email address Please remove punctuation and escape sequences including


100%|██████████| 11314/11314 [00:00<00:00, 12594.37it/s]


In [3]:
docs = []
count = 0
for text in tqdm(newsgroups_train["data"]):
  count+=1
  docs.append(clean_text(text) + f" c_{count}")

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()



100%|██████████| 11314/11314 [00:00<00:00, 12252.19it/s]


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3577,-1_the_to_of_for,"[the, to, of, for, and, in, from, it, is, you]",[From Michael Siemon Subject Re Christian meta...
1,0,507,0_he_year_baseball_team,"[he, year, baseball, team, game, runs, braves,...",[From Bruce DoppleAckers Anonymous Hasch Subje...
2,1,311,1_clipper_encryption_key_chip,"[clipper, encryption, key, chip, escrow, keys,...",[From Clipper Chip Announcement Subject text o...
3,2,282,2_mail_wiring_list_wire,"[mail, wiring, list, wire, keyboard, widget, f...",[From Subject Electrical wiring FAQ was A ques...
4,3,222,3_team_game_flyers_leafs,"[team, game, flyers, leafs, he, play, hockey, ...",[From peter r clark jr Subject FLYERS notes 4 ...
...,...,...,...,...,...
220,219,11,219_contradictions_books_list_archer,"[contradictions, books, list, archer, biblical...",[From Brian Kendig Subject Re The list of Bibl...
221,220,11,220_meter_fluke_dmm_beckman,"[meter, fluke, dmm, beckman, platinum, cheapie...",[From Peter Tryndoch Subject Dmm Advice Needed...
222,221,11,221_prayers_prayer_jesus_god,"[prayers, prayer, jesus, god, husband, pray, o...",[From Jayne Kulikauskas Subject Re Can sin blo...
223,222,10,222_god_jesus_law_jews,"[god, jesus, law, jews, malcolm, laws, chosen,...",[From Kent Sandvik Subject Re A KIND and LOVIN...


In [4]:
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)

In [5]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3731,-1_the_to_and_for,"[the, to, and, for, of, in, is, it, you, from]",[From David Davidian Subject Accounts of Anti ...
1,0,511,0_he_year_baseball_team,"[he, year, baseball, team, game, runs, braves,...",[Subject Re Eck vs Rickey was Re Rickey s whin...
2,1,362,1_gun_guns_firearms_weapons,"[gun, guns, firearms, weapons, militia, contro...",[From Dillon Pyron Subject Re My Gun is like m...
3,2,335,2_clipper_encryption_key_chip,"[clipper, encryption, key, chip, escrow, keys,...",[Subject text of White House announcement and ...
4,3,217,3_fbi_batf_fire_compound,"[fbi, batf, fire, compound, atf, waco, koresh,...",[From Jim De Arras Subject Re BATF FBI Murders...
...,...,...,...,...,...
216,215,11,215_lights_backing_dumbest_automotive,"[lights, backing, dumbest, automotive, reverse...",[From Charles Parr Subject Re Dumbest automoti...
217,216,11,216_witnesses_trial_gm_new,"[witnesses, trial, gm, new, judge, jury, justi...",[From Ted Frank Subject Re The state of justic...
218,217,10,217_wbt_justifiable_wycliffe_translators,"[wbt, justifiable, wycliffe, translators, war,...",[From catherine c bareiss Subject Re phone num...
219,218,10,218_fractal_fractals_compression_iterated,"[fractal, fractals, compression, iterated, com...",[From I Rachmat Subject Fractal compression Su...


In [4]:

def remove_word_from_list(word_to_remove, string_list):
    return [string.replace(word_to_remove, '') for string in string_list]

In [7]:
docs = []
count = 0
for text in tqdm(newsgroups_train["data"]):
  count+=1
  docs.append(clean_text(text) + f" c_{count}")

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()


100%|██████████| 11314/11314 [00:00<00:00, 11671.07it/s]


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4112,-1_the_to_and_for,"[the, to, and, for, of, in, is, from, it, you]",[From Jonas Flygare Subject Re 18 Israelis mur...
1,0,321,0_clipper_encryption_key_chip,"[clipper, encryption, key, chip, escrow, keys,...",[From Clipper Chip Announcement Subject text o...
2,1,199,1_scsi_drive_ide_controller,"[scsi, drive, ide, controller, drives, disk, b...",[From Wayne Smith Subject Re IDE vs SCSI Organ...
3,2,196,2_fbi_batf_fire_compound,"[fbi, batf, fire, compound, atf, koresh, waco,...",[From Jim De Arras Subject Re BATF FBI Murders...
4,3,106,3_car_cars_integra_convertible,"[car, cars, integra, convertible, engine, wago...",[From Chuck Kesler Subject Re Ford Probe Opini...
...,...,...,...,...,...
232,231,10,231_xman_source_9000_xrn,"[xman, source, 9000, xrn, rao, hp, xarchie, ba...",[From Chun Hung Lin Subject Re xman source Nnt...
233,232,10,232_nhl_stars_hockey_halifax,"[nhl, stars, hockey, halifax, names, team, div...",[From Karim Edvard Ahmed Subject Re Truly a sa...
234,233,10,233_wbt_wycliffe_ideological_translators,"[wbt, wycliffe, ideological, translators, deno...",[From catherine c bareiss Subject Re phone num...
235,234,10,234_ir_cycle_xxxxx_wazing,"[ir, cycle, xxxxx, wazing, emitting, detector,...",[From David Prutchi Subject Re Long distance I...


In [8]:
df = pd.DataFrame({"Document": docs, "Topic": topics})
df.head()

Unnamed: 0,Document,Topic
0,From where s my thing Subject WHAT car is this...,235
1,From Guy Kuo Subject SI Clock Poll Final Call ...,205
2,From Thomas E Willis Subject PB questions Orga...,-1
3,From Joe Green Subject Re Weitek P9000 Organiz...,-1
4,From Jonathan McDowell Subject Re Shuttle Laun...,-1


In [30]:
import pandas as pd

def compare_topics(df1, df2,topic_num):
    ## 1/ Count the number of elements that changed between the two dataframes in Topic column
    # Select only the "Topic" column from each dataframe
    topics1 = df1["Topic"]
    topics2 = df2["Topic"]
    # Compare the two columns and count the number of changes
    changes = (topics1 != topics2).sum()
    # Count the number of elements that remained the same
    same = (topics1 == topics2).sum()

    ## 2/ Count the number of elements that changed to -1 or noise from "topic_num" topic. 
    # Find rows where "Topic" changed from a non-negative value to -1
    changed_rows = (df1['Topic'] == topic_num) & (df2['Topic'] == -1)
    changed_rows_2 = (df1['Topic'] >= 0) & (df2['Topic'] == -1)
    # Extract the rows that satisfy the condition
    changed_rows_df = df1[changed_rows]
    changed_rows_df_2 = df1[changed_rows_2]
    # Get the number of rows that changed
    top2noise = len(changed_rows_df)
    all2noise = len(changed_rows_df_2)

    ## 3/ Check number of changes in topic constricted to "topic_num" topic. 
    # Select rows in df1 where "Topic" is equal to the given topic
    rows_with_given_topic_df1 = df1[df1['Topic'] == topic_num]
    # Find the corresponding rows in df2
    corresponding_rows_df2 = df2.loc[rows_with_given_topic_df1.index]
    # Count the number of rows where the "Topic" value changed
    num_changed_rows = (rows_with_given_topic_df1['Topic'] != corresponding_rows_df2['Topic']).sum()
    # Count the number of rows where the "Topic" value remained the same
    num_same_rows = (rows_with_given_topic_df1['Topic'] == corresponding_rows_df2['Topic']).sum()


    results = {
        "total_changes" : changes, 
        "total_same" : same,
        "topic_to_noise" : top2noise,
        "all_to_noise" : all2noise,
        "topic_change" : num_changed_rows,
        "topic_same" : num_same_rows,
    }

    return results


df1 = pd.DataFrame({"Topic" : [1, 2, 3, 4, 5]})
df2 = pd.DataFrame({"Topic" : [-1, 2, 3, 4, -1]})

compare_topics(df1, df2, 5)




{'total_changes': 2,
 'total_same': 3,
 'topic_to_noise': 1,
 'all_to_noise': 2,
 'topic_change': 1,
 'topic_same': 0}

In [15]:


def remove_word_from_list(word_to_remove, string_list):
    return [string.replace(word_to_remove, '') for string in string_list]

def check_topic_for_comprehensiveness_fit( docs: List[str], k: int) -> Dict[str, str]:
    """
    Take as input a list of topics and the initial documents, perturbs the documents
    by removing one topic word after another and repeats the modeling to find if the
    topic changes.
    """
    ablation_mappings = {}
    anchor_topic_model = BERTopic()
    topics, probs = anchor_topic_model.fit_transform(docs)
    topic_list = anchor_topic_model.get_topic_info()["Representation"]

    # forming doc -> topic pairing
    df_basic_mapping = pd.DataFrame({"Document": docs, "Topic": topics})


    for word in tqdm(topic_list[k+1]):
        new_docs = remove_word_from_list(word, docs)
        new_topics, probs = anchor_topic_model.transform(new_docs)
        df_new_mapping = pd.DataFrame({"Document": docs, "Topic": new_topics})

        ablation_mappings[word] =  compare_topics(df_basic_mapping, df_new_mapping,k)

    return ablation_mappings

def save_topic_for_comprehensiveness_fit( docs: List[str], k: int) -> pd.DataFrame:
    """
    Take as input a list of topics and the initial documents, perturbs the documents
    by removing one topic word after another and repeats the modeling to find if the
    topic changes.
    """
    ablation_mappings = {}
    anchor_topic_model = BERTopic()
    topics, probs = anchor_topic_model.fit_transform(docs)
    topic_list = anchor_topic_model.get_topic_info()#["Representation"]
    c_tf_idf_mappings = anchor_topic_model.topic_representations_

    # forming doc -> topic pairing
    df_basic_mapping = pd.DataFrame({"Document": docs, "Topic": topics})


    for word in tqdm(topic_list[k+1]):
        new_docs = remove_word_from_list(word, docs)
        new_topics, probs = anchor_topic_model.transform(new_docs)
        df_new_mapping = pd.DataFrame({"Document": docs, "Topic": new_topics})

        ablation_mappings[word] =  df_new_mapping

    return ablation_mappings,c_tf_idf_mappings,df_basic_mapping,topic_list

def save_topic_for_comprehensiveness_fit_cumulative(docs: List[str], k: int) -> pd.DataFrame:
    """
    Take as input a list of topics and the initial documents, perturbs the documents
    by removing one topic word after another and repeats the modeling to find if the
    topic changes.
    """
    ablation_mappings = {}
    anchor_topic_model = BERTopic()
    topics, probs = anchor_topic_model.fit_transform(docs)
    topic_list = anchor_topic_model.get_topic_info()["Representation"]
    c_tf_idf_mappings = anchor_topic_model.topic_representations_

    # forming doc -> topic pairing
    df_basic_mapping = pd.DataFrame({"Document": docs, "Topic": topics})

    new_docs = docs # Initialize new_docs with the original documents

    for  word in tqdm(topic_list[k+1]): 
        new_docs = remove_word_from_list(word, new_docs)

        new_topics, probs = anchor_topic_model.transform(new_docs)
        df_new_mapping = pd.DataFrame({"Document": new_docs, "Topic": new_topics})

        ablation_mappings[word] = df_new_mapping

    return ablation_mappings, c_tf_idf_mappings, df_basic_mapping, topic_list

In [16]:
ablation_mappings_cumulative,c_tf_idf_mappings,df_basic_mapping,topic_list = save_topic_for_comprehensiveness_fit_cumulative(docs,2)

100%|██████████| 10/10 [02:04<00:00, 12.45s/it]


In [21]:
topic_list

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3679,-1_to_the_and_of,"[to, the, and, of, for, in, is, it, you, from]",[From Brian Ceccarelli 602 621 9615 Subject Re...
1,0,511,0_he_year_baseball_team,"[he, year, baseball, team, game, runs, braves,...",[Subject Re Eck vs Rickey was Re Rickey s whin...
2,1,289,1_clipper_key_chip_encryption,"[clipper, key, chip, encryption, escrow, keys,...",[From Pat Myrto Subject Re Once tapped your co...
3,2,240,2_gun_guns_firearms_militia,"[gun, guns, firearms, militia, weapons, amendm...",[From C D Tavares Subject Re My Gun is like my...
4,3,195,3_tobacco_health_mail_address,"[tobacco, health, mail, address, smokeless, ni...",[Subject Space FAQ 02 15 Network Resources Fro...
...,...,...,...,...,...
221,220,10,220_software_level_process_challenger,"[software, level, process, challenger, shuttle...",[From fred j mccall 575 3539 Subject Re Level ...
222,221,10,221_solar_sail_sails_accelerations,"[solar, sail, sails, accelerations, 45g, proje...",[From Bill Higgins Beam Jockey Subject Re Sola...
223,222,10,222_server_x11_xwd_xdm,"[server, x11, xwd, xdm, usr, pid, sigpipe, ram...",[From S Ramakrishnan Subject Re Mwm title drag...
224,223,10,223_appcontext_application_callback_huub,"[appcontext, application, callback, huub, xtap...",[From Patrick L Mahan Subject Re How do I find...


In [22]:
df_basic_mapping

Unnamed: 0,Document,Topic
0,From where s my thing Subject WHAT car is this...,-1
1,From Guy Kuo Subject SI Clock Poll Final Call ...,193
2,From Thomas E Willis Subject PB questions Orga...,20
3,From Joe Green Subject Re Weitek P9000 Organiz...,-1
4,From Jonathan McDowell Subject Re Shuttle Laun...,220
...,...,...
11309,From Jim Zisfein Subject Re Migraines and scan...,17
11310,From Subject Screen Death Mac Plus 512 Lines 2...,54
11311,From Will Estes Subject Mounting CPU Cooler in...,-1
11312,From Steven Collins Subject Re Sphere from 4 p...,10


In [24]:
topic_list.to_csv("/home/abpal/WorkFiles/Results/Comprehensiveness_Raw/Base_Results/base.csv")

In [25]:
df_basic_mapping.to_csv("/home/abpal/WorkFiles/Results/Comprehensiveness_Raw/Base_Results/df_basic_mapping.csv")

In [27]:
import json

# Specify the file path where you want to save the JSON file
file_path = "/home/abpal/WorkFiles/Results/Comprehensiveness_Raw/Base_Results/c_tf_idf_mappings.json"

# Open the file in write mode and use json.dump to write the dictionary to the file
with open(file_path, 'w') as json_file:
    json.dump(c_tf_idf_mappings, json_file)



# Centroid Test

In [11]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

# Two lists of sentences
sentences1 = ['The cat sits outside',
             'A man is playing guitar',
             'The new movie is awesome']

sentences2 = ['The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']

#Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

#Compute cosine-similarities
cosine_scores = util.cos_sim(embeddings1, embeddings2)

#Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))

The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327
The new movie is awesome 		 The new movie is so great 		 Score: 0.8939


In [12]:
# distance of each sentence from the centroid
centroid = embeddings1.mean(axis=0)
for i in range(len(embeddings1)):
    centriod_dist = util.cos_sim(centroid,embeddings1[i])
    print(centriod_dist)

tensor([[0.5800]], device='cuda:0')
tensor([[0.5995]], device='cuda:0')
tensor([[0.5645]], device='cuda:0')


In [15]:
import torch

tensor = torch.randn(3, 2)
row_to_remove = 0

new_tensor = torch.cat((tensor[:row_to_remove], tensor[row_to_remove+1:]))


In [16]:
import numpy as np 

# movement of centroid for the absence of each sentence
centroid_base = embeddings1.mean(axis=0).cpu()
for i in range(len(embeddings1)):
    embeddings_new = torch.cat((embeddings1[:i], embeddings1[i+1:]))
    new_centroid = embeddings_new.mean(axis=0).cpu()
    centriod_movt =  np.linalg.norm(centroid_base - new_centroid)

    print(centriod_movt)

0.40729418
0.40028247
0.41278866


In [None]:
import torch

def centroid_tests(topic_list):
    centroid_distance = {}
    centroid_movement = {}

    #find embeddings
    embeddings = model.encode(topic_list, convert_to_tensor=True)
    centroid = embeddings.mean(axis=0)

    # distance of each sentence from the centroid
    for i in range(len(embeddings)):
        centriod_dist = util.cos_sim(centroid,embeddings[i])
        centroid_distance[topic_list[i]] = float(centriod_dist[0,0])
    
    # movement of centroid for the absence of each sentence
    for i in range(len(embeddings)):
        embeddings_new = torch.cat((embeddings[:i], embeddings[i+1:]))
        new_centroid = embeddings_new.mean(axis=0).cpu()
        centriod_movt =  np.linalg.norm(centroid.cpu() - new_centroid)
        centroid_movement[topic_list[i]] = float(centriod_movt)

    return {"centroid_distance":centroid_distance , "centroid_movement":centroid_movement}
        
# centroid_tests(list(ablation_mappings.keys()))

# Comprehensiveness + Centroid Test : 

In [None]:
k = 200
ablation_top_k_topics = {}
centroid_test_results = {}

for topic_i in tqdm(range(100,k)):
    ablation_top_k_topics[f"Topic_{topic_i}"] = check_topic_for_comprehensiveness_fit(docs,topic_i)
print("========Comprehensiveness Ablation Tests done========")

In [None]:
k = 100
ablation_top_k_topics_RAW = {}
centroid_test_results = {}

for topic_i in tqdm(range(k)):
    ablation_top_k_topics_RAW[f"Topic_{topic_i}"] = save_topic_for_comprehensiveness_fit(docs,topic_i)
print("========Comprehensiveness Ablation Tests done========")

In [17]:
k = 20
ablation_top_k_topics_RAW = {}
centroid_test_results = {}

for topic_i in tqdm(range(k)):
    ablation_top_k_topics_RAW[f"Topic_{topic_i}"],c_tf_idf_mappings,df_basic_mapping,topic_list = save_topic_for_comprehensiveness_fit_cumulative(docs,topic_i)
print("========Comprehensiveness Ablation Tests done========")

100%|██████████| 10/10 [01:57<00:00, 11.71s/it]
100%|██████████| 10/10 [01:56<00:00, 11.66s/it]
100%|██████████| 10/10 [01:58<00:00, 11.82s/it]
100%|██████████| 10/10 [01:58<00:00, 11.86s/it]
100%|██████████| 10/10 [01:58<00:00, 11.80s/it]
100%|██████████| 10/10 [01:59<00:00, 11.94s/it]
100%|██████████| 10/10 [01:59<00:00, 11.93s/it]
100%|██████████| 10/10 [01:59<00:00, 11.96s/it]
100%|██████████| 10/10 [02:00<00:00, 12.05s/it]
100%|██████████| 10/10 [01:59<00:00, 11.98s/it]
100%|██████████| 10/10 [01:59<00:00, 11.97s/it]]
100%|██████████| 10/10 [02:01<00:00, 12.10s/it]]
100%|██████████| 10/10 [01:59<00:00, 11.99s/it]]
100%|██████████| 10/10 [02:01<00:00, 12.13s/it]]
100%|██████████| 10/10 [02:00<00:00, 12.04s/it]]
100%|██████████| 10/10 [02:01<00:00, 12.12s/it]]
100%|██████████| 10/10 [02:02<00:00, 12.23s/it]]
100%|██████████| 10/10 [02:02<00:00, 12.24s/it]]
100%|██████████| 10/10 [02:00<00:00, 12.09s/it]]
100%|██████████| 10/10 [02:01<00:00, 12.10s/it]]
100%|██████████| 20/20 [45:19<






In [39]:
topic_list

0         [the, to, and, for, of, in, is, from, it, you]
1      [gun, guns, firearms, weapons, militia, contro...
2      [clipper, encryption, key, chip, escrow, keys,...
3      [nist, address, mail, ncsl, japanese, computer...
4      [moon, lunar, billion, space, mining, prize, h...
                             ...                        
237    [diamond, cursor, lindbergh, 1024x768x256, dri...
238    [aids, god, punishment, sin, jesus, atonement,...
239    [eisa, isa, motherboard, bus, 16mb, 486, schau...
240    [analog, mf, sampling, digital, 5v, signal, sw...
241    [book, scrolls, butera, sea, books, religions,...
Name: Representation, Length: 242, dtype: object

In [19]:
ablation_top_k_topics_RAW["Topic_0"]["algorithm"]

Unnamed: 0,Document,Topic
0,From where s my thing Subject WHAT car is this...,5
1,From Guy Kuo Subject SI Clock Poll Final Call ...,-1
2,From Thomas E Willis Subject PB questions Orga...,-1
3,From Joe Green Subject Re Weitek P9000 Organiz...,-1
4,From Jonathan McDowell Subject Re Shuttle Laun...,-1
...,...,...
11309,From Jim Zisfein Subject Re Migraines and scan...,6
11310,From Subject Screen Death Mac Plus 512 Lines 2...,-1
11311,From Will Estes Subject Mounting CPU Cooler in...,-1
11312,From Steven Collins Subject Re Sphere from 4 p...,78


In [21]:
import os 
def save_comprehensiveness_raw(data,path):
    for topic in range(20):
        top_data = data[f"Topic_{topic}"]
        try: 
            os.mkdir(path+f"/Topic_{topic}") 
        except : 
            pass
        for word in top_data.keys():
            top_data[word].to_csv(path_or_buf=path+f"/Topic_{topic}/{word}.csv",
                                  columns=["Topic"],
                                  index=False,
                                  header=False
                                  )

save_comprehensiveness_raw(ablation_top_k_topics_RAW,"/home/abpal/WorkFiles/Results/Comprehensiveness_Raw_Cumulative")

In [22]:
import os
import pandas as pd

def load_comprehensiveness_raw(path):
    loaded_data = {}
    for topic in range(100):
        topic_path = os.path.join(path, f"Topic_{topic}")
        if os.path.exists(topic_path):
            topic_data = {}
            for file_name in os.listdir(topic_path):
                word = os.path.splitext(file_name)[0]
                file_path = os.path.join(topic_path, file_name)
                topic_df = pd.read_csv(file_path, names=["Topic"])
                topic_data[word] = topic_df
            loaded_data[f"Topic_{topic}"] = topic_data
    return loaded_data

# Example usage:
loaded_data = load_comprehensiveness_raw("/home/abpal/WorkFiles/Results/Comprehensiveness_Raw_Cumulative")


In [24]:
loaded_data["Topic_0"]["algorithm"]

Unnamed: 0,Topic
0,5
1,-1
2,-1
3,-1
4,-1
...,...
11309,6
11310,-1
11311,-1
11312,78


In [37]:
df_basic_mapping 

results = {}

for topic_i in loaded_data.keys():
    results[topic_i] = {}
    for words in loaded_data[topic_i].keys():
        results[topic_i][words] = compare_topics(df_basic_mapping,loaded_data[topic_i][words],int(topic_i.split("_")[-1]))

In [38]:
results["Topic_0"]

{'chip': {'total_changes': 8075,
  'total_same': 3239,
  'topic_to_noise': 179,
  'all_to_noise': 1934,
  'topic_change': 369,
  'topic_same': 0},
 'key': {'total_changes': 8104,
  'total_same': 3210,
  'topic_to_noise': 163,
  'all_to_noise': 1862,
  'topic_change': 369,
  'topic_same': 0},
 'encryption': {'total_changes': 8074,
  'total_same': 3240,
  'topic_to_noise': 163,
  'all_to_noise': 1798,
  'topic_change': 369,
  'topic_same': 0},
 'nsa': {'total_changes': 8069,
  'total_same': 3245,
  'topic_to_noise': 185,
  'all_to_noise': 1872,
  'topic_change': 369,
  'topic_same': 0},
 'keys': {'total_changes': 8097,
  'total_same': 3217,
  'topic_to_noise': 176,
  'all_to_noise': 1883,
  'topic_change': 369,
  'topic_same': 0},
 'escrow': {'total_changes': 8104,
  'total_same': 3210,
  'topic_to_noise': 170,
  'all_to_noise': 1895,
  'topic_change': 369,
  'topic_same': 0},
 'government': {'total_changes': 8079,
  'total_same': 3235,
  'topic_to_noise': 178,
  'all_to_noise': 1984,
  

In [41]:
import json
import numpy as np

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)
    
# Specify the file path where you want to save the JSON file
file_path = "/home/abpal/WorkFiles/Results/Comprehensiveness_Raw_Cumulative/Stats/results.json"

# Open the file in write mode and use json.dump to write the dictionary to the file
with open(file_path, 'w') as json_file:
    json.dump(results, json_file,cls=NpEncoder)



# Understand Stochasticity

In [34]:
anchor_topic_model = BERTopic()
topics, probs = anchor_topic_model.fit_transform(docs)
topic_list = anchor_topic_model.get_topic_info()["Representation"]

# forming doc -> topic pairing
df_basic_mapping_1 = pd.DataFrame({"Document": docs, "Topic": topics})
df_basic_mapping_1

Unnamed: 0,Document,Topic
0,From where s my thing Subject WHAT car is this...,6
1,From Guy Kuo Subject SI Clock Poll Final Call ...,204
2,From Thomas E Willis Subject PB questions Orga...,-1
3,From Joe Green Subject Re Weitek P9000 Organiz...,-1
4,From Jonathan McDowell Subject Re Shuttle Laun...,199
...,...,...
11309,From Jim Zisfein Subject Re Migraines and scan...,66
11310,From Subject Screen Death Mac Plus 512 Lines 2...,106
11311,From Will Estes Subject Mounting CPU Cooler in...,-1
11312,From Steven Collins Subject Re Sphere from 4 p...,11


In [20]:
topics[:5]

[-1, 178, 18, -1, 184]

In [35]:
anchor_topic_model = BERTopic()
topics, probs = anchor_topic_model.fit_transform(docs)
topic_list = anchor_topic_model.get_topic_info()["Representation"]

# forming doc -> topic pairing
df_basic_mapping_2 = pd.DataFrame({"Document": docs, "Topic": topics})
df_basic_mapping_2

Unnamed: 0,Document,Topic
0,From where s my thing Subject WHAT car is this...,5
1,From Guy Kuo Subject SI Clock Poll Final Call ...,189
2,From Thomas E Willis Subject PB questions Orga...,19
3,From Joe Green Subject Re Weitek P9000 Organiz...,-1
4,From Jonathan McDowell Subject Re Shuttle Laun...,-1
...,...,...
11309,From Jim Zisfein Subject Re Migraines and scan...,197
11310,From Subject Screen Death Mac Plus 512 Lines 2...,87
11311,From Will Estes Subject Mounting CPU Cooler in...,211
11312,From Steven Collins Subject Re Sphere from 4 p...,11


In [36]:
count_common_buckets(df_basic_mapping_1["Topic"], df_basic_mapping_2["Topic"])

278

In [23]:
compare_topics(df_basic_mapping_1, df_basic_mapping_2,6)

{'total_changes': 0,
 'total_same': 11314,
 'topic_to_noise': 0,
 'all_to_noise': 0,
 'topic_change': 0,
 'topic_same': 101}

In [30]:
def count_common_buckets(list1, list2):
    # Create dictionaries to store buckets for each list
    buckets1 = {}
    buckets2 = {}

    # Iterate through the lists to form buckets
    for index, (num1, num2) in enumerate(zip(list1, list2)):
        if num1 != -1:
            if num1 not in buckets1:
                buckets1[num1] = []
            buckets1[num1].append(index)

        if num2 != -1:
            if num2 not in buckets2:
                buckets2[num2] = []
            buckets2[num2].append(index)

    # Compare the similarity of bucket structures
    common_buckets_count = 0
    for key1 in buckets1:
        for key2 in buckets2:
            common_indices = set(buckets1[key1]) & set(buckets2[key2])
            if common_indices:
                common_buckets_count += 1

    return common_buckets_count

# Example usage:
list1 = [1,2,3,4,1,2,3,4]
list2 = [2,3,4,1,2,3,4,1]

common_buckets = count_common_buckets(list1, list2)

print("Common Buckets:", common_buckets)


Common Buckets: 4


In [39]:
def run_stochasticity_count(runs: int):
    counts = []
    for i in tqdm(range(runs)):
        anchor_topic_model = BERTopic()
        topics, probs = anchor_topic_model.fit_transform(docs)
        topic_list = anchor_topic_model.get_topic_info()["Representation"]

        # forming doc -> topic pairing
        df_basic_mapping_1 = pd.DataFrame({"Document": docs, "Topic": topics})

        anchor_topic_model = BERTopic()
        topics, probs = anchor_topic_model.fit_transform(docs)
        topic_list = anchor_topic_model.get_topic_info()["Representation"]

        # forming doc -> topic pairing
        df_basic_mapping_2 = pd.DataFrame({"Document": docs, "Topic": topics})

        counts.append(count_common_buckets(df_basic_mapping_1["Topic"], df_basic_mapping_2["Topic"]))

    return counts 

run_stochasticity_count(
    runs = 5
)

[292, 285, 292, 284, 291]

# Run centroid tests, save etc

In [77]:
import numpy as np 
import torch
for topic_i in tqdm(range(100,k)):
    centroid_test_results[f"Topic_{topic_i}"] = centroid_tests(list(ablation_top_k_topics[f"Topic_{topic_i}"].keys()))
print("========Centroid Tests done========")

100%|██████████| 100/100 [00:00<00:00, 181.88it/s]






In [25]:
import json
import numpy as np

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)


with open('Results/Ablation_Final/ablation_RAW_100.json', 'w') as fp:
    json.dump(ablation_top_k_topics_RAW, fp,cls=NpEncoder)

# with open('Results/Ablation_Final/centroid_result_200.json', 'w') as fp:
    # json.dump(centroid_test_results, fp,cls=NpEncoder)

In [None]:
while True:
    pass

# c-tf-idf numbers and rank correlation

In [42]:
anchor_topic_model = BERTopic()
topics, probs = anchor_topic_model.fit_transform(docs)
topic_list = anchor_topic_model.get_topic_info()["Representation"]

# forming doc -> topic pairing
df_basic_mapping_ = pd.DataFrame({"Document": docs, "Topic": topics})

In [45]:
len(set(topics))

245

In [56]:
anchor_topic_model.topic_representations_

{-1: [('the', 0.004001173393700668),
  ('to', 0.0038867828699836234),
  ('for', 0.0037828558747706113),
  ('of', 0.0037471069328812957),
  ('and', 0.0037246135155925863),
  ('in', 0.0036972309257993475),
  ('is', 0.003580020514716617),
  ('from', 0.003566082473923622),
  ('it', 0.003534473143340908),
  ('you', 0.0035188762860559066)],
 0: [('clipper', 0.017168714838490067),
  ('key', 0.015362246416456277),
  ('encryption', 0.01518368823616912),
  ('chip', 0.015018226152477948),
  ('escrow', 0.010645646121140998),
  ('keys', 0.00945704480498497),
  ('government', 0.008645721787624669),
  ('crypto', 0.007864839285614805),
  ('secure', 0.007650991382692927),
  ('nsa', 0.00749946145173624)],
 1: [('keyboard', 0.00977220058729129),
  ('mail', 0.007593759020955795),
  ('health', 0.007308016243972676),
  ('tobacco', 0.0070277384452378765),
  ('typing', 0.0062473224906844085),
  ('address', 0.006163866360147878),
  ('rsi', 0.0061630756713715405),
  ('nist', 0.005780655340203438),
  ('list', 0.

In [None]:
import scipy.stats

# Example data
x = [25, 15, 10, 5, 30]
y = [5, 10, 15, 25, 30]

# Calculate Spearman's rank correlation coefficient
rho, p_value = scipy.stats.spearmanr(x, y)

# Print the result
print(f"Spearman's rank correlation coefficient: {rho}")
print(f"P-value: {p_value}")

# Interpret the result
if p_value < 0.05:
    print("The correlation is statistically significant.")
else:
    print("The correlation is not statistically significant.")


# Sufficiency

In [None]:
def filter_documents(documents, allowed_words):
    filtered_documents = []

    for document in documents:
        filtered_document = []
        for word in document.split():
            if word in allowed_words:
                filtered_document.append(word)
        filtered_documents.append(" ".join(filtered_document))

    return filtered_documents

In [None]:

def check_topic_for_comprehensiveness_fit( docs: List[str], k: int) -> Dict[str, str]:
    """
    Take as input a list of topics and the initial documents, perturbs the documents
    by keeping only the topic words present in the top-k topics and see how many of the 
    documents fall into the same bin as before. 
    """
    ablation_mappings = {}
    anchor_topic_model = BERTopic()
    topics, probs = anchor_topic_model.fit_transform(docs)
    topic_list = anchor_topic_model.get_topic_info()["Representation"]

    # forming doc -> topic pairing
    df_basic_mapping = pd.DataFrame({"Document": docs, "Topic": topics})


    for topic in tqdm(topic_list[:k+1]):
        new_docs = remove_word_from_list(word, docs)
        new_topics, probs = anchor_topic_model.transform(new_docs)
        df_new_mapping = pd.DataFrame({"Document": docs, "Topic": new_topics})

        ablation_mappings[word] =  compare_topics(df_basic_mapping, df_new_mapping)

    return ablation_mappings
