In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/abpal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from sklearn.datasets import fetch_20newsgroups
docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
documents = docs
len(documents)

18846

In [3]:
documents[0]

"\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n"

In [3]:
def preprocess_data(documents):
 stop_words = stopwords.words("english")
 
 # Tokenize and remove stopwords
 texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in documents]
 
 return texts

processed_texts = preprocess_data(documents)


In [4]:
# Create Dictionary
id2word = corpora.Dictionary(processed_texts)
# Create Corpus
texts = processed_texts
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [5]:
# Set number of topics
num_topics = 100
# Build LDA model
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=42, passes=10, alpha="auto", per_word_topics=True)

In [6]:
# Print the keywords for each topic
# pprint(lda_model.show_topics(num_topics=100, num_words=10,formatted=False))

def parse_list_to_dict(data):
    result_dict = {}
    for item in data:
        index = item[0]
        word_list = dict(item[1])
        result_dict[index] = word_list
    return result_dict

dicts = parse_list_to_dict(lda_model.show_topics(num_topics=100, num_words=10,formatted=False))
dicts[0]

{'god': 0.040325914,
 'christian': 0.02874682,
 'religion': 0.027084064,
 'believe': 0.022303417,
 'christians': 0.01709233,
 'belief': 0.016534213,
 'religious': 0.016137004,
 'atheism': 0.015821923,
 'people': 0.015684381,
 'bible': 0.015400452}

In [20]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=id2word, coherence="c_v")
coherence_lda = coherence_model_lda.get_coherence()
print("Coherence Score: ", coherence_lda)

Coherence Score:  0.5776705610954899


In [14]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model,corpus=corpus,dictionary=id2word)

In [15]:
vis

In [7]:
x= lda_model.show_topics(num_topics=100, num_words=10,formatted=False)
x

[(0,
  [('god', 0.040325914),
   ('christian', 0.02874682),
   ('religion', 0.027084064),
   ('believe', 0.022303417),
   ('christians', 0.01709233),
   ('belief', 0.016534213),
   ('religious', 0.016137004),
   ('atheism', 0.015821923),
   ('people', 0.015684381),
   ('bible', 0.015400452)]),
 (1,
  [('book', 0.047033936),
   ('books', 0.025569022),
   ('history', 0.01792889),
   ('press', 0.010869997),
   ('first', 0.010809011),
   ('university', 0.009686387),
   ('professor', 0.008368503),
   ('general', 0.0081482865),
   ('published', 0.008079528),
   ('page', 0.0075855027)]),
 (2,
  [('space', 0.061670884),
   ('nasa', 0.02512824),
   ('earth', 0.023826081),
   ('launch', 0.016269704),
   ('orbit', 0.013425086),
   ('moon', 0.013304539),
   ('solar', 0.012509669),
   ('spacecraft', 0.011388142),
   ('satellite', 0.011144928),
   ('planet', 0.010979248)]),
 (3,
  [('went', 0.026444994),
   ('came', 0.02056145),
   ('back', 0.0201897),
   ('said', 0.01912465),
   ('one', 0.0187957),

In [11]:
x= lda_model.show_topics(num_topics=100, num_words=10,formatted=False)
# x= lda_model.show_topics()
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

#Below Code Prints Topics and Words
for topic,words in topics_words:
    print(str(topic)+ "::"+ str(words))
print()

#Below Code Prints Only Words 
for topic,words in topics_words:
    print(" ".join(words))

0::['god', 'christian', 'religion', 'believe', 'christians', 'belief', 'religious', 'atheism', 'people', 'bible']
1::['book', 'books', 'history', 'press', 'first', 'university', 'professor', 'general', 'published', 'page']
2::['space', 'nasa', 'earth', 'launch', 'orbit', 'moon', 'solar', 'spacecraft', 'satellite', 'planet']
3::['went', 'came', 'back', 'said', 'one', 'home', 'took', 'told', 'saw', 'started']
4::['printer', 'hp', 'paper', 'print', 'laser', 'fonts', 'printing', 'printers', 'ink', 'bob']
5::['procedure', 'catholics', 'latin', 'mass', 'doug', 'periods', 'sales', 'scan', 'wright', 'masses']
6::['xview', 'openwindows', 'japanese', 'lds', 'encourage', 'patch', 'pointer', 'shared', 'bugs', 'languages']
7::['anyone', 'thanks', 'would', 'know', 'please', 'help', 'like', 'could', 'looking', 'hi']
8::['government', 'war', 'rights', 'people', 'world', 'countries', 'human', 'power', 'political', 'country']
9::['edu', 'com', 'cs', 'apr', 'article', 'ca', 'david', 'subject', 'gov', 'ma

In [13]:
x=lda_model.show_topics()

twords={}
for topic,word in x:
    twords[topic]=re.sub('[^A-Za-z ]+', '', word)
print(twords)

{44: 'ax  max  pl  ei  giz  tm  di  wm  ey  tq', 65: 'northern  ne  muscle  triple  est  je  qu  programme  ce  le', 35: 'mk  ah  pl  di  mq  sl  mn  tm  wm  m', 42: 'echo  lc  cg  defined  define  ivf  tex  ar  dd  ted', 78: 'turkish  greek  turkey  greece  turks  ed  ottoman  population  genocide  istanbul', 14: 'would  money  make  much  cost  pay  use  one  could  private', 7: 'anyone  thanks  would  know  please  help  like  could  looking  hi', 54: 'get  one  like  good  got  back  bike  well  new  right', 69: 'one  would  people  think  even  like  see  say  may  way', 43: 'think  know  people  get  would  like  going  want  time  go'}


In [5]:
def run_lda(docs: list,repeats : int,savepath : str, topn : int) -> None:
    """Run LDA on the documents, and save the avg Coherence rating in the given path

    Args:
        docs (list): _description_
        repeats (int): _description_
        savepath (str): _description_
    """

    coherences = []
    for _ in range(repeats):
        processed_texts = preprocess_data(docs)

        # Create Dictionary
        id2word = corpora.Dictionary(processed_texts)
        # Create Corpus
        texts = processed_texts
        # Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]
    
        # Set number of topics
        num_topics = 100
        # Build LDA model
        lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=42, passes=10, alpha="auto", per_word_topics=True)

        coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=id2word, coherence="c_v", topn = topn) # types {'u_mass', 'c_v', 'c_uci', 'c_npmi'}
        coherence_lda = coherence_model_lda.get_coherence()
        coherences.append(coherence_lda)

    with open(savepath,"w") as sp:
        sp.write("".join(str(coherences)))
        sp.write(f"Mean : {np.mean(coherences)} STD : {np.std(coherences)}")
        

In [26]:
run_lda(documents,3,"final_result/LDA/LDA_Result_20ng.txt",topn=5)

In [27]:
def read_data(path):
    df = pd.read_csv(path)
    return df['text'].tolist()

docs = read_data("data/nyt2020.csv")
run_lda(docs,3,"final_result/LDA/LDA_Result_nyt.txt",5)

In [28]:
docs = read_data("data/wiki_en_10000.csv")
run_lda(docs,3,"final_result/LDA/LDA_Result_wiki.txt",5)

# Octis for NYT, Wiki

In [6]:
def save_topics_lda(docs):
    processed_texts = preprocess_data(docs)

    # Create Dictionary
    id2word = corpora.Dictionary(processed_texts)
    # Create Corpus
    texts = processed_texts
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # Set number of topics
    num_topics = 100
    # Build LDA model
    lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=42, passes=10, alpha="auto", per_word_topics=True)
    x= lda_model.show_topics(num_topics=100, num_words=10,formatted=False)
    # x= lda_model.show_topics()
    topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

    #Below Code Prints Topics and Words
    for topic,words in topics_words:
        print(str(topic)+ "::"+ str(words))
    print()

    #Below Code Prints Only Words 
    for topic,words in topics_words:
        print(" ".join(words))


In [22]:
nyt = list(pd.read_csv("data/nyt2020.csv")["text"])
wiki = list(pd.read_csv("data/wiki_en_10000.csv")["text"])

In [23]:
save_topics_lda(nyt)

0::['new', 'york', 'city', 'week', 'times', 'change', 'climate', 'baseball', 'homes', 'tales']
1::['lives', 'book', 'inside', 'books', 'read', 'pay', 'talk', 'center', 'listen', 'ask']
2::['party', 'voters', 'republican', 'despite', 'short', 'term', 'biggest', 'hold', 'democratic', 'pressure']
3::['came', 'harris', 'battle', 'six', 'holidays', 'kamala', 'wisconsin', 'field', 'threats', 'water']
4::['end', 'care', 'wave', 'fall', 'likely', 'access', 'generation', 'room', 'host', 'cancer']
5::['control', 'russia', 'south', 'military', 'victory', 'perfect', 'security', 'africa', 'amy', 'wide']
6::['future', 'service', 'variety', 'someone', 'forget', 'races', 'income', 'train', 'emily', 'friday']
7::['trump', 'president', 'election', 'republicans', 'administration', 'said', 'federal', 'democrats', 'vote', 'voting']
8::['british', 'street', 'park', 'wall', 'defense', 'asking', 'drawing', 'traditional', 'shootings', 'inspire']
9::['among', 'let', 'key', 'fear', 'remain', 'allies', 'many', 't

In [24]:
save_topics_lda(wiki)

0::['published', 'book', 'writers', 'books', 'russian', 'american', 'press', 'century', 'women', 'people']
1::['school', 'university', 'college', 'high', 'students', 'schools', 'new', 'state', 'building', 'education']
2::['river', 'county', 'lake', 'state', 'rivers', 'states', 'illinois', 'wisconsin', 'arkansas', 'united']
3::['acc', 'jeeves', 'beryl', 'antioxidant', 'difranco', 'ayckbourn', 'tae', 'jarvis', 'patel', 'antioxidants']
4::['color', 'strip', 'comic', 'red', 'paper', 'comics', 'artists', 'strips', 'colors', 'capp']
5::['germany', 'german', 'european', 'europe', 'republic', 'croatia', 'states', 'war', 'united', 'czech']
6::['apollo', 'space', 'earth', 'star', 'mission', 'moon', 'lunar', 'crew', 'nasa', 'stars']
7::['economic', 'world', 'million', 'country', 'economy', 'trade', 'largest', 'bank', 'industry', 'government']
8::['cay', 'bulgaria', 'bulgarian', 'azerbaijan', 'union', 'armenia', 'trade', 'armenian', 'russian', 'columbus']
9::['augustine', 'td', 'ax', 'fianna', 'fá

In [32]:
processed_texts = preprocess_data(docs)

# Create Dictionary
id2word = corpora.Dictionary(processed_texts)
# Create Corpus
texts = processed_texts
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# Set number of topics
num_topics = 100
# Build LDA model
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=42, passes=10, alpha="auto", per_word_topics=True)
x= lda_model.show_topics(num_topics=100, num_words=10,formatted=False)
# x= lda_model.show_topics()
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

In [65]:
topics_words[1][1][0]

'book'

In [16]:
get_document_topics = [lda_model.get_document_topics(item) for item in corpus]

In [55]:
get_document_topics[0]

[(7, 0.10452608),
 (28, 0.026481865),
 (34, 0.024805507),
 (43, 0.017719239),
 (69, 0.014550598),
 (76, 0.02607383),
 (96, 0.6881275)]

In [6]:
def get_max_topics(get_document_topics):
  max_topics = []

  for sublist in get_document_topics:
      max_topic = max(sublist, key=lambda x: x[1])
      max_topics.append(max_topic[0])

  return(max_topics)

# Ablations for LDA

In [7]:
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import PartOfSpeech,MaximalMarginalRelevance,KeyBERTInspired
from typing import List, Dict
from tqdm import tqdm
import os 
import json
from os.path import join
from src.utils import clean_dataset,list_to_dict,convert_ctfidf,NpEncoder

def remove_word_from_list(word_to_remove: str, string_list: List[str]) -> List[str]:
    """Removes a given word from the a List of Strings.

    Args:
        word_to_remove (str): The given word
        string_list (List[str]): The list of strings from which to remove the word.

    Returns:
        List[str]: The resulting list of strings.
    """
    return [string.replace(word_to_remove, "") for string in string_list]

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
get_document_topics = [lda_model.get_document_topics(item) for item in corpus]
doc2topic1 = get_max_topics(get_document_topics)
new_doc = remove_word_from_list("god",docs)

processed_texts_2 = preprocess_data(new_doc)
new_doc_bow = [id2word.doc2bow(text) for text in processed_texts_2]

get_document_topics2 = [lda_model.get_document_topics(item) for item in new_doc_bow]
doc2topic2 = get_max_topics(get_document_topics2)

In [None]:
df1 = pd.DataFrame({"Topic":doc2topic1})
df2 = pd.DataFrame({"Topic":doc2topic2})

In [8]:
def compare_topics(df1: pd.DataFrame, df2: pd.DataFrame, topic_num: int) -> Dict[str,int]:
    """Compare the Topic Columns in two Dataframes and return a results dict.

    Args:
        df1 (pd.DataFrame): Input Dataframe 1
        df2 (pd.DataFrame): Input Dataframe 2
        topic_num (int): The Topic Number for which we calculate the results.

    Returns:
        dict: A dictionary containing changes from df1 to df2 : 
                a. Total Changes : Total Documents that changed it's topic assignment.
                b. Total Same : Total Documents that remained in the same topics. 
                c. Topic to Noise : Total Documents in "topic_num" that changed to noise. 
                d. All to Noise : Total Documents that changed to noise. 
                e. Topic Change : Total Documents in "topic_num" that changed to some other topic. 
                f. Topic Same : Total Documents in "topic_num" that remained in the same topic. 
    """
    ## 1/ Count the number of elements that changed between the two dataframes in Topic column
    # Select only the "Topic" column from each dataframe
    topics1 = df1["Topic"]
    topics2 = df2["Topic"]
    # Compare the two columns and count the number of changes
    changes = (topics1 != topics2).sum()
    # Count the number of elements that remained the same
    same = (topics1 == topics2).sum()

    ## 2/ Count the number of elements that changed to -1 or noise from "topic_num" topic.
    # Find rows where "Topic" changed from a non-negative value to -1
    changed_rows = (df1["Topic"] == topic_num) & (df2["Topic"] == -1)
    changed_rows_2 = (df1["Topic"] >= 0) & (df2["Topic"] == -1)
    # Extract the rows that satisfy the condition
    changed_rows_df = df1[changed_rows]
    changed_rows_df_2 = df1[changed_rows_2]
    # Get the number of rows that changed
    top2noise = len(changed_rows_df)
    all2noise = len(changed_rows_df_2)

    ## 3/ Check number of changes in topic constricted to "topic_num" topic.
    # Select rows in df1 where "Topic" is equal to the given topic
    rows_with_given_topic_df1 = df1[df1["Topic"] == topic_num]
    # Find the corresponding rows in df2
    corresponding_rows_df2 = df2.loc[rows_with_given_topic_df1.index]
    # Count the number of rows where the "Topic" value changed
    num_changed_rows = (
        rows_with_given_topic_df1["Topic"] != corresponding_rows_df2["Topic"]
    ).sum()
    # Count the number of rows where the "Topic" value remained the same
    num_same_rows = (
        rows_with_given_topic_df1["Topic"] == corresponding_rows_df2["Topic"]
    ).sum()

    results = {
        "total_changes": changes,
        "total_same": same,
        "topic_to_noise": top2noise,
        "all_to_noise": all2noise,
        "topic_change": num_changed_rows,
        "topic_same": num_same_rows,
    }

    return results

In [49]:
compare_topics(df1,df2,1)

{'total_changes': 179,
 'total_same': 18667,
 'topic_to_noise': 0,
 'all_to_noise': 0,
 'topic_change': 2,
 'topic_same': 83}

In [7]:
def raw_comprehensiveness_checks_lda( docs: List[str], k: int) -> pd.DataFrame:
    """
    Take as input a list of topics and the initial documents, perturbs the documents
    by removing one topic word after another and repeats the modeling to find if the
    topic changes.
    """
    final_ablation_mappings = {}

    # Process the documents and train a LDA model
    processed_texts = preprocess_data(docs)

    # Create Dictionary
    id2word = corpora.Dictionary(processed_texts)
    # Create Corpus
    texts = processed_texts
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # Set number of topics
    num_topics = k
    # Build LDA model
    lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=42, passes=10, alpha="auto", per_word_topics=True)
    x = lda_model.show_topics(num_topics=100, num_words=10,formatted=False)
    xdict = parse_list_to_dict(x)

    topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

    for topic in tqdm(range(1,k)):
        ablation_mapping = {}
        for words in topics_words[topic][1]:
            get_document_topics = [lda_model.get_document_topics(item) for item in corpus]
            doc2topic1 = get_max_topics(get_document_topics)
            new_doc = remove_word_from_list(words,docs)

            processed_texts_2 = preprocess_data(new_doc)
            new_doc_bow = [id2word.doc2bow(text) for text in processed_texts_2]

            get_document_topics2 = [lda_model.get_document_topics(item) for item in new_doc_bow]
            doc2topic2 = get_max_topics(get_document_topics2)

            df1 = pd.DataFrame({"Topic":doc2topic1})
            df2 = pd.DataFrame({"Topic":doc2topic2})

            ablation_mapping[words] = compare_topics(df1,df2,topic)
    
        final_ablation_mappings[topic] = ablation_mapping
        
    return final_ablation_mappings,xdict

def get_max_topics(get_document_topics):
  max_topics = []

  for sublist in get_document_topics:
      max_topic = max(sublist, key=lambda x: x[1])
      max_topics.append(max_topic[0])

  return(max_topics)

def parse_list_to_dict(data):
    result_dict = {}
    for item in data:
        index = item[0]
        word_list = dict(item[1])
        result_dict[index] = word_list
    return result_dict

# dicts = parse_list_to_dict(lda_model.show_topics(num_topics=100, num_words=10,formatted=False))

In [8]:
ablation_res,xdict = raw_comprehensiveness_checks_lda(docs,100)

100%|██████████| 99/99 [6:37:33<00:00, 240.94s/it]  


In [9]:
import json

path = "result_2/comprehensiveness/20newsgroup/lda_model"

with open(path + "/lda_20ng.json", 'w') as json_file:
    json.dump(ablation_res, json_file,cls=NpEncoder)

with open(path + "/lda_20ng_base.json", 'w') as json_file:
    json.dump(xdict, json_file,cls=NpEncoder)

In [10]:
nyt = list(pd.read_csv("data/nyt2020.csv")["text"])
wiki = list(pd.read_csv("data/wiki_en_10000.csv")["text"])

In [43]:
ablation_res,xdict = raw_comprehensiveness_checks_lda(nyt,100)

path = "result/comprehensiveness/nyt/lda_model"
    
with open(path + "/lda_nyt.json", 'w') as json_file:
    json.dump(ablation_res, json_file,cls=NpEncoder)

with open(path + "/lda_nyt_base.json", 'w') as json_file:
    json.dump(xdict, json_file,cls=NpEncoder)

100%|██████████| 99/99 [2:01:31<00:00, 73.65s/it]  


In [11]:
ablation_res,xdict = raw_comprehensiveness_checks_lda(wiki,100)

path = "result/comprehensiveness/wiki/lda_model"
    
with open(path + "/lda_wiki.json", 'w') as json_file:
    json.dump(ablation_res, json_file,cls=NpEncoder)


with open(path + "/lda_wiki_base.json", 'w') as json_file:
    json.dump(xdict, json_file,cls=NpEncoder)

100%|██████████| 99/99 [11:09:44<00:00, 405.90s/it]  


# LDA Sufficiency

In [5]:
def filter_documents(documents, allowed_words):
    filtered_documents = []

    for document in documents:
        filtered_document = []
        for word in document.split():
            if word in allowed_words:
                filtered_document.append(word)
        filtered_documents.append(" ".join(filtered_document))

    return filtered_documents

In [10]:
def raw_sufficiency_checks_lda( docs: List[str], k: int) -> pd.DataFrame:
    """
    Take as input a list of topics and the initial documents, perturbs the documents
    by removing one topic word after another and repeats the modeling to find if the
    topic changes.
    """
    final_ablation_mappings = {}

    # Process the documents and train a LDA model
    processed_texts = preprocess_data(docs)

    # Create Dictionary
    id2word = corpora.Dictionary(processed_texts)
    # Create Corpus
    texts = processed_texts
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # Set number of topics
    num_topics = k
    # Build LDA model
    lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=42, passes=10, alpha="auto", per_word_topics=True)
    x = lda_model.show_topics(num_topics=100, num_words=10,formatted=False)
    xdict = parse_list_to_dict(x)

    topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

    for topic in tqdm(range(1,k)):
        ablation_mapping = {}
        for words in topics_words[topic][1]:
            get_document_topics = [lda_model.get_document_topics(item) for item in corpus]
            doc2topic1 = get_max_topics(get_document_topics)
            new_doc = filter_documents(docs,words)

            processed_texts_2 = preprocess_data(new_doc)
            new_doc_bow = [id2word.doc2bow(text) for text in processed_texts_2]

            get_document_topics2 = [lda_model.get_document_topics(item) for item in new_doc_bow]
            doc2topic2 = get_max_topics(get_document_topics2)

            df1 = pd.DataFrame({"Topic":doc2topic1})
            df2 = pd.DataFrame({"Topic":doc2topic2})

            ablation_mapping[words] = compare_topics(df1,df2,topic)
    
        final_ablation_mappings[topic] = ablation_mapping
        
    return final_ablation_mappings,xdict

def get_max_topics(get_document_topics):
  max_topics = []

  for sublist in get_document_topics:
      max_topic = max(sublist, key=lambda x: x[1])
      max_topics.append(max_topic[0])

  return(max_topics)

def parse_list_to_dict(data):
    result_dict = {}
    for item in data:
        index = item[0]
        word_list = dict(item[1])
        result_dict[index] = word_list
    return result_dict

# dicts = parse_list_to_dict(lda_model.show_topics(num_topics=100, num_words=10,formatted=False))

In [12]:
import json

ablation_res,xdict = raw_sufficiency_checks_lda(docs,100)


path = "result/sufficiency/20newsgroup/lda_model"
with open(path + "/lda_20ng.json", 'w') as json_file:
    json.dump(ablation_res, json_file,cls=NpEncoder)

with open(path + "/lda_20ng_base.json", 'w') as json_file:
    json.dump(xdict, json_file,cls=NpEncoder)

In [14]:
nyt = list(pd.read_csv("data/nyt2020.csv")["text"])
wiki = list(pd.read_csv("data/wiki_en_10000.csv")["text"])

In [15]:
ablation_res,xdict = raw_sufficiency_checks_lda(nyt,100)

path = "result/sufficiency/nyt/lda_model"
    
with open(path + "/lda_nyt.json", 'w') as json_file:
    json.dump(ablation_res, json_file,cls=NpEncoder)

with open(path + "/lda_nyt_base.json", 'w') as json_file:
    json.dump(xdict, json_file,cls=NpEncoder)

100%|██████████| 99/99 [1:25:40<00:00, 51.93s/it]


In [16]:
ablation_res,xdict = raw_sufficiency_checks_lda(wiki,100)

path = "result/sufficiency/wiki/lda_model"
    
with open(path + "/lda_wiki.json", 'w') as json_file:
    json.dump(ablation_res, json_file,cls=NpEncoder)


with open(path + "/lda_wiki_base.json", 'w') as json_file:
    json.dump(xdict, json_file,cls=NpEncoder)

100%|██████████| 99/99 [2:54:28<00:00, 105.75s/it]  


# Analysis