In [1]:
from openai import OpenAI
import random

client = OpenAI(api_key='')

datasets = ['twitter', 'facebook_2020', 'yelp_reviews', 'amazon', 'bbc_news', '20_newsgroup', 'imdb']
models = ['BERTopic', 'CTM', 'GensimLDA', 'NMF', 'Mallet_LDA']
paths = [data + '/' + model + '/run_' + str(run) + '/topics_100.txt' for data in datasets for model in models for run in range(1, 6)]

# paths has all possible datasets, models, and runs (8 * 5 * 5 = 200 paths)

topics_per_run = 20
words_per_run = 5

output = []

for path in paths:

  f = open(path, "r")
  list_f = list(f)
  list_f = [i for i in list_f if len(i) > 10] # don't include weird, shorter topics

  if "BERTopic" in path:
    list_f = list_f[1:] # topic 0 is not useful in BERTopic
    
  topics = [[i.strip() for i in topic.split(",")] for topic in random.sample(list_f, topics_per_run)]
  topics_4 = [topic[0:words_per_run - 1] for topic in topics] # -1 since intruder word

  intruder_words = []
  returns = []

  for i, topic in enumerate(topics_4):

    intruder_topic_index = random.randint(0, 4)
    intruder_topic = topics_4[intruder_topic_index]
    intruder_words_excluded = [word for word in topics[intruder_topic_index][0:20] if word not in topic]
    intruder_word = random.choice(intruder_words_excluded)
    topic.append(intruder_word)
    intruder_words.append(intruder_word)
    random.shuffle(topic) # shuffles to avoid bias
    returns.append((topic, intruder_word))
  
  for i in returns:
    system_role = "You are an NLP researcher trying to evaluate the quality of topics created by a topic modeling algorithm."
    prompt = """Given a set of words, identify which word is the intruder word (i.e. the word that does not belong in the topic). Here is an example input and output: 

    Input: Topic: [floppy, alphabet, computer, processor, memory disk]
    Output: Intruder word: alphabet 

    Please, do this task for the following input. You must pick a word from the input set:

    Input: Topic: """ + str(i[0]) + """
    Output: Intruder word: """

    # commented out for safe-guarding -- don't want to run mistakenly and increase costs
    completion = client.chat.completions.create(
      model="gpt-4",
      messages=[
        {"role": "system", "content": system_role}, 
        {"role": "user", "content": prompt} 
      ],
      temperature = 0
    )

    prediction = completion.choices[0].message.content
    prediction = "".join("".join(prediction.split("'")).split('"'))

    # order of output is: (dataset, model, list of words, actual intruder word, predicted intruder word, if prediction is in input list)
    output.append((path.split("/")[0], path.split("/")[1], i[0], i[1], prediction, prediction in i[0]))

# should take ~40 minutes

In [2]:
# where prediction not in list (errors that we have to re-run or pick another topic for)
bad_reads = [i for i in output if not i[-1]]

In [3]:
for i in bad_reads:
    print(i)

('twitter', 'Mallet_LDA', ['very', 'may', 'might', 'tomorrow', "you're"], 'tomorrow', 'youre', False)
('twitter', 'Mallet_LDA', ['women', 'being', "let's", 'because', 'men'], "let's", 'lets', False)
('facebook_2020', 'Mallet_LDA', ['money', "mother's", 'their', 'pay', 'paid'], "mother's", 'mothers', False)
('yelp_reviews', 'Mallet_LDA', ['art', "wasn't", "i'm", 'pretty', 'bit'], 'art', 'wasnt', False)
('yelp_reviews', 'Mallet_LDA', ["wasn't", 'average', 'better', 'bit', 'pretty'], "wasn't", 'wasnt', False)
('yelp_reviews', 'Mallet_LDA', ['soup', 'chinese', 'chicken', "i'd", 'rice'], "i'd", 'id', False)
('yelp_reviews', 'Mallet_LDA', ['though', "wasn't", 'old', 'pretty', 'bit'], 'old', 'wasnt', False)
('amazon', 'Mallet_LDA', ['print', 'ink', 'pen', "doesn't", 'paper'], "doesn't", 'doesnt', False)
('amazon', 'Mallet_LDA', ['year', "isn't", 'old', 'baby', 'kids'], "isn't", 'isnt', False)
('amazon', 'Mallet_LDA', ['coffee', 'bottle', "can't", 'cup', 'water'], "can't", 'cant', False)
('ama

In [4]:
# usually all assigned to 0, but need to carefully analyze results from cell above to change values around to make it most accurate
# (sometimes GPT-4 removes appostraphes, punctuation, etc. but its correct -- correct for that model + dataset +1)
# (sometimes returns bad read or an answer not in the options -- bad_reads for that model + dataset +1)

CSS_bad_reads = {
  'BERTopic': 0,
  'CTM': 0,
  'GensimLDA': 0,
  'NMF': 0,
  'Mallet_LDA': 0
}

non_CSS_bad_reads = {
  'BERTopic': 0,
  'CTM': 2,
  'GensimLDA': 0,
  'NMF': 13,
  'Mallet_LDA': 0
}

CSS_correct = {
  'BERTopic': 0,
  'CTM': 0,
  'GensimLDA': 0,
  'NMF': 0,
  'Mallet_LDA': 2
}

non_CSS_correct = {
  'BERTopic': 0,
  'CTM': 0,
  'GensimLDA': 0,
  'NMF': 0,
  'Mallet_LDA': 11
}

In [5]:
for i in output:
  if i[3] == i[4]:
    if i[0] in ['twitter', 'fb', 'qualtrics']:
      CSS_correct[i[1]] += 1
    else:
      non_CSS_correct[i[1]] += 1

print("CSS Counts: " + str(CSS_correct))
print("Non-CSS Counts: " + str(non_CSS_correct))

print()

CSS_acc = {
  'BERTopic': 100 * CSS_correct['BERTopic'] / (300 - CSS_bad_reads['BERTopic']),
  'CTM': 100 * CSS_correct['CTM'] / (300 - CSS_bad_reads['CTM']),
  'GensimLDA': 100 * CSS_correct['GensimLDA'] / (300 - CSS_bad_reads['GensimLDA']),
  'NMF': 100 * CSS_correct['NMF'] / (300 - CSS_bad_reads['NMF']),
  'Mallet_LDA': 100 * CSS_correct['Mallet_LDA'] / (300 - CSS_bad_reads['Mallet_LDA'])
}

non_CSS_acc = {
  'BERTopic': 100 * non_CSS_correct['BERTopic'] / (500 - non_CSS_bad_reads['BERTopic']),
  'CTM': 100 * non_CSS_correct['CTM'] / (500 - non_CSS_bad_reads['CTM']),
  'GensimLDA': 100 * non_CSS_correct['GensimLDA'] / (500 - non_CSS_bad_reads['GensimLDA']),
  'NMF': 100 * non_CSS_correct['NMF'] / (500 - non_CSS_bad_reads['NMF']),
  'Mallet_LDA': 100 * non_CSS_correct['Mallet_LDA'] / (500 - non_CSS_bad_reads['Mallet_LDA'])
}

print("CSS Accuracy: " + str(CSS_acc))
print("Non-CSS Accuracy: " + str(non_CSS_acc))

CSS Counts: {'BERTopic': 76, 'CTM': 55, 'GensimLDA': 35, 'NMF': 47, 'Mallet_LDA': 70}
Non-CSS Counts: {'BERTopic': 456, 'CTM': 345, 'GensimLDA': 267, 'NMF': 196, 'Mallet_LDA': 423}

CSS Accuracy: {'BERTopic': 25.333333333333332, 'CTM': 18.333333333333332, 'GensimLDA': 11.666666666666666, 'NMF': 15.666666666666666, 'Mallet_LDA': 23.333333333333332}
Non-CSS Accuracy: {'BERTopic': 91.2, 'CTM': 69.27710843373494, 'GensimLDA': 53.4, 'NMF': 40.24640657084189, 'Mallet_LDA': 84.6}


In [6]:
# write to file
import json

with open('CSS_counts.txt', 'w') as f:
    f.write(json.dumps(CSS_correct))

with open('non_CSS_counts.txt', 'w') as f:
    f.write(json.dumps(non_CSS_correct))

with open('CSS_acc.txt', 'w') as f:
    f.write(json.dumps(CSS_acc))

with open('non_CSS_acc.txt', 'w') as f:
    f.write(json.dumps(non_CSS_acc))

with open('intruder_detection_outputs.txt', 'w') as f:
  for item in output:
    f.write(str(item) + "\n")

In [7]:
output = []

with open('intruder_detection_outputs.txt', 'r') as file:
    for line in file:
        line = line[1:-1]
        arr = line.split(", ")

        dataset = arr[0][1:-1]
        model = arr[1][1:-1]
        arr = arr[2:]

        new_line = ", ".join(arr)

        arr = new_line.split("[")[1].split("]")

        list_of_words = arr[0].split(', ')
        list_of_words = [word[1:-1] for word in list_of_words]

        arr = arr[1].strip().split(', ')[1:]

        actual_intruder = arr[0][1:-1]
        predicted_intruder = arr[1][1:-1]

        arr = arr[2:]
        in_list = ", ".join(arr).strip()[:-1]

        if in_list == 'True':
            in_list = True
        else:
            in_list = False

        tup = (dataset, model, list_of_words, actual_intruder, predicted_intruder, in_list)
        output.append(tup)

In [8]:
# bbc_news

bbc_intruder = {
    'GensimLDA': 0,
    'Mallet_LDA': 0,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 0
}

bbc_intruder_bad_reads = {
    'GensimLDA': 0,
    'Mallet_LDA': 0,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 0
}

for i in output:
    if i[0] == 'bbc_news':
        if i[3] == i[4]:
            bbc_intruder[i[1]] += 1

bbc_intruder = {i : bbc_intruder[i] / (100 - bbc_intruder_bad_reads[i]) for i in bbc_intruder.keys()}
bbc_intruder

{'GensimLDA': 0.43,
 'Mallet_LDA': 0.7,
 'CTM': 0.64,
 'BERTopic': 0.79,
 'NMF': 0.23}

In [9]:
# 20_newsgroup

ng_intruder = {
    'GensimLDA': 0,
    'Mallet_LDA': 2,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 0
}

ng_intruder_bad_reads = {
    'GensimLDA': 0,
    'Mallet_LDA': 1,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 15
}

for i in output:
    if i[0] == '20_newsgroup':
        if i[3] == i[4]:
            ng_intruder[i[1]] += 1

ng_intruder = {i : ng_intruder[i] / (100 - ng_intruder_bad_reads[i]) for i in ng_intruder.keys()}
ng_intruder

{'GensimLDA': 0.5,
 'Mallet_LDA': 0.7878787878787878,
 'CTM': 0.64,
 'BERTopic': 0.74,
 'NMF': 0.6470588235294118}

In [10]:
# imdb

imdb_intruder = {
    'GensimLDA': 0,
    'Mallet_LDA': 0,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 0
}

imdb_intruder_bad_reads = {
    'GensimLDA': 0,
    'Mallet_LDA': 1,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 0
}

for i in output:
    if i[0] == 'imdb':
        if i[3] == i[4]:
            imdb_intruder[i[1]] += 1

imdb_intruder = {i : imdb_intruder[i] / (100 - imdb_intruder_bad_reads[i]) for i in imdb_intruder.keys()}
imdb_intruder

{'GensimLDA': 0.34,
 'Mallet_LDA': 0.5454545454545454,
 'CTM': 0.41,
 'BERTopic': 0.63,
 'NMF': 0.2}

In [11]:
# amazon

amazon_intruder = {
    'GensimLDA': 0,
    'Mallet_LDA': 1,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 0
}

amazon_intruder_bad_reads = {
    'GensimLDA': 0,
    'Mallet_LDA': 0,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 0
}

for i in output:
    if i[0] == 'amazon':
        if i[3] == i[4]:
            amazon_intruder[i[1]] += 1

amazon_intruder = {i : amazon_intruder[i] / (100 - amazon_intruder_bad_reads[i]) for i in amazon_intruder.keys()}
amazon_intruder

{'GensimLDA': 0.38,
 'Mallet_LDA': 0.7,
 'CTM': 0.55,
 'BERTopic': 0.85,
 'NMF': 0.4}

In [12]:
# yelp_reviews

yelp_intruder = {
    'GensimLDA': 1,
    'Mallet_LDA': 1,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 0
}

yelp_intruder_bad_reads = {
    'GensimLDA': 0,
    'Mallet_LDA': 0,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 0
}

for i in output:
    if i[0] == 'yelp_reviews':
        if i[3] == i[4]:
            yelp_intruder[i[1]] += 1

yelp_intruder = {i : yelp_intruder[i] / (100 - yelp_intruder_bad_reads[i]) for i in yelp_intruder.keys()}
yelp_intruder

{'GensimLDA': 0.62,
 'Mallet_LDA': 0.75,
 'CTM': 0.59,
 'BERTopic': 0.8,
 'NMF': 0.31}

In [13]:
# facebook_2020

fb_intruder = {
    'GensimLDA': 0,
    'Mallet_LDA': 0,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 0
}

fb_intruder_bad_reads = {
    'GensimLDA': 0,
    'Mallet_LDA': 0,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 0
}

for i in output:
    if i[0] == 'facebook_2020':
        if i[3] == i[4]:
            fb_intruder[i[1]] += 1

fb_intruder = {i : fb_intruder[i] / (100 - fb_intruder_bad_reads[i]) for i in fb_intruder.keys()}
fb_intruder

{'GensimLDA': 0.41,
 'Mallet_LDA': 0.69,
 'CTM': 0.62,
 'BERTopic': 0.75,
 'NMF': 0.27}

In [14]:
# twitter

twitter_intruder = {
    'GensimLDA': 0,
    'Mallet_LDA': 1,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 0
}

twitter_intruder_bad_reads = {
    'GensimLDA': 0,
    'Mallet_LDA': 0,
    'CTM': 0,
    'BERTopic': 0,
    'NMF': 0
}

for i in output:
    if i[0] == 'twitter':
        if i[3] == i[4]:
            twitter_intruder[i[1]] += 1

twitter_intruder = {i : twitter_intruder[i] / (100 - twitter_intruder_bad_reads[i]) for i in twitter_intruder.keys()}
twitter_intruder

{'GensimLDA': 0.35,
 'Mallet_LDA': 0.69,
 'CTM': 0.55,
 'BERTopic': 0.76,
 'NMF': 0.47}