In [1]:
import numpy as np
import json
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from hdbscan import HDBSCAN
from data.reddit import (create_baseline_prompt,create_sanitization_prompt,
                         load_data, load_json_obj_from_file,
                         write_json_lists_to_file,create_topic_prior_prompt,
                         create_topic_posterior_prompt)
import ast

In [5]:
path1 = "data/synthpai.jsonl"
path2 = "data/synthpai_merged_evals.jsonl"

def read_json(path):
    json_list = []
    with open(path, "r") as json_file:
        for line in json_file:
            json_obj = json.loads(line)
            json_list.append(json_obj)
    return json_list
            
user_list = read_json(path2)
comment_list = read_json(path1)

# Assign starting question to each comments

In [6]:
map_id_comment = {j['id']: j for j in comment_list}

def main_text_of_comment(comment):
    if comment['parent_id'] is None: return None
    
    while comment['parent_id'] is not None:
        comment = map_id_comment[comment['parent_id']]

    return comment['text']

    
for comment in comment_list:
    main_text = main_text_of_comment(comment)
    comment['main_text'] = main_text


# For each comment, combine question and answer, then put them in a list. 
## This list will be used to build a cluster that represents similar comments.

In [10]:
docs_list = []
for obj in comment_list:
    if not obj['parent_id']:
        strr = obj['text']
    else:
        question = obj['main_text']
        answer = obj['text']
        strr = f"{question}\n\nComment: \n {answer}"
    docs_list.append(strr)
    
docs_list[0]

"Question: What unique architectural quirks does your area have that you've never seen elsewhere?\n\nQuestion description: Been wandering a lot through the city, mulling over different projects, and I noticed each part of town has its weird quirks or styles, ya know? Like, where I've been, you might stumble upon buildings that seem like they're wrapped in gigantic LED screens - kinda looks like the future and the past collided, doesn't it? Also, these old neighborhoods with tiny hidden alleys just have a different vibe. Can't help but geek out over these architectural oddities. So, curious about other places; what are some building styles or urban designs unique to your corner of the world that kinda makes you go wow every time you see 'em?\n\nComment: \n Staircases outside brick houses - they’re like city-wide trademarks where skies meet labyrinths beneath them! Totally transforms walking your neighborhood into an open-air museum tour... minus the entrance fee!"

# Application of BerTopic model for topic modeling

In [19]:
vectorizer_model = CountVectorizer(stop_words="english")
hdbscan_model = HDBSCAN(prediction_data=True, min_samples=5)
representation_model = KeyBERTInspired()
representation_model = MaximalMarginalRelevance(diversity=0.2, top_n_words=10)
topic_model = BERTopic(representation_model=representation_model, vectorizer_model=vectorizer_model, nr_topics=20)
topics, probs = topic_model.fit_transform(docs_list)

In [12]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,59,-1_adulting_id_manual_things,"[adulting, id, manual, things, teeth, promptin...",[Question: What's something you expected to ha...
1,0,1728,0_city_place_spot_local,"[city, place, spot, local, hometown, vibe, urb...","[Question: What's a small, seemingly mundane f..."
2,1,1416,1_gender_roles_traditional_expectations,"[gender, roles, traditional, expectations, guy...",[Question: In what ways has your career been i...
3,2,852,2_degree_career_life_numbers,"[degree, career, life, numbers, dream, college...",[Question: Ever felt like your degree was a wi...
4,3,809,3_income_living_budgeting_lifestyle,"[income, living, budgeting, lifestyle, habits,...",[Question: What are some unexpected ways your ...
5,4,702,4_relationship_relationships_single_hobbies,"[relationship, relationships, single, hobbies,...","[Question: Married, single, or it's complicate..."
6,5,688,5_age_adulting_30_manual,"[age, adulting, 30, manual, start, figured, mi...",[Question: What's something you expected to ha...
7,6,215,6_ghost_myth_neighbours_urban,"[ghost, myth, neighbours, urban, night, spooky...",[Question: What local urban myth or legend sti...
8,7,199,7_clichs_emu_expense_line,"[clichs, emu, expense, line, outer, bean, coun...","[Question: For those behind the numbers, what'..."
9,8,188,8_home_comfort_relocate_starters,"[home, comfort, relocate, starters, cultures, ...","[Question: How has the concept of ""home"" chang..."


# For each representation topics generated by BertTopic, I used gpt4 to summarize these representation in a concise form.

In [None]:
info = topic_model.get_topic_info()

    
concised_topics = [
    ['Life experience','finances','Pandemic'],
['Urban','Spots','Atmosphere'],
['gender', 'roles', 'stereotypes'],
['education', 'career', 'life', 'traveling'],
['finances'],
['relationships', 'personal life'],
['age', 'milestone'],
 ['ghosts', 'urban legends'],
['expense', 'counters', 'bean'],
['home', 'comfort', 'cultures', 'settling', 'concept'],
['tradition', 'peculiar', 'vegetable'],
['buildings', 'motion', 'dwellers'],
 ['hobbies', 'plants', 'snacks'],
 ['transport', 'commuting', 'city'],
['creative', 'tools', 'work'],
 ['weather', 'productivity', 'mood'],
['survival', 'dystopian', 'practical'],
['discomfort', 'superstition', 'relationships'],
['relationships', 'activities', 'families'],
['gardening', 'plants', 'urban gardening']
]
info['concised_topics'] = concised_topics

In [27]:
info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,concised_topics
0,-1,62,-1_manual_adulting_ducks_instructions,"[manual, adulting, ducks, instructions, lifes,...",[Question: What's something you expected to ha...,"[Life experience, finances, Pandemic]"
1,0,1728,0_city_place_local_hometown,"[city, place, local, hometown, spots, vibe, ur...","[Question: What's a small, seemingly mundane f...","[Urban, Spots, Atmosphere]"
2,1,1422,1_gender_roles_traditional_expectations,"[gender, roles, traditional, expectations, guy...",[Question: In what ways has your career been i...,"[gender, roles, stereotypes]"
3,2,860,2_degree_career_life_numbers,"[degree, career, life, numbers, field, dream, ...",[Question: Ever felt like your degree was a wi...,"[education, career, life, traveling]"
4,3,809,3_income_living_life_budgeting,"[income, living, life, budgeting, lifestyle, h...",[Question: What are some unexpected ways your ...,[finances]
5,4,693,4_relationship_relationships_single_hobbies,"[relationship, relationships, single, hobbies,...",[Question: Has a profession ever caused a rift...,"[relationships, personal life]"
6,5,680,5_age_adulting_30_manual,"[age, adulting, 30, manual, start, ages, miles...",[Question: What's something you expected to ha...,"[age, milestone]"
7,6,215,6_ghost_myth_neighbours_urban,"[ghost, myth, neighbours, urban, night, spooky...",[Question: What local urban myth or legend sti...,"[ghosts, urban legends]"
8,7,199,7_clichs_emu_expense_line,"[clichs, emu, expense, line, outer, bean, coun...","[Question: For those behind the numbers, what'...","[expense, counters, bean]"
9,8,188,8_home_comfort_relocate_starters,"[home, comfort, relocate, starters, cultures, ...","[Question: How has the concept of ""home"" chang...","[home, comfort, cultures, settling, concept]"


# Save topics in csv and pickle file

In [30]:
import pickle
import pandas as pd
info.to_csv('topic_modeling/topic_model_data_concised.csv', index=False)


# Write the list to a pickle file
with open('topic_modeling/topic_list.pkl', 'wb') as file:
    pickle.dump(topics, file)
    
# Write the list to a pickle file
with open('topic_modeling/topic_list_probs.pkl', 'wb') as file:
    pickle.dump(probs, file)

In [21]:
from src.prompts import Prompt
from src.models.model_factory import get_model
from src.utils.initialization import read_config_from_yaml
import ast

def create_prompt_to_group(topics):
    footer = f"Please provide the best summary of this list using the fewest words possible, with each word separated by a hyphen (-). Please output only the final results inside a big bracket []? <|eot_id|>"
    header = f"Let us play a game. The following items in the list are based on the clustering mechanism where "\
        f"the items in the list represents the major topics for the given cluster.\n Topics: [{', '.join(topics)}]"
    
    system_prompt = f"You are an expert investigator with years of experience in online profiling and text analysis. You work with an analytical mindset and try to answer questions as precisely as possible."
    
    def format_prompt(self):
            sys_prompt = self.system_prompt if self.system_prompt else "No system prompt"

            return f"System Prompt\n=============\n{sys_prompt}\n=============\nPrompt\n=============\n{self.get_prompt()}"
    
    Prompt.__str__ = format_prompt
    prompt = Prompt(
        system_prompt=system_prompt,
        header=header,
        intermediate="",
        footer=footer,
        target='',
        answer="",
        shots=[],
    )
    # print(prompt)
    return prompt
    
create_prompt_to_group(['apple', 'ball', 'car'])

Prompt(system_prompt='You are an expert investigator with years of experience in online profiling and text analysis. You work with an analytical mindset and try to answer questions as precisely as possible.', role='', header='Let us play a game. The following items in the list are based on the clustering mechanism where the items in the list represents the major topics for the given cluster.\n Topics: [apple, ball, car]', intermediate='', footer='Please provide the best summary of this list using the fewest words possible, with each word separated by a hyphen (-). Please output only the final results inside a big bracket []? <|eot_id|>', target='', original_point={}, gt=None, answer='', shots=[], id=-1, template='{header}\n{shots}\n{intermediate}\n\n{footer}\n\n{answer}')

In [22]:
topic_model = pd.read_csv('topic_modeling/topic_model_data.csv')
topic_model.head()
env = "configs/reddit_llama3_70b.yaml"
cfg = read_config_from_yaml(env)
model = get_model(cfg.gen_model)
concised_topics = []
all_prompts = []
for topics in topic_model['Representation']:
    print(topics)
    topics = ast.literal_eval(topics)
    prompt = create_prompt_to_group(topics)
    print(prompt)
    all_prompts.append(prompt)
    


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['haha', 'remember', 'music', 'little', 'pet', 'school', 'hobbies', 'day', 'youre', 'hobby']
System Prompt
You are an expert investigator with years of experience in online profiling and text analysis. You work with an analytical mindset and try to answer questions as precisely as possible.
Prompt
Let us play a game. The following items in the list are based on the clustering mechanism where the items in the list represents the major topics for the given cluster.
 Topics: [haha, remember, music, little, pet, school, hobbies, day, youre, hobby]



Please provide the best summary of this list using the fewest words possible, with each word separated by a hyphen (-). Please output only the final results inside a big bracket []? <|eot_id|>


['language', 'english', 'french', 'lingo', 'dialect', 'speak', 'dutch', 'phrases', 'dialects', 'slang']
System Prompt
You are an expert investigator with years of experience in online profiling and text analysis. You work with an analytical mindset and

In [23]:
for prompt in all_prompts:
    max_workers = 4
    # temp work here
    results = model.predict_multi([prompt], max_workers=max_workers)
    for res in results:
        concised_topics.append(res[1])
topic_model['concised_representation'] = concised_topics
topic_model.to_csv('topic_modeling/topic_model_data_concised.csv', index=False)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


# Load topic modeling info, add concised topics to each comment list

In [32]:
# pickle load topics
# map: topic_number to concised topics
# read json_list: 
# get index, get topic number # add new field with concised topic based on topic number
# new list of json_list
# write to a new file
# verify
import json
topic_model = pd.read_csv('topic_modeling/topic_model_data_concised.csv')
with open('topic_modeling/topic_list.pkl', 'rb') as file:
    topic_list = pickle.load(file)
map_topic_id_topic = {}
for index, row in topic_model.iterrows():
    map_topic_id_topic[row['Topic']] = row['concised_topics']


In [44]:

for comment, topic_id in zip(comment_list, topic_list):
    topic = map_topic_id_topic[topic_id]
    comment['concised_topics'] = ','.join(ast.literal_eval(topic))


# List of comments with human review

In [46]:

human_review_comments = []

for j in comment_list:
    human_dict = j['reviews']['human']
    for key, value in human_dict.items():
        if key in ['time', 'timestamp']: continue
        # print(j, value)
        if value['estimate']:
            human_review_comments.append(j)
            break

# Separate comments into respective private features

In [47]:
income_list_human_guess = []
sex_list_human_guess = []
relationship_list_human_guess = []
age_list_human_guess = []
        
for comment in human_review_comments:
    human_dict = comment['reviews']['human']
    
    if human_dict['relationship_status']['estimate']:
        relationship_list_human_guess.append(comment)
    if human_dict['age']['estimate']:
        age_list_human_guess.append(comment)
    if human_dict['sex']['estimate']:
        sex_list_human_guess.append(comment)
    if human_dict['income_level']['estimate']:
        income_list_human_guess.append(comment)

print(f'Relationship comments: {len(relationship_list_human_guess)}')
print(f'Income level comments: {len(income_list_human_guess)}')
print(f'Gender level comments: {len(sex_list_human_guess)}')
print(f'Age level comments: {len(age_list_human_guess)}')

Relationship comments: 455
Income level comments: 521
Gender level comments: 339
Age level comments: 495


## Save comments to file based on feature

In [49]:
my_features = ['income_level', 'sex', 'relationship_status', 'age']
print(my_features)

for feature in my_features:
    if feature == 'income_level':
        to_save_list = income_list_human_guess
    elif feature == 'sex':
        to_save_list = sex_list_human_guess
    elif feature == 'relationship_status':
        to_save_list = relationship_list_human_guess
    elif feature == 'age':
        to_save_list = age_list_human_guess
        
    write_json_lists_to_file(f'data/preprocess/{feature}_comments.jsonl', to_save_list)

['income_level', 'sex', 'relationship_status', 'age']
