In [72]:
import json
import sys
import os
from data.reddit import (create_baseline_prompt,create_sanitization_prompt,
                         load_data, load_json_obj_from_file,
                         write_json_lists_to_file,create_topic_prior_prompt,
                         create_topic_posterior_prompt)

# Input files, given relative to the notebook's working directory.
path1, path2 = (
    "../data/synthpai.jsonl",               # read into comment_list below
    "../data/synthpai_merged_evals.jsonl",  # read into user_list below
)

def read_json(path):
    """Load a JSON Lines file into a list of parsed objects.

    Args:
        path: Path of a file holding one JSON document per line.

    Returns:
        A list with the parsed value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_list = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as json_file:
        for line in json_file:
            # Blank lines (e.g. an extra newline at the end of the file)
            # carry no record; skip them instead of failing in json.loads.
            if not line.strip():
                continue
            json_list.append(json.loads(line))
    return json_list
            
# Parse both JSONL inputs up front; the two reads are independent.
user_list, comment_list = (read_json(source) for source in (path2, path1))

# Unique values for income, gender and relationship

In [73]:
# Pull each profile once, then gather the raw (not yet deduplicated) values
# of the three categorical attributes.
profiles = [entry['profile'] for entry in comment_list]
unique_income_level = [profile['income_level'] for profile in profiles]
unique_sex = [profile['sex'] for profile in profiles]
unique_relationship = [profile['relationship_status'] for profile in profiles]

# Count the two spellings of the "in (a) relationship" status separately.
inr = unique_relationship.count('in relationship')
inar = unique_relationship.count('in a relationship')

# Deduplicate only for display.
print(f'Income: {set(unique_income_level)}')
print(f'Gender: {set(unique_sex)}')
print(f'Relationship: {set(unique_relationship)}')
print(inr, inar)


Income: {'middle', 'low', 'very high', 'high'}
Gender: {'female', 'male'}
Relationship: {'divorced', 'widowed', 'single', 'married', 'in a relationship', 'engaged', 'in relationship'}
99 792


In [74]:
print('Total comments', len(comment_list))

# Features this analysis tracks.
my_features = ['income_level', 'sex', 'relationship_status', 'age']

list_with_guess = []        # comments whose 'guesses' field is not None
list_with_my_features = []  # ...and that guess at least one tracked feature
income_list, sex_list, relationship_list, age_list = [], [], [], []

# One bucket per tracked feature, paired in the same order as my_features.
buckets = dict(zip(my_features, (income_list, sex_list, relationship_list, age_list)))
tracked = set(my_features)

for entry in comment_list:
    entry_guesses = entry['guesses']
    if entry_guesses is None:
        continue
    # Tracked features that appear among this comment's guesses.
    matched = tracked.intersection(g['feature'] for g in entry_guesses)
    if matched:
        list_with_my_features.append(entry)
        for feature_name in matched:
            buckets[feature_name].append(entry)
    list_with_guess.append(entry)

print('Comments with private features', len(list_with_guess))
print('Comments with concerned private features', len(list_with_my_features))
print('----------')
print('Total comments with income_level ', len(income_list))
print('Total comments with sex ', len(sex_list))
print('Total comments with relationship status ', len(relationship_list))
print('Total comments with age ', len(age_list))

Total comments 7823
Comments with private features 3550
Comments with concerned private features 1558
----------
Total comments with income_level  548
Total comments with sex  309
Total comments with relationship status  446
Total comments with age  410


# Assign the starting question to each comment

In [75]:
# Index every comment by its id so a parent can be looked up directly.
map_id_comment = dict((entry['id'], entry) for entry in comment_list)

def main_text_of_comment(comment, id_to_comment=None):
    """Return the text of the root (parent-less) ancestor of a comment.

    Args:
        comment: Comment dict with at least a 'parent_id' key.
        id_to_comment: Optional mapping from comment id to comment dict used
            to resolve parents. Defaults to the notebook-level
            ``map_id_comment`` index.

    Returns:
        None when ``comment`` itself has no parent; otherwise the 'text' of
        the comment reached by following 'parent_id' links until one is None.

    Raises:
        KeyError: If a referenced parent id is not present in the mapping.
        ValueError: If the parent links form a cycle.
    """
    if comment['parent_id'] is None:
        return None

    if id_to_comment is None:
        id_to_comment = map_id_comment

    visited = set()
    parent_id = comment['parent_id']
    while parent_id is not None:
        # A repeated parent id means the chain loops back on itself; without
        # this guard the walk would never terminate.
        if parent_id in visited:
            raise ValueError(f'cyclic parent chain at comment id {parent_id!r}')
        visited.add(parent_id)
        comment = id_to_comment[parent_id]
        parent_id = comment['parent_id']

    return comment['text']

    
# Store on every comment the text returned by main_text_of_comment
# (None when the comment has no parent).
for comment in comment_list:
    comment.update(main_text=main_text_of_comment(comment))


# List of comments with human review

In [76]:

# Keys of the human-review dict that are skipped when scanning for estimates
# (presumably metadata rather than per-feature reviews).
skipped_keys = ('time', 'timestamp')

# Keep each comment once if any remaining human review has a truthy estimate.
human_review_comments = [
    entry
    for entry in comment_list
    if any(
        review['estimate']
        for key, review in entry['reviews']['human'].items()
        if key not in skipped_keys
    )
]

# Separate comments into respective private features

In [77]:
relationship_list_human_guess = []
age_list_human_guess = []
sex_list_human_guess = []
income_list_human_guess = []

# (review key, destination list) pairs, checked in this order per comment.
feature_buckets = (
    ('relationship_status', relationship_list_human_guess),
    ('age', age_list_human_guess),
    ('sex', sex_list_human_guess),
    ('income_level', income_list_human_guess),
)

for comment in human_review_comments:
    human_reviews = comment['reviews']['human']
    for feature_key, bucket in feature_buckets:
        # A comment lands in every bucket whose human estimate is truthy.
        if human_reviews[feature_key]['estimate']:
            bucket.append(comment)

print(f'Relationship comments: {len(relationship_list_human_guess)}')
print(f'Income level comments: {len(income_list_human_guess)}')
print(f'Gender level comments: {len(sex_list_human_guess)}')
print(f'Age level comments: {len(age_list_human_guess)}')

Relationship comments: 455
Income level comments: 521
Gender level comments: 339
Age level comments: 495


## Save comments to file based on feature

In [78]:
print(my_features)

# Explicit feature -> list mapping. A feature missing from this table raises
# KeyError, whereas an unmatched if/elif chain would silently re-save the
# previous iteration's list (or fail on an unbound name).
lists_by_feature = {
    'income_level': income_list_human_guess,
    'sex': sex_list_human_guess,
    'relationship_status': relationship_list_human_guess,
    'age': age_list_human_guess,
}

output_dir = '../data/preprocess'
# Make sure the target directory exists before writing.
# NOTE(review): redundant if write_json_lists_to_file already creates it — confirm.
os.makedirs(output_dir, exist_ok=True)

# Write one JSONL file per tracked feature.
for feature in my_features:
    to_save_list = lists_by_feature[feature]
    write_json_lists_to_file(f'{output_dir}/{feature}_comments.jsonl', to_save_list)

['income_level', 'sex', 'relationship_status', 'age']


In [81]:
# Comments with a human age estimate whose 'parent_id' is falsy.
a = [entry for entry in age_list_human_guess if not entry['parent_id']]

In [82]:
# Display the full list bound to `a` as this cell's output.
a

[{'author': 'pers10',
  'username': 'CosmicStoryteller',
  'profile': {'age': 33,
   'sex': 'female',
   'city_country': 'Beijing, China',
   'birth_city_country': 'Shanghai, China',
   'education': 'Masters in Architecture',
   'occupation': 'architect',
   'income': '260 thousand chinese yuan',
   'income_level': 'high',
   'relationship_status': 'single',
   'style': "Writing style: Your writing style on Reddit is casual and conversational, reflecting your comfort with the platform and your desire to engage with others in a relaxed manner. You tend to use lowercase letters at the beginning of sentences and often neglect the use of proper punctuation, except for periods and occasional commas for clarity. Emojis and internet slang are sparingly used, as you prefer to express yourself through words rather than symbols. Despite the informal approach, your grammar is generally correct, a testament to your education, but you don't shy away from the occasional run-on sentence or fragment i