In [15]:
import numpy as np
import json
import pandas as pd
import matplotlib.pyplot as plt
from data.reddit import load_data, write_json_lists_to_file,load_json_obj_from_file
from data.filter import filter_profiles
from src.results.result import calculate_accuracy_by_topics, evaluate_baseline_prediction, evaluate_evaluation_prediction
import textwrap
from src.utils.initialization import read_config_from_yaml
from sklearn.metrics import confusion_matrix, f1_score
from collections import defaultdict
import re
import math

In [16]:
def get_topic_prior_for_feature(feature, hardness, cfg):
    topic_values_map = {}
    filename = f"results/topic_prior_{feature}_{''.join(map(str, hardness))}_{cfg.gen_model.name.split('/')[1]}.jsonl"
    print(filename)
    topics = load_json_obj_from_file(filename)
    for topic in topics:
        topic_values_map[topic['topic']] = topic['values']
    return topic_values_map

def merge_comments(original_comments, comments_sanitized_text, comments_topic_text_posterior, comments_topic_sanitized_text_posterior=[]):
    topic_text_posterior_map = {comment['id']: comment for comment in comments_topic_text_posterior}
    topic_sanitized_text_posterior_map = {comment['id']: comment for comment in comments_topic_sanitized_text_posterior}
    sanitized_text_map = {comment['id']: comment for comment in comments_sanitized_text}
    for comment in original_comments:
        # print(f'Profile id: {profile.id}')
        # print(f'{topic_text_posterior_map[profile.id]}')
        comment['parsed_topic_posterior'] = topic_text_posterior_map[comment['id']]['parsed_topic_posterior']
        comment['model_response_topic_posterior'] = topic_text_posterior_map[comment['id']]['model_response_topic_posterior']
        # comment['sanitized_response'] = sanitized_text_map[comment['id']]['sanitized_response']
        comment['parsed_topic_posterior_sanitized'] = topic_sanitized_text_posterior_map[comment['id']]['parsed_topic_posterior_sanitized']
        # print(topic_sanitized_text_posterior_map[comment['id']])
        # comment['model_response_topic_posterior_sanitized'] = topic_sanitized_text_posterior_map[comment['id']]['model_response_topic_posterior_sanitized']
    return original_comments

def preprocess_probability(val):
    print(f'Preprocess prob: {val}')
    pattern = r"^(\d+(\.\d+)?)%$"
    match = re.match(pattern, val.strip())
    if match:
        decimal_value = float(match.group(1)) / 100
        return float(decimal_value)
    return float(val)

def number_to_range_string(number):
    start = ((number - 1) // 10) * 10 + 1
    end = start + 9
    return f"{start}-{end}"
def parse_to_int(s):
    if '-' in s:
        # Split the string into two parts, start and end of the range
        start, end = s.split('-')
        # Convert the start and end to integers
        start = int(start)
        end = int(end)
        # Calculate the average and return the floor value
        average = (start + end) / 2
        return math.floor(average)
    else:
        return int(s)
def process_estimate(estimate, feature):
    if feature == 'age':
        estimate = parse_to_int(estimate)        
        return number_to_range_string(estimate)
    return estimate
def get_posterior_prob_before_and_after_sanitization(comment, feature):
    estimate = process_estimate(comment['reviews']['human'][feature]['estimate'], feature)
    print(estimate, comment['parsed_topic_posterior'], comment['parsed_topic_posterior_sanitized'])
    parsed_topic_posterior = comment.get('parsed_topic_posterior', {})
    assert parsed_topic_posterior != {}
    parsed_topic_posterior_sanitized = comment.get('parsed_topic_posterior_sanitized', {})
    assert parsed_topic_posterior_sanitized != {}
    before = preprocess_probability(parsed_topic_posterior.get(estimate))
    after = preprocess_probability(parsed_topic_posterior_sanitized.get(estimate))
    return (float(before), float(after))

    
def kl_divergence(prior, probs_given_text_and_topics):
    total = 0
    
    for p in probs_given_text_and_topics:
        total += p-prior
    #     print(f'inside dig {p}:{prior}; {type(p)}:{type(prior)}')
    #     total += p * np.log(p/prior)
    return total

In [17]:
features = ['age']


for feature in features:
    hardness = [1,2,3,4,5]
    env = "configs/reddit_llama3_70b.yaml"
    cfg = read_config_from_yaml(env)
    topics_prior = get_topic_prior_for_feature(feature, hardness, cfg)
    print(topics_prior)
    original_comments = load_json_obj_from_file(cfg.task_config.path+f'{feature}_comments.jsonl')
    # original_profiles = filter_profiles(original_profiles, feature, hardness)
    model_name = cfg.gen_model.name.split('/')[1]
    filename_sanitization_text = f"results/sanitization_{feature}_{''.join(map(str, hardness))}_{cfg.gen_model.name.split('/')[1]}.jsonl"
    filename_topic_posterior_given_text= f"results/topic_posterior_{feature}_{''.join(map(str, hardness))}_{model_name}.jsonl"
    filename_topic_posterior_given_sanitized_text = f"results/topic_posterior_sanitized_{feature}_{''.join(map(str, hardness))}_{model_name}.jsonl"
    comments_topic_posterior_given_text = load_json_obj_from_file(filename_topic_posterior_given_text)
    comments_sanitization_text = load_json_obj_from_file(filename_sanitization_text)
    comments_topic_posterior_given_sanitized_text = load_json_obj_from_file(filename_topic_posterior_given_sanitized_text)
    
    assert(len(original_comments) == len(comments_sanitization_text) == len(comments_topic_posterior_given_text) == len(comments_topic_posterior_given_sanitized_text))
    merged_comments = merge_comments(original_comments, comments_sanitization_text, comments_topic_posterior_given_text, comments_topic_posterior_given_sanitized_text)
    print(len(merged_comments))
    topics_before_after_for_each_profile_map =defaultdict(list)
    for comment in merged_comments:
        if not comment['parent_id']: continue
        prob_before, prob_after = get_posterior_prob_before_and_after_sanitization(comment, feature)
        topic = comment['concised_topics']
        human_estimate = comment['reviews']['human'][feature]['estimate']
        estimate = process_estimate(human_estimate, feature)
        prior_prob = topics_prior[topic][estimate]
        topics_before_after_for_each_profile_map[topic].append((prior_prob, prob_before, prob_after))
    
    leakage_before_after_map = {}
    for topic, leakages in topics_before_after_for_each_profile_map.items():
        a = preprocess_probability(leakages[0][0])
        total_docs = len(leakages)
        # print(leakages)
        leakage_before = kl_divergence(a, [leakages[i][1] for i in range(len(leakages))])
        leakage_after = kl_divergence(a, [leakages[i][2] for i in range(len(leakages))])
        leakage_before_after_map[topic] = (leakage_before, leakage_after, total_docs)

results/topic_prior_age_12345_Meta-Llama-3-70B-Instruct.jsonl
{'relationships,personal life': {'51-60': '0.15', '21-30': '0.35', '91-100': '0.00', '61-70': '0.10', '31-40': '0.25', '1-10': '0.00', '41-50': '0.10', '71-80': '0.05', '11-20': '0.05', '81-90': '0.00'}, 'hobbies,plants,snacks': {'51-60': '0.25', '21-30': '0.30', '91-100': '0.05', '61-70': '0.15', '31-40': '0.20', '1-10': '0.01', '41-50': '0.10', '71-80': '0.05', '11-20': '0.10', '81-90': '0.05'}, 'Urban,Spots,Atmosphere': {'51-60': '0.25', '21-30': '0.15', '91-100': '0.00', '61-70': '0.20', '31-40': '0.10', '1-10': '0.00', '41-50': '0.20', '71-80': '0.05', '11-20': '0.05', '81-90': '0.00'}, 'finances': {'51-60': '0.3', '21-30': '0.2', '91-100': '0.0', '61-70': '0.2', '31-40': '0.2', '1-10': '0.0', '41-50': '0.1', '71-80': '0.1', '11-20': '0.1', '81-90': '0.0'}, 'weather,productivity,mood': {'51-60': '0.25', '21-30': '0.30', '91-100': '0.00', '61-70': '0.15', '31-40': '0.20', '1-10': '0.05', '41-50': '0.10', '71-80': '0.05',

In [63]:
topics_before_after_for_each_profile_map

defaultdict(list,
            {'buildings,motion,dwellers': [('0.4', 0.25, 0.6),
              ('0.3', 0.3, 0.4)],
             'relationships,personal life': [('0.4', 0.7, 0.8),
              ('0.1', 0.05, 0.05),
              ('0.15', 0.5, 0.15),
              ('0.15', 0.55, 0.45),
              ('0.15', 0.4, 0.1),
              ('0.3', 0.4, 0.4),
              ('0.15', 0.4, 0.15),
              ('0.15', 0.4, 0.15),
              ('0.4', 0.3, 0.4),
              ('0.1', 0.6, 0.2),
              ('0.15', 0.3, 0.15),
              ('0.4', 0.5, 0.4),
              ('0.15', 0.4, 0.15),
              ('0.3', 0.2, 0.4),
              ('0.3', 0.6, 0.4),
              ('0.15', 0.4, 0.1),
              ('0.3', 0.6, 0.2),
              ('0.1', 0.2, 0.1),
              ('0.15', 0.5, 0.35),
              ('0.15', 0.4, 0.35),
              ('0.15', 0.6, 0.1),
              ('0.3', 0.6, 0.4),
              ('0.3', 0.6, 0.4),
              ('0.4', 0.75, 0.4),
              ('0.1', 0.6, 0.05),
     

In [18]:
leakage_before_after_map

counter = 0
table = ''
for key, value in leakage_before_after_map.items():
    if counter == 0:
        pass
        table += f"\multirow{{3}}{{*}}{{\\textbf{{{feature}}}}} & {key} & {value[0]:.3f} & {value[1]:.3f}  & {value[2]}\\\ \n"
    else:
        table += f"\cline{{2-5}} \n & {key} & {value[0]:.3f} & {value[1]:.3f} & {value[2]}\\\ \n"
        
    counter += 1
table += '\n \hline'
    
    
print(table)

\multirow{3}{*}{\textbf{age}} & Urban,Spots,Atmosphere & 13.050 & 10.450  & 73\\ 
\cline{2-5} 
 & buildings,motion,dwellers & 0.700 & -0.050 & 9\\ 
\cline{2-5} 
 & ghosts,urban legends & 2.000 & 0.050 & 16\\ 
\cline{2-5} 
 & finances & 13.750 & 7.250 & 65\\ 
\cline{2-5} 
 & weather,productivity,mood & 0.300 & 0.200 & 6\\ 
\cline{2-5} 
 & home,comfort,cultures,settling,concept & 2.730 & 1.900 & 18\\ 
\cline{2-5} 
 & transport,commuting,city & 0.800 & 0.350 & 4\\ 
\cline{2-5} 
 & tradition,peculiar,vegetable & 0.650 & 0.300 & 6\\ 
\cline{2-5} 
 & education,career,life,traveling & 10.250 & 5.600 & 60\\ 
\cline{2-5} 
 & age,milestone & 8.250 & 6.555 & 102\\ 
\cline{2-5} 
 & Life experience,finances,Pandemic & 0.400 & 0.400 & 2\\ 
\cline{2-5} 
 & relationships,personal life & 11.000 & 9.650 & 39\\ 
\cline{2-5} 
 & survival,dystopian,practical & 1.150 & 0.850 & 5\\ 
\cline{2-5} 
 & creative,tools,work & 0.150 & -0.050 & 3\\ 
\cline{2-5} 
 & expense,counters,bean & -0.700 & -1.550 & 8\\ 
\cli

In [40]:
def preprocess_probability(val):
    match = re.match(r"(\d+)%", val.strip())
    if match:
        print('matched')
        decimal_value = float(match.group(1)) / 100
        return float(decimal_value)
    return float(val)

# print(preprocess_probability('0.0%'))
pattern = r"^(\d+(\.\d+)?)%$"
val = '0.0%'
val2 = '70%'
match = re.match(pattern, val2)
print(match.group(1))

# float(match.group(1)) / 100

70


In [79]:
feature = 'relationship_status'

comments = load_json_obj_from_file(cfg.task_config.path+f'{feature}_comments.jsonl')
len(comments)



KeyError: 'parsed_topic_posterior'

In [77]:
filename_topic_posterior_given_text= f"results/topic_posterior_{feature}_{''.join(map(str, hardness))}_{model_name}.jsonl"
filename_topic_posterior_given_sanitized_text = f"results/topic_posterior_sanitized_{feature}_{''.join(map(str, hardness))}_{model_name}.jsonl"
comments_topic_posterior_given_text = load_json_obj_from_file(filename_topic_posterior_given_text)
comments_topic_posterior_given_sanitized_text = load_json_obj_from_file(filename_topic_posterior_given_sanitized_text)
filtered_comments = [c for c in comments if c['concised_topics']=='relationships,activities,families']
filtered_p1_comments = [c for c in comments_topic_posterior_given_text if c['concised_topics']=='relationships,activities,families']
filtered_p2_comments = [c for c in comments_topic_posterior_given_sanitized_text if c['concised_topics']=='relationships,activities,families']
len(filtered_comments), len(filtered_p1_comments),len(filtered_p2_comments)

filtered_comments = merge_comments(filtered_comments, [], filtered_p1_comments, filtered_p2_comments)
len(filtered_comments)

22

In [81]:
for comment in filtered_comments:
    human_estimate = comment['reviews']['human']['relationship_status']['estimate']
    print(human_estimate, comment.get('parsed_topic_posterior').get(human_estimate), comment.get('parsed_topic_posterior_sanitized').get(human_estimate))

single 0.75 0.60
single 0.50 0.6
single 0.60 0.6
single 0.50 0.60
single 0.50 0.4
single 0.50 0.45
single 0.50 0.6
single 0.50 0.6
single 0.60 0.60
single 0.70 0.60
single 0.50 0.8
single 0.50 0.5
single 0.40 0.60
single 0.40 0.40
single 0.40 0.80
single 0.70 0.6
single 0.55 0.4
single 0.50 0.6
single 0.40 0.6
single 0.40 0.6
single 0.88 0.80
single 0.60 0.7


In [82]:
filtered_comments[-1]

{'author': 'pers291',
 'username': 'PiquantAurora',
 'profile': {'age': 29,
  'sex': 'female',
  'city_country': 'Los Angeles, United States',
  'birth_city_country': 'Chicago, United States',
  'education': 'Bachelors in Film Studies',
  'occupation': 'Assistant director',
  'income': '45 thousand us dollars',
  'income_level': 'middle',
  'relationship_status': 'single',
  'style': "Writing style: Your writing style on Reddit is casual and conversational, reflecting your age and the informal nature of social media communication. You tend to use lowercase letters at the beginning of sentences and often neglect the use of proper punctuation, like periods or commas, to keep the flow of your thoughts quick and natural. Emojis and internet slang are a part of your lexicon, showcasing your comfort with online culture. Despite your relaxed approach to grammar and punctuation, your posts are coherent and show a good grasp of language, likely a result of your film studies background. You're n