# Human Distribution Analysis

Author: Xiaoyang Song & Morris Hsieh

## 1. Get Answer distribution of GPT3
* Get the logs from `checkpoint/MPI-results/Open-Vocab/order-symmetry/text-davinci-002`

In [1]:
OCEAN = 'OCEAN'
GPT_ANSWER_TRAIT = {} # Key: OCEAN, Value (cnt): [<1>, <2>, <3>, <4>, <5>]

# Read GPT Answer File
with open(
    './checkpoint/MPI-results/Open-Vocab/order-symmetry/text-davinci-002/syntactically-index/index-desc/[ocean_120]_[GPT3|text-davinci-002]_[syntactically-index]_[mpi-naive]_[original].txt'
    ) as fr:
    
    all = fr.readlines()[106:]
    for i in range(5): # Get 5 traits' answer distribution
        shift = i * 25
        scores_txt = all[shift].split("|")[1:-1]
        scores = [int(s) for s in scores_txt]

        GPT_ANSWER_TRAIT[OCEAN[i]] = scores

In [2]:
# traits, options: 5 x 5 matrix (OCEAN)
GPT_ANSWER_MAT = [GPT_ANSWER_TRAIT[k] for k in GPT_ANSWER_TRAIT.keys()]
GPT_ANSWER_MAT

[[1, 3, 4, 7, 9],
 [0, 0, 0, 15, 9],
 [1, 3, 1, 12, 7],
 [2, 0, 5, 10, 7],
 [2, 17, 3, 2, 0]]

## 2. Answer Distribution of IPIP120
* Get the data from `Dataset/Human Data/IPIP120.csv`

In [1]:
import pandas as pd
import ast
IPIP120_df = pd.read_csv("Dataset/Human Data/IPIP120.csv")

n_rows = IPIP120_df.shape[0]

from util.human_ans_parser import get_item_key_map

qt_df = pd.read_excel('Dataset/Human Data/IPIP-NEO-ItemKey.xls')
item_key_map = get_item_key_map(qt_df, int(120))
IPIP120_df.head()

Unnamed: 0.1,Unnamed: 0,CASE,SEX,AGE,SEC,MIN,HOUR,DAY,MONTH,YEAR,...,I111,I112,I113,I114,I115,I116,I117,I118,I119,I120
0,0,1.0,2.0,19.0,8.0,41.0,16.0,30.0,6.0,101.0,...,5.0,2.0,4.0,4.0,4.0,2.0,4.0,1.0,5.0,4.0
1,1,2.0,2.0,22.0,24.0,45.0,16.0,30.0,6.0,101.0,...,1.0,4.0,3.0,3.0,4.0,4.0,2.0,3.0,4.0,3.0
2,2,4.0,2.0,22.0,3.0,57.0,16.0,30.0,6.0,101.0,...,2.0,3.0,2.0,4.0,4.0,2.0,4.0,2.0,5.0,4.0
3,3,5.0,2.0,22.0,44.0,4.0,17.0,30.0,6.0,101.0,...,1.0,5.0,5.0,5.0,4.0,1.0,5.0,3.0,5.0,3.0
4,4,6.0,1.0,13.0,14.0,6.0,17.0,30.0,6.0,101.0,...,1.0,2.0,4.0,3.0,5.0,2.0,4.0,4.0,3.0,5.0


In [2]:
from geomloss import SamplesLoss
from tqdm import tqdm
from matplotlib import pyplot as plt
from scipy.stats import wasserstein_distance
from icecream import ic
from collections import Counter
from itertools import filterfalse
import numpy as np
import seaborn as sns

# OPT-125M-120
LLM_OBS = {
    'O': np.array([1, 5] * 12),
    'C': np.array([1]* 11 + [5]*13),
    'E': np.array([1]* 18 + [5]*6),
    'A': np.array([1]* 7 + [5]*17),
    'N': np.array([1]* 17 + [5]*7)
}

# OPT-350M-120

LLM_OBS = {
    'O': np.array([2, 4] * 12),
    'C': np.array([2]* 11 + [4]*13),
    'E': np.array([2]* 18 + [4]*6),
    'A': np.array([2]* 7 + [4]*17),
    'N': np.array([2]* 17 + [4]*7)
}
# Observation
OBS = {}

for trait in "OCEAN":
    coi = list(filterfalse(lambda k: item_key_map[k][1] != trait, item_key_map))
    OBS[trait] = np.array(IPIP120_df[coi])



In [156]:
def calculate_scores(llm_obs, human_obs, disable_display=False):
    return np.array([wasserstein_distance(llm_obs, obs) for obs in tqdm(human_obs, disable=disable_display)])

LLM_SCORES = {}
for trait in 'OCEAN':
    LLM_SCORES[trait] = calculate_scores(LLM_OBS[trait], OBS[trait])

100%|██████████| 619150/619150 [00:27<00:00, 22679.59it/s]
100%|██████████| 619150/619150 [00:26<00:00, 23141.49it/s]
100%|██████████| 619150/619150 [00:26<00:00, 23075.71it/s]
100%|██████████| 619150/619150 [00:28<00:00, 22004.67it/s]
100%|██████████| 619150/619150 [00:24<00:00, 24794.54it/s]


In [6]:
def dist_to_obs(dist):
    pass


def obs_to_dist(obs):
    dist = []
    for x in tqdm(obs):
        counter = Counter(x)
        dist.append([counter[i] for i in range(1, 6, 1)])
    return np.array(dist)

## Human Distribution Wasserstein Distance Estimation

In [53]:
N = len(IPIP120_df)
n = 1000

OBS_SCORES = {}
# Choose a set of samples
idx = np.random.choice(N, n, replace=False)
# ic(idx)
for trait in 'OCEAN':
        trait_obs = OBS[trait]
        sample_scores = []
        # For efficiency, we only measure the upper diagonal pairs as Wasserstein distance is symmetric
        for i in tqdm(range(n)):
                # wass_score = []
                for j in range(i, n, 1):
                        sample_scores.append(wasserstein_distance(trait_obs[i], trait_obs[j]))
                # sample_scores.append(wass_score)
        sample_scores = np.array(sample_scores)
        sample_scores_1d = sample_scores.reshape((-1,))
        # Store
        OBS_SCORES[trait] = sample_scores_1d


100%|██████████| 1000/1000 [00:21<00:00, 47.33it/s]
100%|██████████| 1000/1000 [00:20<00:00, 48.09it/s]
100%|██████████| 1000/1000 [00:20<00:00, 49.86it/s]
100%|██████████| 1000/1000 [00:19<00:00, 50.64it/s]
100%|██████████| 1000/1000 [00:19<00:00, 50.16it/s]


In [154]:
import torch
torch.save(OBS_SCORES, "human/HUMAN_OBS_SCORES.pt")

In [None]:
# For verification purpose only: this cell may take 20 minutes to rerun
# Pick 10 human, compute their answer distributions with all others
# We expect the distribution to be similar the one estimated by those 1000 samples before
rand_human_idx = np.random.choice(N, 10, replace=False)
HUMAN_VAL_SCORES = {}

for trait in "OCEAN":
    trait_obs = OBS[trait]
    human_val_scores = []
    for human in tqdm(rand_human_idx):
        human_val_scores.append(calculate_scores(trait_obs[human].reshape((-1,)), trait_obs, True))
    HUMAN_VAL_SCORES[trait] = np.array(human_val_scores).reshape((-1,))

In [150]:
import torch
torch.save(HUMAN_VAL_SCORES, "human/HUMAN_VAL.pt")

In [157]:
config = {
    'num_bins': 30,
    'alpha': 0.3,
    # 'c1': '#0000a7',
    'c1': 'navy',
    # 'c2': '#eecc16',

    'c2': '#c1272d',
    # 'c1': '#b3b3b3',
    'trait': 'O',
    'l1': 'Human',
    # 'l2': 'OPT-125M',
    'l2': 'Test',
    # 'l2': 'Human Test',
    # 'l2': 'OPT-13B',
    'title': 'OPT-125M-Human'
}


def plot_distribution(dist1, dist2, c):
    plt.hist(dist1, bins=c['num_bins'], density=True, alpha=c['alpha'], color=c['c1'], label=c['l1'])
    plt.hist(dist2, bins=c['num_bins'], density=True, alpha=c['alpha'], color=c['c2'], label=c['l2'])
    sns.kdeplot(dist1, linewidth=1, color=c['c1'], bw_adjust=2)
    sns.kdeplot(dist2, linewidth=1, color=c['c2'], bw_adjust=2)
    plt.legend()
    plt.xlabel("Wasserstein Distance")
    plt.ylabel("Density")
    plt.title(f"Pairwise Wasserstein Distance Distribution - Trait {c['trait']}")
    plt.savefig(f"human/{c['l1'] + '-' + c['l2']}-{c['trait']}.jpg", dpi=1200)
    plt.close()

for trait in 'OCEAN':
    dist1 = OBS_SCORES[trait]
    # dist2 = HUMAN_VAL_SCORES[trait].reshape((-1,))
    dist2 = LLM_SCORES[trait]
    config['trait'] = trait
    plot_distribution(dist1, dist2, config)

In [152]:
def plot_llm_distribution(dist, c):
    plt.hist(dist, bins=c['num_bins'], density=True, alpha=c['alpha'], color=c['c2'], label=c['l2'])
    sns.kdeplot(dist, linewidth=1, color=c['c2'], bw_adjust=2)
    plt.legend()
    plt.xlabel("Wasserstein Distance")
    plt.ylabel("Density")
    plt.title(f"Pairwise Wasserstein Distance Distribution - Trait {c['trait']}")
    plt.savefig(f"human/{c['l2']}-{c['trait']}.jpg", dpi=1200)
    plt.close()

for trait in 'OCEAN':
    dist = LLM_SCORES[trait]
    config['trait'] = trait
    plot_llm_distribution(dist, config)

In [148]:
def find_percentage_below(scores, threshold):
    mask = scores <= threshold
    num = sum(mask)
    p = num/ len(scores)
    return mask, num, p
for trait in 'OCEAN':
    p_lst = []
    for threshold in [1, 1e-1, 1e-2, 1e-3, 1e-4]:
        mask, num,  p = find_percentage_below(OBS_SCORES[trait], threshold)
        # ic(num)
        # ic(f"{p*100:.4f}%")
        p_lst.append(f"{p*100:.4f}%")
    print(trait)
    print(p_lst)

O
['81.6755%', '0.7730%', '0.2174%', '0.2174%', '0.2174%']
C
['73.3760%', '0.7504%', '0.2176%', '0.2176%', '0.2176%']
E
['72.8923%', '0.6701%', '0.2152%', '0.2152%', '0.2152%']
A
['81.3045%', '0.9293%', '0.2282%', '0.2282%', '0.2282%']
N
['67.6016%', '0.6072%', '0.2142%', '0.2142%', '0.2142%']


Threshold: [1, 0.1, 0.001, 0.0001, 0.00001]
O
['81.6755%', '0.7730%', '0.2174%', '0.2174%', '0.2174%']
C
['73.3760%', '0.7504%', '0.2176%', '0.2176%', '0.2176%']
E
['72.8923%', '0.6701%', '0.2152%', '0.2152%', '0.2152%']
A
['81.3045%', '0.9293%', '0.2282%', '0.2282%', '0.2282%']
N
['67.6016%', '0.6072%', '0.2142%', '0.2142%', '0.2142%']

## Entropy Calculation

In [None]:
def normalize(arr): return arr / np.sum(arr, axis=1, keepdims=True)
def entropy(arr):
    tmp = arr
    tmp[arr == 0] = 1
    log_arr = np.emath.logn(5, tmp) # 5 classes so log base 5
    return -np.sum(arr * log_arr, axis=1)

In [158]:
LLM_DIST = {}

for trait in 'OCEAN':
    LLM_DIST[trait] = normalize(obs_to_dist(LLM_OBS[trait].reshape((1, -1))))

LLM_ENTROPY = {}
for trait in 'OCEAN':
    LLM_ENTROPY[trait] = entropy(LLM_DIST[trait])

onehot = np.array([[1, 0, 0, 0, 0]])
uniform = np.array([[0.2, 0.2, 0.2, 0.20, 0.20]])
ic(entropy(onehot))
ic(entropy(uniform))
LLM_ENTROPY

100%|██████████| 1/1 [00:00<00:00, 833.20it/s]
100%|██████████| 1/1 [00:00<00:00, 7108.99it/s]
100%|██████████| 1/1 [00:00<00:00, 9118.05it/s]
100%|██████████| 1/1 [00:00<00:00, 9098.27it/s]
100%|██████████| 1/1 [00:00<00:00, 10305.42it/s]
ic| entropy(onehot): array([-0.])
ic| entropy(uniform): array([1.])


{'O': array([0.43067656]),
 'C': array([0.42851664]),
 'E': array([0.34939847]),
 'A': array([0.37506091]),
 'N': array([0.37506091])}

In [124]:
HUMAN_DIST = {}
for trait in 'OCEAN':
    HUMAN_DIST[trait] = normalize(obs_to_dist(OBS[trait]))

HUMAN_ENTROPY = {}
for trait in 'OCEAN':
    HUMAN_ENTROPY[trait] = entropy(HUMAN_DIST[trait])

100%|██████████| 619150/619150 [00:06<00:00, 98459.68it/s] 
100%|██████████| 619150/619150 [00:05<00:00, 105703.99it/s]
100%|██████████| 619150/619150 [00:06<00:00, 92467.98it/s] 
100%|██████████| 619150/619150 [00:06<00:00, 96330.27it/s] 
100%|██████████| 619150/619150 [00:06<00:00, 92595.75it/s] 


In [125]:
ic(HUMAN_DIST['O'].shape)
ic(HUMAN_ENTROPY['O'].shape)
HUMAN_ENTROPY['O']

ic| HUMAN_DIST['O'].shape: (619150, 5)
ic| HUMAN_ENTROPY['O'].shape: (619150,)


array([0.70391133, 0.68665388, 0.83942405, ..., 0.73079042, 0.87710584,
       0.76689374])

In [145]:
config = {
    'num_bins': 30,
    'alpha': 0.3,
    'c1': 'navy',
    # 'c2': '#eecc16',
    'c2': '#c1272d',
    'trait': 'O',
    'l1': 'Human',
    'l2': 'OPT-125M',
    'title': 'OPT-125M-Human'
}


def plot_entropy(dist, llm_entropy, c):
    plt.hist(dist, bins=c['num_bins'], density=True, alpha=c['alpha'], color=c['c1'], label=c['l1'])
    plt.axvline(x=llm_entropy, color=c['c2'], linestyle='dashed', label=c['l2'])
    sns.kdeplot(dist, linewidth=1, color=c['c1'], bw_adjust=2)
    plt.legend()
    plt.xlabel("Entropy")
    plt.ylabel("Density")
    plt.title(f"Entropy Distribution - Trait {c['trait']}")
    plt.savefig(f"human/Entropy-{c['l1'] + '-' + c['l2']}-{c['trait']}.jpg", dpi=1000)
    plt.close()

for trait in 'OCEAN':
    dist = HUMAN_ENTROPY[trait]
    llm_entropy = LLM_ENTROPY[trait]
    config['trait'] = trait
    plot_entropy(dist, llm_entropy, config)