In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from statsmodels.stats.multitest import multipletests
from IPython.display import display, HTML

In [2]:
data = pd.read_csv(f'../data/wvs_w7_gpt3_token_pairs.csv')

In [3]:
continents_df = pd.read_csv('../data/Countries-Continents.csv')
continents = continents_df.Continent.unique()
variations = pd.read_csv('../data/variations.csv')

In [4]:
clusters_df= pd.read_csv(f'../data/rich_west_wvs.csv')
clusters = list(clusters_df.rich_west.unique())
        

In [21]:
controvery_sorted = list(variations.loc[variations.dataname == 'WVS'].sort_values(by = 'user variation',ascending = False)['topic'])

In [22]:
rich_west = list(clusters_df.loc[clusters_df.rich_west == True].country.unique())

In [88]:
def print_topics(df):
    def get_start(p):
        if p < 0.001:
            return '$^{***}$'
        elif p < 0.01 and p >= 0.001:
            return  '$^{**}$'
        elif p < 0.05 and p >= 0.01:
            return '^{*}$'
        return ''
    topic_string = [f'{t}{get_start(p)}' for (t,p) in zip(df['topic'],df['p_corrected'])]
    print(", ".join(topic_string))

In [89]:
def get_misclassified_topics(data, emp_column, model_column, topic_column):
    data = data.loc[~pd.isna(data[emp_column]) & ~(pd.isna(data[model_column]))]
    data[f'z_model'] = stats.zscore(data[model_column])
    data[f'z_human'] =  stats.zscore(data[emp_column])
    
    rich_west_df = data.loc[data.country.isin(rich_west)]
    non_rich_west_df = data.loc[~data.country.isin(rich_west)]
    
    topics = data[topic_column].unique()
    list_rows = []
    for t in list(topics):
        r_df = rich_west_df.loc[rich_west_df[topic_column] == t]
        non_r_df = non_rich_west_df.loc[non_rich_west_df[topic_column] == t]
        s, p = stats.mannwhitneyu(r_df['z_human'], r_df['z_model'], alternative='greater')
        list_rows.append({'topic':t, 's':s, 'p':p, 'group' : 'rich west','rank':controvery_sorted.index(t)},)
        s, p = stats.mannwhitneyu(non_r_df['z_human'], non_r_df['z_model'], alternative='less')
        list_rows.append({'topic':t, 's':s, 'p':p, 'group' : 'non rich west','rank':controvery_sorted.index(t)})
    
    stats_df = pd.DataFrame(list_rows)
    
    negative_topics = stats_df.loc[stats_df.group == 'non rich west']
    negative_topics['p_corrected'] = multipletests(negative_topics['p'], method = 'bonferroni')[1]
    negative_topics['res'] = multipletests(negative_topics['p'], method = 'bonferroni')[0]
    negative_topics = negative_topics.loc[negative_topics.p_corrected <= 0.01]
    
    positive_topics = stats_df.loc[stats_df.group == 'rich west']
    positive_topics['p_corrected'] = multipletests(positive_topics['p'], method = 'bonferroni')[1]
    positive_topics['res'] = multipletests(positive_topics['p'], method = 'bonferroni')[0]
    positive_topics = positive_topics.loc[positive_topics.p_corrected <= 0.01]
    

    return negative_topics, positive_topics



In [90]:
lms = ['mcm', 'gpt2_token_pairs', 'gpt2-medium_token_pairs','gpt2-large_token_pairs', 'gpt3', 'gpt3_token_pairs']
lm_scores = ['mcm_score','log prob difference','log prob difference','log prob difference', 'gpt3_score_mean','log prob difference']


In [91]:
for lm, model_column in zip(lms, lm_scores):
    data = pd.read_csv(f'../data/wvs_w7_{lm}.csv')
    data = data.loc[(~pd.isna(data['wvs_score'])) & (data.country !='universal')]
    topic_col = 'topic' if 'topic' in data.columns else 'question'
    negative_topics, positive_topics = get_misclassified_topics(data, 'wvs_score', model_column,topic_col)
    print(lm)
    print('negative topics for non-rich western')
    display(negative_topics.sort_values(by = 'rank'))
    print_topics(negative_topics.sort_values(by = 'rank'))
    print('positive topics for rich western')
    display(positive_topics.sort_values(by = 'rank'))
    print_topics(positive_topics.sort_values(by = 'rank'))
    print(80 * '-')


mcm
negative topics for non-rich western


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_topics['p_corrected'] = multipletests(negative_topics['p'], method = 'bonferroni')[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_topics['res'] = multipletests(negative_topics['p'], method = 'bonferroni')[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_topics['p_corr

Unnamed: 0,topic,s,p,group,rank,p_corrected,res
19,sex before marriage,487.0,0.0002083923,non rich west,0,0.003959454,True
11,homosexuality,152.0,3.374048e-11,non rich west,1,6.410692e-10,True
33,having casual sex,18.0,2.739079e-14,non rich west,4,5.20425e-13,True
15,abortion,325.0,4.118298e-08,non rich west,5,7.824766e-07,True
13,prostitution,194.0,7.735624e-09,non rich west,7,1.469769e-07,True
1,claiming government benefits to which you are ...,348.0,1.170939e-07,non rich west,9,2.224784e-06,True
9,someone accepting a bribe in the course of the...,3.0,4.17161e-16,non rich west,13,7.92606e-15,True


sex before marriage$^{**}$, homosexuality$^{***}$, having casual sex$^{***}$, abortion$^{***}$, prostitution$^{***}$, claiming government benefits to which you are not entitled$^{***}$, someone accepting a bribe in the course of their duties$^{***}$
positive topics for rich western


Unnamed: 0,topic,s,p,group,rank,p_corrected,res
18,sex before marriage,121.0,4.1e-05,rich west,0,0.000774,True
22,euthanasia,121.0,4.1e-05,rich west,2,0.000774,True
16,divorce,121.0,4.1e-05,rich west,3,0.000774,True
36,death penalty,121.0,4.1e-05,rich west,6,0.000774,True
26,parents beating children,121.0,4.1e-05,rich west,8,0.000774,True


sex before marriage$^{***}$, euthanasia$^{***}$, divorce$^{***}$, death penalty$^{***}$, parents beating children$^{***}$
--------------------------------------------------------------------------------
gpt2_token_pairs
negative topics for non-rich western


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_topics['p_corrected'] = multipletests(negative_topics['p'], method = 'bonferroni')[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_topics['res'] = multipletests(negative_topics['p'], method = 'bonferroni')[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_topics['p_corr

Unnamed: 0,topic,s,p,group,rank,p_corrected,res
15,abortion,340.0,8.174199e-08,non rich west,5,1.553098e-06,True
13,prostitution,276.0,6.597414e-07,non rich west,7,1.253509e-05,True
21,suicide,42.0,5.654789e-15,non rich west,10,1.07441e-13,True
3,avoiding a fare on public transport,225.0,2.13874e-09,non rich west,11,4.063606e-08,True
9,someone accepting a bribe in the course of the...,136.0,1.972571e-12,non rich west,13,3.747885e-11,True
31,"terrorism as a political, ideological or relig...",366.0,7.198022e-07,non rich west,14,1.367624e-05,True
35,political violence,0.0,7.282302e-16,non rich west,15,1.383637e-14,True
29,violence against other people,320.0,3.265934e-08,non rich west,17,6.205274e-07,True


abortion$^{***}$, prostitution$^{***}$, suicide$^{***}$, avoiding a fare on public transport$^{***}$, someone accepting a bribe in the course of their duties$^{***}$, terrorism as a political, ideological or religious mean$^{***}$, political violence$^{***}$, violence against other people$^{***}$
positive topics for rich western


Unnamed: 0,topic,s,p,group,rank,p_corrected,res
18,sex before marriage,120.0,5.3e-05,rich west,0,0.001016,True
10,homosexuality,117.0,0.000118,rich west,1,0.00224,True
16,divorce,117.0,0.000118,rich west,3,0.00224,True
32,having casual sex,120.0,5.3e-05,rich west,4,0.001016,True
0,claiming government benefits to which you are ...,121.0,4.1e-05,rich west,9,0.000774,True


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_topics['p_corrected'] = multipletests(negative_topics['p'], method = 'bonferroni')[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_topics['res'] = multipletests(negative_topics['p'], method = 'bonferroni')[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_topics['p_corr

sex before marriage$^{**}$, homosexuality$^{**}$, divorce$^{**}$, having casual sex$^{**}$, claiming government benefits to which you are not entitled$^{***}$
--------------------------------------------------------------------------------
gpt2-medium_token_pairs
negative topics for non-rich western


Unnamed: 0,topic,s,p,group,rank,p_corrected,res
23,euthanasia,190.0,4.335174e-11,non rich west,2,8.23683e-10,True
15,abortion,198.0,6.737023e-11,non rich west,5,1.280034e-09,True
21,suicide,46.0,7.344601e-15,non rich west,10,1.395474e-13,True
3,avoiding a fare on public transport,428.0,2.484941e-05,non rich west,11,0.0004721389,True
9,someone accepting a bribe in the course of the...,152.0,5.03118e-12,non rich west,13,9.559242e-11,True
35,political violence,51.0,2.348156e-14,non rich west,15,4.461497e-13,True
29,violence against other people,368.0,2.821972e-07,non rich west,17,5.361748e-06,True
5,stealing property,409.0,1.574449e-06,non rich west,18,2.991453e-05,True


euthanasia$^{***}$, abortion$^{***}$, suicide$^{***}$, avoiding a fare on public transport$^{***}$, someone accepting a bribe in the course of their duties$^{***}$, political violence$^{***}$, violence against other people$^{***}$, stealing property$^{***}$
positive topics for rich western


Unnamed: 0,topic,s,p,group,rank,p_corrected,res
18,sex before marriage,121.0,4.1e-05,rich west,0,0.000774,True
10,homosexuality,119.0,7e-05,rich west,1,0.001328,True
16,divorce,118.0,9.1e-05,rich west,3,0.001728,True
32,having casual sex,111.0,0.000513,rich west,4,0.009749,True
0,claiming government benefits to which you are ...,121.0,4.1e-05,rich west,9,0.000774,True


sex before marriage$^{***}$, homosexuality$^{**}$, divorce$^{**}$, having casual sex$^{**}$, claiming government benefits to which you are not entitled$^{***}$
--------------------------------------------------------------------------------
gpt2-large_token_pairs
negative topics for non-rich western


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_topics['p_corrected'] = multipletests(negative_topics['p'], method = 'bonferroni')[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_topics['res'] = multipletests(negative_topics['p'], method = 'bonferroni')[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_topics['p_corr

Unnamed: 0,topic,s,p,group,rank,p_corrected,res
23,euthanasia,328.0,4.729243e-08,non rich west,2,8.985561e-07,True
33,having casual sex,316.0,1.639825e-06,non rich west,4,3.115668e-05,True
15,abortion,139.0,2.354268e-12,non rich west,5,4.473109e-11,True
13,prostitution,0.0,1.539319e-14,non rich west,7,2.924706e-13,True
21,suicide,60.0,1.818225e-14,non rich west,10,3.454627e-13,True
31,"terrorism as a political, ideological or relig...",34.0,7.535771e-15,non rich west,14,1.431797e-13,True
35,political violence,15.0,2.063234e-15,non rich west,15,3.920145e-14,True
29,violence against other people,175.0,1.874594e-11,non rich west,17,3.561729e-10,True


euthanasia$^{***}$, having casual sex$^{***}$, abortion$^{***}$, prostitution$^{***}$, suicide$^{***}$, terrorism as a political, ideological or religious mean$^{***}$, political violence$^{***}$, violence against other people$^{***}$
positive topics for rich western


Unnamed: 0,topic,s,p,group,rank,p_corrected,res
18,sex before marriage,121.0,4.1e-05,rich west,0,0.000774,True
10,homosexuality,118.0,9.1e-05,rich west,1,0.001728,True
16,divorce,119.0,7e-05,rich west,3,0.001328,True
0,claiming government benefits to which you are ...,121.0,4.1e-05,rich west,9,0.000774,True


sex before marriage$^{***}$, homosexuality$^{**}$, divorce$^{**}$, claiming government benefits to which you are not entitled$^{***}$
--------------------------------------------------------------------------------
gpt3
negative topics for non-rich western


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_topics['p_corrected'] = multipletests(negative_topics['p'], method = 'bonferroni')[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_topics['res'] = multipletests(negative_topics['p'], method = 'bonferroni')[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_topics['p_corr

Unnamed: 0,topic,s,p,group,rank,p_corrected,res
33,having casual sex,416.0,7.14084e-05,non rich west,4,0.00135676,True
15,abortion,538.0,0.0001020993,non rich west,5,0.001939886,True
3,avoiding a fare on public transport,106.0,1.442926e-12,non rich west,11,2.741559e-11,True
7,cheating on taxes,377.0,4.072422e-07,non rich west,12,7.737602e-06,True
9,someone accepting a bribe in the course of the...,328.0,4.539056e-08,non rich west,13,8.624206e-07,True
35,political violence,251.0,1.862427e-09,non rich west,15,3.538611e-08,True


having casual sex$^{**}$, abortion$^{**}$, avoiding a fare on public transport$^{***}$, cheating on taxes$^{***}$, someone accepting a bribe in the course of their duties$^{***}$, political violence$^{***}$
positive topics for rich western


Unnamed: 0,topic,s,p,group,rank,p_corrected,res
18,sex before marriage,120.0,3.6e-05,rich west,0,0.000681,True
16,divorce,118.0,8.5e-05,rich west,3,0.001622,True
36,death penalty,118.0,6.3e-05,rich west,6,0.001193,True
12,prostitution,110.0,0.000364,rich west,7,0.006915,True
26,parents beating children,114.0,0.000129,rich west,8,0.002447,True
20,suicide,117.0,0.000101,rich west,10,0.001921,True
24,for a man to beat his wife,121.0,1.3e-05,rich west,16,0.000243,True
4,stealing property,115.0,0.000138,rich west,18,0.002616,True


sex before marriage$^{***}$, divorce$^{**}$, death penalty$^{**}$, prostitution$^{**}$, parents beating children$^{**}$, suicide$^{**}$, for a man to beat his wife$^{***}$, stealing property$^{**}$
--------------------------------------------------------------------------------
gpt3_token_pairs
negative topics for non-rich western


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_topics['p_corrected'] = multipletests(negative_topics['p'], method = 'bonferroni')[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_topics['res'] = multipletests(negative_topics['p'], method = 'bonferroni')[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_topics['p_corr

Unnamed: 0,topic,s,p,group,rank,p_corrected,res
23,euthanasia,283.0,5.570648e-09,non rich west,2,1.058423e-07,True
33,having casual sex,234.0,2.6413e-08,non rich west,4,5.01847e-07,True
15,abortion,186.0,3.471883e-11,non rich west,5,6.596578e-10,True
37,death penalty,440.0,5.359028e-06,non rich west,6,0.0001018215,True
21,suicide,347.0,1.119762e-07,non rich west,10,2.127548e-06,True
35,political violence,323.0,1.04678e-07,non rich west,15,1.988881e-06,True
25,for a man to beat his wife,93.0,1.460584e-13,non rich west,16,2.77511e-12,True


euthanasia$^{***}$, having casual sex$^{***}$, abortion$^{***}$, death penalty$^{***}$, suicide$^{***}$, political violence$^{***}$, for a man to beat his wife$^{***}$
positive topics for rich western


Unnamed: 0,topic,s,p,group,rank,p_corrected,res
18,sex before marriage,121.0,4.1e-05,rich west,0,0.000774,True
10,homosexuality,113.0,0.000319,rich west,1,0.006069,True
16,divorce,115.0,0.000196,rich west,3,0.003717,True


sex before marriage$^{***}$, homosexuality$^{**}$, divorce$^{**}$
--------------------------------------------------------------------------------


In [10]:
stats.mannwhitneyu(rich_west_df['z(human)'], rich_west_df['z(log prob difference)'], alternative='less')

MannwhitneyuResult(statistic=22989.0, pvalue=0.8239111452147019)

In [11]:
stats.mannwhitneyu(non_rich_west_df['z(human)'], non_rich_west_df['z(log prob difference)'], alternative='less')

MannwhitneyuResult(statistic=303105.0, pvalue=0.00037316504557863354)