In [1]:
import pickle
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")
import pingouin as pg

In [2]:
# Load the pickled score tables. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this from a trusted source.
with open("./data/emnlp_scores_coha.p", "rb") as scores_file:
    files = pickle.load(scores_file)

# Label each table with the property it scores so the rows stay
# distinguishable after the two tables are stacked.
# (These assignments also add the column to the frames held inside `files`.)
df1 = files["moral_relevance"]
df1["property"] = "previous_link"

df2 = files["moral_polarity"]
df2["property"] = "polarity"

ts_df = pd.concat((df1, df2), ignore_index=True)
ts_df["words"] = ts_df["word"]
ts_df = ts_df.drop(["word", "vector"], axis=1)
# Log-odds transform of `outputs`: log(p / (1 - p)).
ts_df["outputs_z"] = ts_df["outputs"].apply(lambda p: np.log(p / (1 - p)))
# Keep only rows whose year is 1850 or later.
ts_df = ts_df.loc[ts_df.year >= 1850].reset_index(drop=True)

In [3]:
def _split_terms(term_table):
    """Flatten the comma-separated ``Terms`` column into lower-cased, stripped terms.

    Terms are returned in row order, then in their order within each cell.
    """
    return [
        term.lower().strip()
        for cell in term_table["Terms"]
        for term in cell.split(",")
    ]


def _mean_by_word_year(scores):
    """Average ``outputs_z`` over each (words, year) pair, as a flat DataFrame."""
    return scores.groupby(["words", "year"]).outputs_z.mean().reset_index()


# Category-norm members listed under "disease" with production frequency > 1.
category_norms = pd.read_csv("./data/Referential version_Item level data.csv")
category_norms = category_norms.loc[category_norms["prod.freq"] > 1]
disease_members = category_norms.loc[category_norms.category == "disease"][
    "category.member"
].unique()

epi_df = pd.read_csv("./data/moralization_terms/epidemics.csv")  # wikipedia
epidemic_cues = _split_terms(epi_df)

disease_df = pd.read_csv("./data/moralization_terms/diseases.csv")  # wikipedia
disease_cues = _split_terms(disease_df)

all_cues = set(epidemic_cues + disease_cues + list(disease_members))
# "cold" is excluded from the cue set. `discard` keeps the cell runnable even
# if the term is absent from the source files (`remove` would raise KeyError).
all_cues.discard("cold")
print(all_cues)

# Row masks over the full score table, computed once and reused below.
is_cue = ts_df.words.isin(all_cues)
is_relevance = ts_df.property == "previous_link"
is_polarity = ts_df.property == "polarity"

disease_ts_df = ts_df.loc[is_cue].reset_index(drop=True)

# Word-year means for the cue terms ...
mean_relevances = _mean_by_word_year(ts_df.loc[is_cue & is_relevance])
moral_polarities = _mean_by_word_year(ts_df.loc[is_cue & is_polarity])

# ... and for every other word, which later serves as the comparison pool.
all_df = _mean_by_word_year(ts_df.loc[~is_cue & is_relevance])
all_polarity_df = _mean_by_word_year(ts_df.loc[~is_cue & is_polarity])

{'flu', 'mers', 'london flu', 'lassa fever', 'tuberculosis', 'rubella', 'kuru', 'meningococcal', 'botulism', 'cryptococcosis', 'sars', 'cow disease', 'diabetes', 'spanish flu', 'glander', 'influenza a', 'influenza', 'glanders', 'sleeping sickness', 'intestinal capillariasis', 'tumour', 'ebola', 'hepatitis', 'marburg', 'meningitis', 'rabbit fever', 'cancer', 'cholera', 'malaria', 'parrot fever', 'typhoid', 'dengue fever', 'coronavirus', 'aspergillosis', 'enterovirus', 'poliomyelitis', 'trypanosomiasis', 'plague', 'pneumonic plague', 'hfmd', 'rabies', 'psittacosis', 'diphtheria', 'raccoon roundworm', 'polio', 'kalaazar', 'tularemia', 'bubonic plague', 'whooping cough', 'gae', 'scarlet fever', 'prion disease', 'sleepy sickness', 'qfever', 'anthrax', 'std', 'marburg virus', 'yellow fever', 'black fungus', 'granulomatous amoebic', 'smallpox', 'lockjaw', 'mumps', 'leptospirosis', 'measle', 'encephalitis lethargica', 'meningoencephalitis', 'tetanus', 'measles', 'septicemic plague', 'hong kong

In [4]:
def get_mean_values(all_df, mean_relevances, year_column, n_boot=1000, random_state=None):
    """Build a year-matched resampling distribution of mean ``outputs_z``.

    For every row of ``mean_relevances`` one row of ``all_df`` with the same
    year is drawn at random; the mean ``outputs_z`` of those draws is one
    resampled value. This is repeated ``n_boot`` times. Target rows whose year
    has no match in ``all_df`` are skipped.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Candidate pool; must have ``year`` and ``outputs_z`` columns. It is not
        modified.
    mean_relevances : pandas.DataFrame
        Target rows; only the ``year_column`` column is read.
    year_column : str
        Name of the column in ``mean_relevances`` holding the year to match.
    n_boot : int, default 1000
        Number of resampling rounds.
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None
        Seed or generator for reproducible draws. ``None`` (the default) uses
        NumPy's global random state.

    Returns
    -------
    mean_values : list of float
        One mean ``outputs_z`` per resampling round.
    new_df : pandas.DataFrame
        The rows drawn in the final round (empty when ``n_boot`` is 0). ``year``
        is not a column of this frame: it is used as the index during matching
        and the index is dropped on concatenation.

    Raises
    ------
    ValueError
        If no row of ``all_df`` matches any year in ``mean_relevances``.
    """
    # set_index returns a new frame, so the caller's all_df is left untouched.
    by_year = all_df.set_index("year")

    # The candidate rows for each target row never change between rounds, so
    # resolve them once instead of re-filtering inside every round.
    candidates_for_year = {}
    pools = []
    for year in mean_relevances[year_column]:
        if year not in candidates_for_year:
            candidates_for_year[year] = by_year.loc[by_year.index == year]
        candidates = candidates_for_year[year]
        if len(candidates) > 0:
            pools.append(candidates)

    if not pools:
        raise ValueError(
            f"no rows of all_df match any year in mean_relevances[{year_column!r}]"
        )

    # An integer seed must become ONE generator shared by all draws; passing the
    # raw integer to every .sample() call would repeat the same draw each time.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    mean_values = []
    new_df = pd.DataFrame()
    for b in range(n_boot):
        if b % 100 == 0:
            print(b)  # coarse progress indicator
        new_df = pd.concat(
            [pool.sample(1, random_state=rng) for pool in pools],
            ignore_index=True,
        )
        mean_values.append(new_df.outputs_z.mean())
    return mean_values, new_df

In [5]:
# Subset of disease terms used to restrict the word-year averages below.
disease_words_2 = [
    "anthrax", "cancer", "cholera", "diabetes", "diphtheria", "flu",
    "hepatitis", "hiv", "hivaids", "influenza", "leukemia", "malaria",
    "measles", "plague", "polio", "salmonella", "scarlet fever", "smallpox",
    "tuberculosis", "typhoid", "typhus", "yellow fever",
]

# Boolean masks are built from the current frames before either is narrowed.
relevance_in_subset = mean_relevances.words.isin(disease_words_2)
polarity_in_subset = moral_polarities.words.isin(disease_words_2)

# `mean_relevances` is narrowed in place (same name); the polarity subset gets
# its own name, so `moral_polarities` itself stays unfiltered.
mean_relevances = mean_relevances.loc[relevance_in_subset]
mean_pol = moral_polarities.loc[polarity_in_subset]

In [6]:
# Year-matched resampling distribution of mean relevance, drawn from the
# non-cue words in `all_df`, one draw per row of `mean_relevances`.
disease_random_mean_values, disease_random_df = get_mean_values(
    all_df=all_df, mean_relevances=mean_relevances, year_column="year"
)

0


100
200
300
400
500
600
700
800
900


In [7]:
# One-sided, one-sample t-test: are the resampled means below the observed
# mean relevance of the disease terms?
# NOTE(review): the sample here is the set of resampled means, so the degrees
# of freedom follow the number of resampling rounds rather than the number of
# words; confirm this is the intended test.
observed_relevance = mean_relevances.outputs_z.mean()
relevance_ttest = pg.ttest(
    disease_random_mean_values, observed_relevance, alternative="less"
)
print(relevance_ttest)

               T  dof alternative  p-val          CI95%   cohen-d BF10  power
T-test -75.32179  999        less    0.0  [-inf, -0.02]  2.381884  inf    1.0


In [8]:
# Year-matched resampling distribution of mean polarity, drawn from the
# non-cue words in `all_polarity_df`.
# NOTE(review): this cell uses the unfiltered `moral_polarities`, whereas the
# relevance analysis uses `mean_relevances` after it was restricted to
# `disease_words_2`. The matching filtered frame `mean_pol` exists but is not
# used here; confirm which one is intended before relying on this result.
disease_random_mean_polarity_values, disease_polarity_random_df = get_mean_values(
    all_df=all_polarity_df, mean_relevances=moral_polarities, year_column="year"
)

# One-sided, one-sample t-test: are the resampled means above the observed
# mean polarity?
observed_polarity = moral_polarities.outputs_z.mean()
polarity_ttest = pg.ttest(
    disease_random_mean_polarity_values, observed_polarity, alternative="greater"
)
print(polarity_ttest)

0


100
200
300
400
500
600
700
800
900
                 T  dof alternative  p-val        CI95%    cohen-d BF10  power
T-test  806.317419  999     greater    0.0  [-0.0, inf]  25.497996  inf    1.0
