In [1]:
import warnings

import numpy as np
import pandas as pd
import pingouin as pg

# Silence all warnings for the rest of the notebook.
# NOTE(review): this is a blanket filter, so library deprecation warnings and
# pandas copy/assignment warnings are hidden as well — consider narrowing it
# to the specific warning categories that are known to be noise.
warnings.filterwarnings("ignore")

In [26]:
# Load the COHA time series, average duplicate (word, year, property) rows,
# and expose the z-scored output under the additional name "property_z".
ts_df = (
    pd.read_csv("./data/SWOW_prediction/eval/time_series/ts_df.csv")
    .groupby(["words", "year", "property"])[["count", "outputs_z"]]
    .mean()
    .reset_index()
    .assign(property_z=lambda frame: frame["outputs_z"])
)

In [27]:
# Loading the sentiment dataframe (joined onto the time series by word and year)
sent_df = pd.read_csv("./data/SWOW_prediction/eval/coha_sentiments.csv")

In [28]:
# Attach the sentiment columns to every (word, year) row and keep complete rows only.
# Columns contributed by a previous run of this cell are dropped first, so the
# cell can be re-executed without producing "sentiments_x"/"sentiments_y" duplicates.
ts_df = (
    ts_df.drop(columns=sent_df.columns.difference(["words", "year"]), errors="ignore")
    .merge(sent_df, on=["words", "year"], how="left")
    .dropna()
)

In [29]:
def _read_term_list(csv_path):
    """Return the lower-cased, stripped terms listed in a CSV's ``Terms`` column.

    Every cell of ``Terms`` is treated as a comma-separated list; missing cells
    are skipped instead of raising.
    """
    term_cells = pd.read_csv(csv_path)["Terms"].dropna()
    return [term.lower().strip() for cell in term_cells for term in cell.split(",")]


def _mean_by_word_year(frame, property_name):
    """Average ``outputs_z``, ``count`` and ``sentiments`` per (word, year) for one property."""
    selected = frame.loc[frame.property == property_name]
    return (
        selected.groupby(["words", "year"])[["outputs_z", "count", "sentiments"]]
        .mean()
        .reset_index()
    )


def get_disease_dataframe(ts=None, min_count=50):
    """Split the time-series data into disease terms and non-disease controls.

    Disease terms are the union of the "disease" members of the category-norm
    dataset (production frequency > 1) and the terms listed in the epidemics
    and diseases CSV files; the ambiguous term "cold" is excluded.

    Args:
        ts (pd.DataFrame | None): Time-series rows with ``words``, ``year``,
            ``property``, ``count``, ``outputs_z`` and ``sentiments`` columns.
            Defaults to the notebook-level ``ts_df``.
        min_count (int): Minimum ``count`` a row needs to be kept.

    Returns:
        tuple: ``(all_cues, disease_ts_df, mean_relevances, moral_polarities,
        all_df, all_polarity_df)`` where

        * ``all_cues`` is the set of disease terms,
        * ``disease_ts_df`` holds the rows of ``ts`` for those terms,
        * ``mean_relevances`` / ``moral_polarities`` are the per-(word, year)
          means of the disease rows for the "previous_link" / "polarity" property,
        * ``all_df`` / ``all_polarity_df`` are the same aggregates for all
          remaining (non-disease) words.
    """
    if ts is None:
        ts = ts_df  # fall back to the dataframe built earlier in the notebook

    category_norms = pd.read_csv(
        "./data/Referential version_Item level data.csv",
    )  # diseases from category norm dataset
    category_norms = category_norms.loc[category_norms["prod.freq"] > 1]
    disease_members = category_norms.loc[category_norms.category == "disease"][
        "category.member"
    ].unique()

    # wikipedia list of epidemics
    epidemic_cues = _read_term_list("./data/moralization_terms/epidemics.csv")
    # wikipedia list of diseases
    disease_cues = _read_term_list("./data/moralization_terms/diseases.csv")

    all_cues = set(epidemic_cues + disease_cues + list(disease_members))  # All diseases
    # Removing ambiguous term 'cold'; discard() does not fail if it is absent.
    all_cues.discard("cold")

    frequent = ts.loc[ts["count"] >= min_count]
    is_disease = frequent.words.isin(all_cues)
    disease_ts_df = frequent.loc[is_disease].reset_index(drop=True)
    control_ts_df = frequent.loc[~is_disease]

    # Mean relevance of disease terms at each decade
    mean_relevances = _mean_by_word_year(disease_ts_df, "previous_link")
    # Mean moral polarity of disease terms at each decade
    moral_polarities = _mean_by_word_year(disease_ts_df, "polarity")

    # Control dataframes without disease terms (relevance and polarity)
    all_df = _mean_by_word_year(control_ts_df, "previous_link")
    all_polarity_df = _mean_by_word_year(control_ts_df, "polarity")

    return (
        all_cues,
        disease_ts_df,
        mean_relevances,
        moral_polarities,
        all_df,
        all_polarity_df,
    )

In [30]:
# Unpack, in order: the set of disease terms, their time-series rows, the
# per-(word, year) means of the disease rows for "previous_link" and "polarity",
# and the same two aggregates for the non-disease control words.
all_cues, disease_ts_df, mean_relevances, moral_polarities, all_df, all_polarity_df = (
    get_disease_dataframe()
)

In [31]:
mean_relevances.sample(4)  # (disease, year, moral relevance, count, sentiment)

Unnamed: 0,words,year,outputs_z,count,sentiments
127,typhoid,1930,1.611714,75.0,-0.135028
33,flu,1930,0.591219,57.0,-0.411268
98,polio,2000,1.10819,70.0,-0.015781
52,influenza,1980,1.391527,50.0,-0.175424


In [32]:
all_df.sample(4)  # (non-disease, year, moral relevance, count, sentiment)

Unnamed: 0,words,year,outputs_z,count,sentiments
30906,conscientious,1960,1.055703,166.0,0.001174
17465,branch,1960,-0.113129,1348.0,0.087313
17605,bravo,1970,-0.679645,57.0,0.089454
6372,anthony,1930,0.937651,813.0,0.074247


In [35]:
def get_mean_values_freq_controlled(
    all_df, mean_relevances, year_column, k=20, n_boot=1000, random_state=None
):
    """Bootstrap the mean ``outputs_z`` of sentiment-matched control words.

    For every row of ``mean_relevances`` the control rows of the same year are
    ranked by absolute sentiment difference and the ``k`` closest ones form a
    candidate pool. Each bootstrap iteration draws one control row per pool and
    records the mean ``outputs_z`` of the draw. Note that, despite the function
    name, the matching criterion is sentiment, not frequency.

    Args:
        all_df (pd.DataFrame): Control rows (non-disease terms) with ``year``,
            ``sentiments`` and ``outputs_z`` columns. It is not modified.
        mean_relevances (pd.DataFrame): Target rows (disease terms) with
            ``year_column`` and ``sentiments`` columns.
        year_column (str): Name of the year column in ``mean_relevances``; the
            control frame is always matched on its ``year`` column.
        k (int): Number of nearest-sentiment control rows in each candidate pool.
        n_boot (int): Number of bootstrap iterations.
        random_state (int | None): Seed for reproducible draws. ``None`` keeps
            pandas' default behaviour (NumPy's global random state).

    Returns:
        mean_values (list): Mean ``outputs_z`` of each bootstrap sample.
        new_df (pd.DataFrame): The control sample drawn in the last iteration.

    Raises:
        ValueError: If ``n_boot`` is smaller than 1, or if no control row shares
            a year with any row of ``mean_relevances``.
    """
    if n_boot < 1:
        raise ValueError("n_boot must be at least 1")

    # set_index returns a new frame, so the caller's dataframe stays untouched.
    controls_by_year = all_df.set_index("year")

    # A target row's candidate pool does not depend on the bootstrap iteration,
    # so all pools are built once instead of being filtered and sorted n_boot times.
    candidate_pools = []
    for _, row in mean_relevances.iterrows():
        same_year = controls_by_year.loc[controls_by_year.index == row[year_column]]
        if len(same_year) == 0:
            continue  # no control word available for this year; skip the row
        sent_diff = (same_year["sentiments"] - row["sentiments"]).abs()
        candidate_pools.append(
            same_year.assign(sent_diff=sent_diff).sort_values("sent_diff").head(k)
        )
    if not candidate_pools:
        raise ValueError(
            "no control rows share a year with any row of mean_relevances"
        )

    # One generator is shared by every draw so that a seed reproduces the whole run.
    rng = None if random_state is None else np.random.default_rng(random_state)

    mean_values = []
    for b in range(n_boot):
        if b % 100 == 0:
            print(b)  # coarse progress indicator
        new_df = pd.concat(
            [pool.sample(1, random_state=rng) for pool in candidate_pools],
            ignore_index=True,
        )
        mean_values.append(new_df.outputs_z.mean())
    return mean_values, new_df

In [36]:
# Bootstrap the control distribution for moral relevance: for each disease row,
# one of the k=5 same-year non-disease words closest in sentiment is drawn.
disease_random_mean_values, disease_random_df = get_mean_values_freq_controlled(
    all_df, mean_relevances, "year", k=5,
)

0
100
200
300
400
500
600
700
800
900


### Let's compare the moral relevance of disease terms with the bootstrapped mean moral relevance of non-disease terms

In [23]:
# One-sample t-test of the bootstrapped control means against the observed
# mean relevance of the disease terms.
# NOTE(review): the sample here is the set of bootstrap replicates, so the
# degrees of freedom (and therefore the p-value) grow with the number of
# replicates rather than with the amount of data; an empirical p-value (share
# of control means >= the disease mean) may be the safer summary — confirm.
print(
    pg.ttest(
        disease_random_mean_values, mean_relevances.outputs_z.mean(), alternative="less",
    ),
)  # diseases are more relevant than random terms

                 T  dof alternative  p-val          CI95%    cohen-d BF10  \
T-test -711.961375  999        less    0.0  [-inf, -0.02]  22.514195  inf   

        power  
T-test    1.0  


In [38]:
# Repeating the process for moral polarity
disease_random_mean_polarity_values, disease_polarity_random_df = (
    get_mean_values_freq_controlled(all_polarity_df, moral_polarities, "year", k=5)
)

0
100
200
300
400
500
600
700
800
900


In [39]:
# One-sample t-test of the bootstrapped control polarity means against the
# observed mean polarity of the disease terms.
# NOTE(review): as the sample is the set of bootstrap replicates, the p-value
# scales with the number of replicates; an empirical p-value (share of control
# means <= the disease mean) may be the safer summary — confirm.
print(
    pg.ttest(
        disease_random_mean_polarity_values,
        moral_polarities.outputs_z.mean(),
        alternative="greater",
    ),
)  # diseases are more morally negative than random terms

                 T  dof alternative  p-val         CI95%    cohen-d BF10  \
T-test  428.599484  999     greater    0.0  [-0.94, inf]  13.553506  inf   

        power  
T-test    1.0  
