# Replicate

With this notebook, you can replicate the numbers and figures from our paper.

Setup...

In [None]:
from src.corpus import Corpus
from src.metrics import keyness
from src.corpus_creation import document_retriever as dr
from src.corpus_creation import handle_wordlists as hw
from src.load_data import load_files
from src.metrics import rqtr_lemma
import pandas as pd


%load_ext autoreload
%autoreload 2

Loading the data...

Put the path to your corpus in the variable `CORPUSDIR`.

The code assumes that the corpus is a set of JSON files, each containing a list of lemmata under the key 'lemmas'.
If your data has a different format, you need to adjust the loading code accordingly. The result should be a list of lists of lemmata.

In [None]:
# Put the path to the directory containing the corpus files here.
# NOTE: this is a machine-specific absolute path; change it before running.
CORPUSDIR = '/home/brunobrocai/Data/MoWiKo/replicate/final_corpus'

# load_files returns the documents and their metadata; both are wrapped in a
# single Corpus object that all later cells work on.
docs, metadata = load_files(CORPUSDIR)
corpus = Corpus(docs, metadata)

We define two sets of core terms.

In [None]:
# Two alternative sets of core terms. Every term is a tuple of lemmas
# (presumably one element for a single word and two for a bigram — confirm
# against dr.match_wordlist).
# core_terms[0] feeds the main queries, core_terms[1] the "_BT2" variants.
core_terms = [
    [('künstlich', 'Intelligenz'), ('KI',)],
    [('künstlich', 'Intelligenz'), ('Roboter',), ('Chatbot',)],
]

With the two core term sets, we create two study/reference corpora.

In [None]:
# Maps the position of a core-term set in `core_terms` to the pair
# (study corpus, reference corpus) derived from it.
study_ref_corpora = {}
for i, core_term in enumerate(core_terms):
    # Look the core terms up in the full corpus (min=1).
    hits = dr.match_wordlist(corpus, core_term, min=1)

    # The study corpus is built from the found documents, the reference
    # corpus from the not-found ones; both as 'FrequencyCorpus'.
    study_corpus, reference_corpus = (
        build_corpus(
            hits,
            source_corpus=corpus,
            goal_corpus='FrequencyCorpus',
        )
        for build_corpus in (dr.corpus_from_found, dr.corpus_from_notfound)
    )
    study_ref_corpora[i] = (study_corpus, reference_corpus)

Next, we define some (admittedly long and inelegant) functions that calculate the keyness metrics, and then execute them.

In [None]:
# Collects every query under a method name; the cells below fill it and the
# evaluation cells iterate over it.
wordlists = {}

In [None]:
def keyness_query(study_corpus, reference_corpus, method, core_terms):
    """Build a wordlist from the strongest keywords of a study corpus.

    Keyness is computed for the study corpus against the reference corpus,
    terms at or below a method-specific cut-off are dropped, and the result
    is reduced with ``hw.top_x_with_core`` (called with 50, the 'Keyness'
    column and the core terms).

    Args:
        study_corpus: Corpus whose keywords are wanted.
        reference_corpus: Corpus to compare against.
        method: Keyness metric; 'odds_ratio' or 'log_likelihood_rayson'.
        core_terms: Core terms handed on to ``hw.top_x_with_core``.

    Returns:
        list: The values of the 'Term' column of the selected rows.

    Raises:
        ValueError: If ``method`` has no cut-off defined here.
    """
    # Minimum keyness a term must exceed, per metric.
    # NOTE(review): 15.13 is presumably the log-likelihood critical value for
    # p < 0.0001 — confirm against the paper.
    keyness_cutoffs = {
        'odds_ratio': 1.0,
        'log_likelihood_rayson': 15.13,
    }
    # Fail before the expensive keyword computation; previously an unknown
    # method only surfaced afterwards as an UnboundLocalError.
    if method not in keyness_cutoffs:
        raise ValueError(
            f'Unsupported keyness method {method!r}; '
            f'expected one of {sorted(keyness_cutoffs)}'
        )

    keynesses = keyness.keyword_list(
        study_corpus, reference_corpus,
        metric=method,
        min_docs=5,
        smoothing=0.0001,
        max_ngram_len=2,
        filter_stopwords=True,
    )
    filtered_df = keynesses[keynesses['Keyness'] > keyness_cutoffs[method]]

    filtered_df = hw.top_x_with_core(50, 'Keyness', filtered_df, core_terms)

    wordlist = filtered_df['Term'].tolist()
    return wordlist

def rqtr(
    full_corpus,
    study_corpus, reference_corpus,
    core_terms,
):
    """Build a wordlist from RQTRN scores, filtered by log-likelihood keyness.

    RQTRN values are derived from co-occurrence counts on the full corpus.
    Terms with a positive RQTRN are kept, annotated with their
    log-likelihood keyness (study vs. reference corpus), filtered to
    LL > 15.13 and finally reduced with ``hw.top_x_with_core`` (called with
    50, the 'RQTRN' column and the core terms).

    Args:
        full_corpus: Corpus used for the baseline and co-occurrence counts.
        study_corpus: Study corpus for the keyness computation.
        reference_corpus: Reference corpus for the keyness computation.
        core_terms: Core terms the co-occurrences are counted against.

    Returns:
        list: The values of the 'Term' column of the selected rows.
    """
    # Only the baseline is needed; the second return value is unused here.
    b, _ = rqtr_lemma.qtr_baseline(
        core_terms, full_corpus
    )
    cooccurence_values = rqtr_lemma.count_cooccurence(
        core_terms,
        full_corpus,
        max_ngram_len=2,
    )
    rqtrn_table = rqtr_lemma.cooccurence_to_metric(
        cooccurence_values,
        b,
        metric='rqtrn',
        min_docs=5,
    )
    keynesses = keyness.keyword_list(
        study_corpus, reference_corpus,
        metric='log_likelihood_rayson',
        min_docs=5,
        smoothing=0.0001,
        max_ngram_len=2,
        filter_stopwords=True,
    )

    # Explicit copy: a column is added below, and assigning onto a filtered
    # slice would raise pandas' SettingWithCopyWarning.
    filtered_df = rqtrn_table[rqtrn_table['RQTRN'] > 0].copy()

    # Add column LL (keyness looked up by term) and then filter on it.
    # Terms without a keyness value get NaN and are dropped by the comparison.
    keyness_by_term = keynesses.set_index('Term')['Keyness']
    filtered_df['LL'] = filtered_df['Term'].map(keyness_by_term)
    filtered_df = filtered_df[filtered_df['LL'] > 15.13]

    filtered_df = hw.top_x_with_core(50, 'RQTRN', filtered_df, core_terms)

    wordlist = filtered_df['Term'].tolist()

    return wordlist

# RQTR wordlists first: index 0 is the first core-term set (no suffix),
# index 1 the second one (suffix "_BT2").
for suffix, idx in (('', 0), ('_BT2', 1)):
    wordlists['rqtr' + suffix] = rqtr(
        corpus,
        *study_ref_corpora[idx],
        core_terms[idx],
    )

# Then the keyness-based wordlists, again once per core-term set.
for method in ['odds_ratio', 'log_likelihood_rayson']:
    for suffix, idx in (('', 0), ('_BT2', 1)):
        wordlists[method + suffix] = keyness_query(
            *study_ref_corpora[idx],
            method,
            core_terms[idx],
        )

The pre-defined queries we can just load from `custom_queries.py`.

In [None]:
from src.misc import custom_queries

# Register the hand-written queries under the names used in the evaluation.
wordlists.update({
    'Baseline': custom_queries.BASELINE,
    'Subjective (1)': custom_queries.SUBJECTIVE_1,
    'Subjective (2)': custom_queries.SUBJECTIVE_2,
    'LLM-Regex': custom_queries.KI_REGEX_LLM,
})

The collocations are also best loaded from a file because they take long to compute. If you want to replicate this part, head over to the `collocations.ipynb` notebook and run the code there.

In [None]:
# Counting the cooccurences takes a longer time, so we load precomputed values

def load_collocs_csv(
    filename,
    base_terms=None
):
    """Load precomputed collocations from a CSV file and turn them into a wordlist.

    Rows with a 'Doc_Freq' of 4 or less are discarded; the rest is reduced
    with ``hw.top_x_with_core`` (called with 50, the 'Stat' column and the
    base terms).

    Args:
        filename: Path to a CSV file with the columns 'Term', 'Stat' and
            'Doc_Freq'.
        base_terms: Core terms handed on to ``hw.top_x_with_core``. Defaults
            to the notebook's first core-term set, ``core_terms[0]``, which
            is what this function always used before the parameter was
            honoured.

    Returns:
        list: The values of the 'Term' column of the selected rows.
    """
    # The parameter used to be ignored (and had a mutable default); resolve
    # it here so that explicit arguments are actually respected.
    if base_terms is None:
        base_terms = core_terms[0]

    df = pd.read_csv(filename)
    df = df[df['Doc_Freq'] > 4]
    filtered_df = hw.top_x_with_core(50, 'Stat', df, base_terms)
    wordlist = filtered_df['Term'].tolist()
    return wordlist


# One wordlist per precomputed collocation table.
# NOTE(review): the last file name is spelled "nmpi", not "npmi" — confirm
# that this matches the file on disk.
for colloc_name, colloc_csv in (
    ('coll-Absatz-logdice', 'wordlists/collocations/windowsizeParagraph-logdice.csv'),
    ('coll-Absatz-npmi', 'wordlists/collocations/windowsizeParagraph-npmi.csv'),
    ('coll-5-logdice', 'wordlists/collocations/windowsize5-logdice.csv'),
    ('coll-5-npmi', 'wordlists/collocations/windowsize5-nmpi.csv'),
):
    wordlists[colloc_name] = load_collocs_csv(colloc_csv)

Let's save the resulting wordlists so that we can inspect them later.

In [None]:
import os
# Save the wordlists to files, one term per line.

GOALDIR = 'wordlists/queries/'

# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs(GOALDIR, exist_ok=True)

for name, wordlist in wordlists.items():
    # A query stored as a single string is written as one line instead of
    # being iterated character by character.
    if isinstance(wordlist, str):
        wordlist = [wordlist]
    filename = GOALDIR + name + '.txt'
    # Never overwrite an existing file, so earlier results stay inspectable.
    if os.path.exists(filename):
        print(f'File {filename} already exists. Skipping.')
        continue
    # Explicit UTF-8: the terms contain non-ASCII characters (e.g. 'künstlich')
    # and the platform default encoding is not UTF-8 everywhere.
    with open(filename, 'w', encoding='utf-8') as f:
        for term in wordlist:
            f.write(str(term) + '\n')
    print(f'Wordlist {name} saved to {GOALDIR + name}.txt')

Next, define a function that evaluates the queries at different absolute frequency thresholds and run it.

Looking at thresholds from 1 to 30 is enough to see all relevant phenomena.

In [None]:
import re

def results_for_methods(
    wordlists: dict, corpus: Corpus, topics: list,
    min_min: int = 1, max_min: int = 30,
):
    """Evaluate every query over a range of absolute frequency thresholds.

    Each query is matched against the corpus and the hits are scored with
    ``dr.eval_min`` against the 'gold_label' annotation.

    Args:
        wordlists: Maps a method name to its query.
        corpus: Corpus to search and evaluate on.
        topics: Topic labels passed to ``dr.eval_min`` as ``topic``.
        min_min: Passed to ``dr.eval_min`` as ``min_min``.
        max_min: Passed to ``dr.eval_min`` as ``max_min``.

    Returns:
        dict: Method name mapped to the result of ``dr.eval_min``.
    """
    results = {}
    for method, wordlist in wordlists.items():
        # Pick the matcher and its extra options by method name.
        if method == 'LLM-Regex':
            matcher, extra_options = dr.match_regex, {}
        elif 'Subjective' in method:
            # Subjective queries are matched unescaped and case-insensitively.
            matcher = dr.match_wordlist
            extra_options = {'escape': False, 'flags': re.IGNORECASE}
        else:
            matcher, extra_options = dr.match_wordlist, {}

        hits = matcher(corpus, wordlist, min=1, **extra_options)
        results[method] = dr.eval_min(
            corpus, hits, 'gold_label',
            min_min=min_min,
            max_min=max_min,
            topic=topics,
        )
    return results

# Evaluation against the main-topic label only.
results_maintpc = results_for_methods(
    wordlists=wordlists,
    corpus=corpus,
    topics=['1_hauptthema'],
    min_min=1,
    max_min=25,
)
# Evaluation against the main-topic and side-topic labels together.
results_sidetpc = results_for_methods(
    wordlists=wordlists,
    corpus=corpus,
    topics=['1_hauptthema', '2_nebenthema'],
    min_min=1,
    max_min=25,
)

Next, define a function to plot the results as a line graph and run it for main and side topic queries.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def print_plot(
    results: dict, skip_methods: list = None,
    xlabel: str = 'Aggregate Frequency Threshold',
    ylabel: str = 'F1 Score',
):
    """Plot the F1 score of each method against the threshold as a line graph.

    Args:
        results: Maps a method name to a dict that maps each threshold to a
            score dict containing the key 'f1-score'.
        skip_methods: Method names to leave out of the plot. ``None`` (the
            default) plots every method.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
    """
    # Translate some method names to more readable names
    metric_to_name = {
        'odds_ratio': 'Odds Ratio',
        'log_likelihood_rayson': 'Log Likelihood',
        'rqtr': 'RQTR',
        'Baseline': 'Baseline',
        'Subjective (2)': 'Subjective (2)',
        'coll-Absatz-logdice': 'LogDice (Paragraph)',
        'coll-Absatz-npmi': 'nPMI (Paragraph)',
    }
    # The default None used to crash in the membership test below.
    if skip_methods is None:
        skip_methods = []

    plt.figure(figsize=(12, 8))

    sns.set_style("whitegrid")
    palette = sns.color_palette("tab10")
    line_styles = ['-', '--', '-.', ':']
    marker_styles = ['o', 's', '^']

    results = {
        method: result for method, result in results.items()
        if method not in skip_methods
    }

    for idx, (method, result) in enumerate(results.items()):
        i_values = list(result.keys())
        f1_values = [result[i]['f1-score'] for i in i_values]
        plt.plot(
            i_values, f1_values,
            label=f'{metric_to_name.get(method, method)}',
            # Cycle through the palette like the line and marker styles, so
            # more methods than colours no longer raise an IndexError.
            color=palette[idx % len(palette)],
            linestyle=line_styles[idx % len(line_styles)],
            linewidth=3,
            marker=marker_styles[idx % len(marker_styles)],
            markersize=8
        )

    plt.xlabel(xlabel, fontsize=20)
    plt.ylabel(ylabel, fontsize=20)
    plt.legend(fontsize=18, frameon=True)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

    plt.tight_layout()
    plt.show()


# Methods that are evaluated but left out of the figures.
do_not_plot = [
    'Subjective (1)',
    'coll-5-logdice', 'coll-5-npmi',
    'odds_ratio_BT2', 'log_likelihood_rayson_BT2', 'rqtr_BT2',
    'LLM-Regex',
]

# First figure: main topic only; second figure: main and side topic.
for absolute_results in (results_maintpc, results_sidetpc):
    print_plot(absolute_results, skip_methods=do_not_plot)

Same as above (test for different thresholds and then visualize) but for PMW.

In [None]:
def results_for_methods_pmw(
    wordlists: dict, corpus: Corpus, topics: list,
    min_min: int = 500, max_min: int = 5000,
    step: int = 500,
):
    """Evaluate every query over a range of per-million-word thresholds.

    Each query is matched against the corpus with the ``*_pmw`` matchers and
    the hits are scored with ``dr.eval_min_pmw`` against the 'gold_label'
    annotation.

    Args:
        wordlists: Maps a method name to its query.
        corpus: Corpus to search and evaluate on.
        topics: Topic labels passed to ``dr.eval_min_pmw`` as ``topic``.
        min_min: Passed to ``dr.eval_min_pmw`` as ``min_min``.
        max_min: Passed to ``dr.eval_min_pmw`` as ``max_min``.
        step: Passed to ``dr.eval_min_pmw`` as ``steps``.

    Returns:
        dict: Method name mapped to the result of ``dr.eval_min_pmw``.
    """
    results = {}
    for method, wordlist in wordlists.items():
        # Pick the matcher and its extra options by method name.
        if method == 'LLM-Regex':
            matcher, extra_options = dr.match_regex_pmw, {}
        elif 'Subjective' in method:
            # Subjective queries are matched unescaped and case-insensitively.
            matcher = dr.match_wordlist_pmw
            extra_options = {'escape': False, 'flags': re.IGNORECASE}
        else:
            matcher, extra_options = dr.match_wordlist_pmw, {}

        hits = matcher(corpus, wordlist, min_pmw=1, **extra_options)
        results[method] = dr.eval_min_pmw(
            corpus, hits, 'gold_label',
            min_min=min_min,
            max_min=max_min,
            steps=step,
            topic=topics,
        )

    return results

# PMW evaluation against the main-topic label only.
results_pmw_maintpc = results_for_methods_pmw(
    wordlists=wordlists,
    corpus=corpus,
    topics=['1_hauptthema'],
    min_min=1000,
    max_min=40001,
    step=1000,
)
# PMW evaluation against the main-topic and side-topic labels together.
results_pmw_sidetpc = results_for_methods_pmw(
    wordlists=wordlists,
    corpus=corpus,
    topics=['1_hauptthema', '2_nebenthema'],
    min_min=1000,
    max_min=40001,
    step=1000,
)

In [None]:
# Same two figures as for the absolute thresholds, now on the PMW scale.
for pmw_results in (results_pmw_maintpc, results_pmw_sidetpc):
    print_plot(
        pmw_results,
        skip_methods=do_not_plot,
        xlabel='PMW Frequency Threshold',
    )


Look at the results in a table...

In [None]:
import pandas as pd


def create_results_df(
    results_maintpc, results_pmw_maintpc, min_thresholds=(1, 3, 5), pmw_value=10000
):
    """Collect precision, recall and F1 of every method in one table.

    The table has one row per method and two column levels: the first level
    is 'min <threshold>' for every absolute threshold followed by
    'pmw <pmw_value>', the second level is 'precision', 'recall' and
    'f1-score'.

    Args:
        results_maintpc: Maps a method name to a dict of
            ``threshold -> scores`` for the absolute thresholds.
        results_pmw_maintpc: Same structure for the PMW thresholds; must
            contain every method of ``results_maintpc``.
        min_thresholds: Absolute thresholds to include (any iterable).
        pmw_value: The single PMW threshold to include.

    Returns:
        pandas.DataFrame: The table described above, indexed by method name.

    Raises:
        KeyError: If a method, threshold or score is missing from the inputs.
    """
    thresholds = list(min_thresholds)
    score_names = ['precision', 'recall', 'f1-score']
    columns = pd.MultiIndex.from_product([
        [f'min {thresh}' for thresh in thresholds] + [f'pmw {pmw_value}'],
        score_names,
    ])

    # Gather all rows first and build the frame once, instead of enlarging
    # an empty frame row by row.
    methods = []
    rows = []
    for method, evals in results_maintpc.items():
        score_groups = [evals[thresh] for thresh in thresholds]
        score_groups.append(results_pmw_maintpc[method][pmw_value])
        rows.append([
            scores[score_name]
            for scores in score_groups
            for score_name in score_names
        ])
        methods.append(method)

    return pd.DataFrame(rows, index=methods, columns=columns)


# Order in which the methods should appear in the table.
row_order = [
    'Baseline',
    'Subjective (1)',
    'Subjective (2)',
    'coll-5-logdice',
    'coll-5-npmi',
    'coll-Absatz-logdice',
    'coll-Absatz-npmi',
    'log_likelihood_rayson',
    'log_likelihood_rayson_BT2',
    'odds_ratio',
    'odds_ratio_BT2',
    'rqtr',
    'rqtr_BT2',
    'LLM-Regex',
]

# Build the table, round to three decimals and sort the rows by row_order.
df = (
    create_results_df(
        results_maintpc,
        results_pmw_maintpc,
        min_thresholds=[1, 5],
        pmw_value=9000,
    )
    .round(3)
    .reindex(row_order)
)
df

We can also print the results as comma-separated values, so that they can be copied and pasted into a spreadsheet (e.g. Excel or Google Sheets).

In [None]:
# Collapse the two column levels into single "<group>_<score>" names and
# turn the method index into an ordinary column called 'method'.
flat_df = df.copy()
flat_df.columns = ['{}_{}'.format(col[0], col[1]) for col in df.columns]
flat_df = flat_df.reset_index()
flat_df = flat_df.rename(columns={'index': 'method'})

print(flat_df.to_csv(index=False))