In [None]:
import json
import os
import pathlib
import random

import pandas as pd

from src.corpus import Corpus, FrequencyCorpus
from src.corpus_creation import document_retriever as dr
from src.load_data import load_files, load_reference_sample
from src.metrics import keyness

%load_ext autoreload
%autoreload 2

In [None]:
# Locations of the study corpus directory and the reference corpus JSON file.
# The defaults below are machine-specific; set the environment variables
# MOWIKO_CORPUSDIR / MOWIKO_REFERENCE_CORPUS to point the notebook at other
# data without editing this cell. An unset or empty variable falls back to
# the default.
CORPUSDIR = (
    os.environ.get('MOWIKO_CORPUSDIR')
    or '/home/brunobrocai/Data/MoWiKo/Paper-themKorp/full'
)
REFERENCE_CORPUS = (
    os.environ.get('MOWIKO_REFERENCE_CORPUS')
    or '/home/brunobrocai/Data/Reference/Leipzig-Corpora/reference_corpus_20-25.json'
)

# Fail early with an actionable message instead of letting the loaders run
# against a path that does not exist on this machine.
if not pathlib.Path(CORPUSDIR).is_dir():
    raise FileNotFoundError(
        f'Corpus directory not found: {CORPUSDIR!r} '
        '(set MOWIKO_CORPUSDIR to override the default)'
    )
if not pathlib.Path(REFERENCE_CORPUS).is_file():
    raise FileNotFoundError(
        f'Reference corpus file not found: {REFERENCE_CORPUS!r} '
        '(set MOWIKO_REFERENCE_CORPUS to override the default)'
    )

# Study corpus: documents plus their metadata, wrapped in a Corpus.
docs, metadata = load_files(CORPUSDIR)
corpus = Corpus(docs, metadata)

# Reference corpus: loaded from the JSON file and wrapped in a FrequencyCorpus.
reference_docs = load_reference_sample(REFERENCE_CORPUS)
reference_corpus = FrequencyCorpus(reference_docs)

In [None]:
# Terms that select the study documents: a single token and a two-token
# sequence (given as a tuple).
SEARCH_TERMS = [
    'KI',
    ('künstlich', 'Intelligenz'),
]

# Documents of `corpus` that contain the search terms at least `min` times.
hits = dr.match_wordlist(corpus, SEARCH_TERMS, min=1)

# Build the study corpus (a FrequencyCorpus) from the matched documents.
study_corpus = dr.corpus_from_found(
    hits,
    source_corpus=corpus,
    goal_corpus=FrequencyCorpus,
)

In [None]:
# Keyword list comparing the study corpus against the reference corpus,
# covering n-grams of length 1 and 2.
keyness_options = {
    'metric': 'percent_difference',
    'min_docs': 3,
    'smoothing': 0.5,
    'max_ngram_len': 2,
    'filter_stopwords': True,
}

keynesses = keyness.keyword_list(
    study_corpus, reference_corpus, **keyness_options
)

In [None]:
from copy import deepcopy

# Single-token spelling used for the bigram 'künstlich Intelligenz'.
merged_term = 'künstlich_Intelligenz'

# Snapshot of the corpus taken before the merge below.
# NOTE(review): `corpus_copy` is not used anywhere in the visible cells, and
# `treat_as_one` presumably rewrites `corpus` in place (its return value is
# discarded) — re-running earlier cells after this one may therefore see the
# merged token instead of the two original tokens. Confirm this is intended.
corpus_copy = deepcopy(corpus)

# From here on the two-token sequence is handled as one token in `corpus`.
corpus.treat_as_one(['künstlich', 'Intelligenz'], merged_term)

# The two base terms used by the following RQTR cells.
base_terms = (merged_term, 'KI')

In [None]:
from src.metrics import rqtr_lemma

# Baseline computed from the two base terms over the whole corpus.
# `b` is later passed to each value's `rqtrn(...)`; `core_term` is the second
# element returned by `qtr_baseline`.
b, core_term = rqtr_lemma.qtr_baseline(base_terms[0], base_terms[1], corpus)

In [None]:
# Per-term RQTR entries for the base terms over the corpus documents.
# Each entry exposes `.term`, `.term_count` and `.rqtrn(b)` (used below).
values = rqtr_lemma.rqtr_list(
    base_terms, corpus.documents, min_count=1, max_ngram_len=2,
)

In [None]:
from src.token_util import contains_alphab_tuple

# Score every entry exactly once: the same rqtrn value drives both the
# ordering and the 'rqtrn' column, so there is no need to call `rqtrn(b)`
# a second time per term.
scored_values = [(value, value.rqtrn(b)) for value in values]

# Highest rqtrn first. `list.sort` is stable, so entries with equal scores
# keep their original relative order (same as `sorted(..., reverse=True)`).
scored_values.sort(key=lambda pair: pair[1], reverse=True)

# The entries alone, in descending rqtrn order.
sorted_values = [value for value, _ in scored_values]

# One row per term: the term itself, its count, and its rqtrn score.
rqtrs = pd.DataFrame(
    [
        (value.term, value.term_count, score)
        for value, score in scored_values
    ],
    columns=['Word', 'count', 'rqtrn']
)

# Keep only values that contain alphabetic characters
rqtrs = rqtrs[rqtrs['Word'].apply(contains_alphab_tuple)]

In [None]:
# Combine the RQTR table and the keyness table into one dataframe, joined on
# the shared 'Word' column. The outer join keeps terms that appear in only
# one of the two tables (their missing values become NaN).
combined_df = pd.merge(
    left=rqtrs,
    right=keynesses,
    how='outer',
    on='Word',
)

In [None]:
# Discard rows with any missing value, then keep only the rows whose
# 'Keyness' and 'rqtrn' are both strictly positive.
combined_df = combined_df.dropna()
positive_on_both = (combined_df['Keyness'] > 0) & (combined_df['rqtrn'] > 0)
combined_df = combined_df[positive_on_both]
combined_df