In [3]:
### Demonstration notebook for the article "Computing Danish Bible Translations: A Stylometric Analysis of Bibelen 2020 Compared with DO92"


In [None]:
#Load in the texts

import pandas as pd

from glob import glob

books = []


for book in glob("../data1_nopunct/books/*/*/*.txt"):
    book_type = book.split("/")[-2].strip()
    book_translator = book.split("/")[-3].strip()
    book_name = book.split("/")[-1].split(".")[0].replace(book_translator, "").strip()
    with open(book, "r") as f:
        book_content = f.read().replace("\n", " ").strip()
    books.append({
        "type": book_type,
        "name": book_name,
        "translator": book_translator,
        "content": book_content
    })



In [None]:
##Length and Vocabulary Analyses

In [None]:
## Total token count for Table 2



book_df = pd.DataFrame(books)

# Number of tokens per book.
# str.split() without an argument splits on any run of whitespace, so
# consecutive spaces (e.g. left behind when newlines were replaced by spaces)
# do not create empty "tokens" that inflate the count, and an empty text
# counts as 0 tokens instead of 1.
book_df = book_df.assign(count_tokens=book_df["content"].apply(lambda text: len(text.split())))

# Sum the total number of tokens per translator
print("===== 1992 total number of tokens =====")
print(book_df[book_df.translator == "1992"].count_tokens.sum())
print("===== 2020 total number of tokens =====")
print(book_df[book_df.translator == "2020"].count_tokens.sum())

# Difference in number of tokens per book
# If the value is positive, the 1992 translation has more tokens than the 2020 translation
# If the value is negative, the 2020 translation has more tokens than the 1992 translation
# The two translations are aligned on the book name (one row per book, one
# column per translator) rather than by position, so a book that is missing
# from one translation yields NaN instead of silently shifting every
# following row.
tokens_by_name = book_df.pivot(index="name", columns="translator", values="count_tokens")
diff_tokens = pd.DataFrame({
    "1992 translation - 2020 translation": (tokens_by_name["1992"] - tokens_by_name["2020"]).values,
    "name": tokens_by_name.index.values,
})

In [None]:
## Token counts for Table 3 and NT/OT Token counts in Table 2





# Token totals per testament and translator.
# The per-book token column is named "count_tokens" (it is created in the
# Table 2 cell); the previous key "token_count" does not exist and raised a
# KeyError. Books of type "GT" are reported here under the OT labels.
OT1992 = book_df[(book_df["type"]=="GT") & (book_df["translator"]=="1992")]["count_tokens"].sum()
NT1992 = book_df[(book_df["type"]=="NT") & (book_df["translator"]=="1992")]["count_tokens"].sum()
OT2020 = book_df[(book_df["type"]=="GT") & (book_df["translator"]=="2020")]["count_tokens"].sum()
NT2020 = book_df[(book_df["type"]=="NT") & (book_df["translator"]=="2020")]["count_tokens"].sum()

# The four labels are printed first, followed by the four totals in the same order.
print("OT1992")
print("NT1992")
print("OT2020")
print("NT2020")
print(OT1992)
print(NT1992)
print(OT2020)
print(NT2020)

# One row per book, one column per translator, plus the absolute difference
# and the difference relative to the 2020 count (in percent).
token_pivot = book_df.pivot(index="name", columns="translator", values="count_tokens")
token_pivot["diff_1992_minus_2020"] = token_pivot["1992"] - token_pivot["2020"]
token_pivot["diff_percent_1992_vs_2020"] = 100 * token_pivot["diff_1992_minus_2020"] / token_pivot["2020"]
# Largest surplus of the 1992 translation first.
token_pivot_sorted = token_pivot.sort_values(by="diff_1992_minus_2020", ascending=False)
print(token_pivot_sorted)
token_pivot_sorted.to_csv("token_counts_sorted.csv")




In [None]:
## Vocabulary Richness

In [None]:
import spacy

# Start again from the raw book records.
book_df = pd.DataFrame(books)

# Lemmatisation relies on the large Danish spaCy pipeline, which has to be
# installed once beforehand by running the following in a terminal:
#     python -m spacy download da_core_news_lg
nlp = spacy.load("da_core_news_lg")


def lemmatize_text(text):
    """Run ``text`` through ``nlp`` and return the token lemmas joined by single spaces."""
    analysed = nlp(text)
    lemmas = [tok.lemma_ for tok in analysed]
    return " ".join(lemmas)


# Store the lemmatised version of every book next to its original content.
book_df["lemmatized_content"] = book_df["content"].apply(lemmatize_text)

In [None]:
## Number of unique lemmas per book

In [None]:
# Number of distinct lemmas in each book.
# split() without an argument splits on any run of whitespace, so consecutive
# spaces in the lemmatised text no longer add the empty string "" to the set
# as if it were a lemma (which split(" ") did).
# NOTE(review): the comparison is case-sensitive here, whereas
# all_unique_lemmas() lower-cases lemmas before counting - confirm this is intended.
book_df = book_df.assign(count_lemmas = book_df.lemmatized_content.apply(lambda text: len(set(text.split()))))
print(book_df[["name", "translator", "count_lemmas"]])

In [None]:
## Calculation for Table 5
# Number of unique lemmas across all the books of each translation.
# The lemma sets are built once per translation and reused. split() without an
# argument splits on any run of whitespace, so consecutive spaces in the
# lemmatised text do not add the empty string "" to the set as a spurious
# lemma (which split(" ") did).
# NOTE(review): lemmas are compared case-sensitively here, whereas
# all_unique_lemmas() lower-cases them - confirm this is intended.
lemmas_1992 = set(" ".join(book_df[book_df.translator == "1992"].lemmatized_content).split())
lemmas_2020 = set(" ".join(book_df[book_df.translator == "2020"].lemmatized_content).split())

print("===== 1992 total number of lemmas =====")
print(len(lemmas_1992))

print("===== 2020 total number of lemmas =====")
print(len(lemmas_2020))

print("===")
print("Difference in unique lemmas between 1992 and 2020 over all the bible")
# Positive: 1992 has more unique lemmas; negative: 2020 has more.
print(len(lemmas_1992) - len(lemmas_2020))

In [None]:
def all_unique_lemmas(series):
    """Return how many distinct lemmas occur across all texts in ``series``.

    Each text is split on whitespace and every lemma is lower-cased before it
    is counted, so the count is case-insensitive.
    """
    distinct = {word.lower() for passage in series for word in passage.split()}
    return len(distinct)

# Lemmatised texts grouped by book type ("GT"/"NT") and translator, keyed as
# "<type><translator>" in the order GT1992, NT1992, GT2020, NT2020.
groups = {}
for year in ("1992", "2020"):
    for testament in ("GT", "NT"):
        selected = (book_df["type"] == testament) & (book_df["translator"] == year)
        groups[testament + year] = book_df.loc[selected, "lemmatized_content"]

# Report the number of distinct (lower-cased) lemmas for each group.
for label, texts in groups.items():
    print(f"{label}: {all_unique_lemmas(texts)}")

In [None]:
# This cell used to repeat, verbatim, the definitions of all_unique_lemmas()
# and `groups` from the preceding cell. The redundant redefinitions are
# dropped so that each name has a single definition; the cell still prints
# the same per-group counts of distinct lemmas.
# NOTE(review): the output duplicates the preceding cell - consider deleting
# this cell altogether.
for k, s in groups.items():
    print(f"{k}: {all_unique_lemmas(s)}")

In [None]:
#calculations for Table 6 and Table 7



# Sum of the per-book unique-lemma counts for each testament and translator.
# Books of type "GT" are reported here under the OT labels.
# NOTE(review): this adds up per-book counts, so a lemma that occurs in
# several books is counted once per book - confirm this is the intended figure.
OT1992 = book_df[(book_df["type"]=="GT") & (book_df["translator"]=="1992")]["count_lemmas"].sum()
NT1992 = book_df[(book_df["type"]=="NT") & (book_df["translator"]=="1992")]["count_lemmas"].sum()
OT2020 = book_df[(book_df["type"]=="GT") & (book_df["translator"]=="2020")]["count_lemmas"].sum()
NT2020 = book_df[(book_df["type"]=="NT") & (book_df["translator"]=="2020")]["count_lemmas"].sum()

# The four labels are printed first, followed by the four totals in the same order.
print("OT1992")
print("NT1992")
print("OT2020")
print("NT2020")
print(OT1992)
print(NT1992)
print(OT2020)
print(NT2020)

# One row per book, one column per translator, plus the absolute difference
# and the difference relative to the 2020 count (in percent).
token_pivot = book_df.pivot(index="name", columns="translator", values="count_lemmas")
token_pivot["diff_1992_minus_2020"] = token_pivot["1992"] - token_pivot["2020"]
# Round the percentage column to 2 decimals BEFORE deriving the sorted frame.
# Previously the rounding happened after the sort, so the printed and exported
# table still contained the unrounded values.
token_pivot["diff_percent_1992_vs_2020"] = (
    100 * token_pivot["diff_1992_minus_2020"] / token_pivot["2020"]
).round(2)
# Largest surplus of the 1992 translation first.
token_pivot_sorted = token_pivot.sort_values(by="diff_1992_minus_2020", ascending=False)
print(token_pivot_sorted)
token_pivot_sorted.to_csv("lemma_counts_sorted.csv")



In [None]:
# Same per-book table, ordered by the relative difference (largest first),
# then printed and exported to CSV.
token_pivot_sorted_perc = token_pivot.sort_values(
    "diff_percent_1992_vs_2020",
    ascending=False,
)
print(token_pivot_sorted_perc)
token_pivot_sorted_perc.to_csv("lemma_counts_sorted_perc.csv")