In [3]:
###demonstration notebook for the article "Computing Danish Bible Translations: A Stylometric Analysis of Bibelen 2020 Compared with DO92"


In [6]:
#Load in the texts

import pandas as pd

from glob import glob

books = []


for book in glob("./data1_nopunct/books/*/*/*.txt"):
    book_type = book.split("/")[-2].strip()
    book_translator = book.split("/")[-3].strip()
    book_name = book.split("/")[-1].split(".")[0].replace(book_translator, "").strip()
    with open(book, "r") as f:
        book_content = f.read().replace("\n", " ").strip()
    books.append({
        "type": book_type,
        "name": book_name,
        "translator": book_translator,
        "content": book_content
    })



In [None]:
##Length and Vocabulary Analyses

In [7]:
## Total token count for Table 2



book_df = pd.DataFrame(books)

# Dataframe contains the number of tokens per book
book_df = book_df.assign(count_tokens = pd.DataFrame(books).content.apply(lambda x: len(x.split(" "))))

# Sum the total number of tokens per translator
print("===== 1992 total number of tokens =====")
print(book_df[book_df.translator == "1992"].count_tokens.sum())
print("===== 2020 total number of tokens =====")
print(book_df[book_df.translator == "2020"].count_tokens.sum())

# Difference in number of tokens per book
# If the value is positive, the 1992 translation has more tokens than the 2020 translation
# If the value is negative, the 2020 translation has more tokens than the 1992 translation
diff_tokens = pd.DataFrame(book_df[book_df.translator == "1992"].sort_values("name").count_tokens.values - book_df[book_df.translator == "2020"].sort_values("name").count_tokens.values)
diff_tokens = diff_tokens.assign(name = book_df[book_df.translator == "1992"].sort_values("name").name.values)
diff_tokens.columns = ["1992 translation - 2020 translation", "name"]

===== 1992 total number of tokens =====
665987
===== 2020 total number of tokens =====
652060


In [None]:
## Token counts for Table 3 and NT/OT Token counts in Table 2

book_df["token_count"] = book_df["content"].apply(lambda x: len(x.split()))

In [None]:
OT1992 = book_df[(book_df["type"]=="GT") & (book_df["translator"]=="1992")]["token_count"].sum()
NT1992 = book_df[(book_df["type"]=="NT") & (book_df["translator"]=="1992")]["token_count"].sum()
OT2020 = book_df[(book_df["type"]=="GT") & (book_df["translator"]=="2020")]["token_count"].sum()
NT2020 = book_df[(book_df["type"]=="NT") & (book_df["translator"]=="2020")]["token_count"].sum()

print("OT1992")
print("NT1992")
print("OT2020")
print("NT2020")
print(OT1992)
print(NT1992)
print(OT2020)
print(NT2020)

token_pivot = book_df.pivot(index="name", columns="translator", values="token_count")
token_pivot["diff_1992_minus_2020"] = token_pivot["1992"] - token_pivot["2020"]
token_pivot["diff_percent_1992_vs_2020"] = 100 * token_pivot["diff_1992_minus_2020"] / token_pivot["2020"]
token_pivot_sorted = token_pivot.sort_values(by="diff_1992_minus_2020", ascending=False)
print(token_pivot_sorted)
token_pivot_sorted.to_csv("token_counts_sorted.csv")




OT1992
NT1992
OT2020
NT2020
493479
169998
463743
188232
translator   1992   2020  diff_1992_minus_2020  diff_percent_1992_vs_2020
name                                                                     
26_Ezek     30664  24956                  5708                  22.872255
03_Lev      19765  15124                  4641                  30.686326
04_Num      22849  18252                  4597                  25.186281
02_Exod     26617  22641                  3976                  17.561062
05_Deut     26694  23245                  3449                  14.837599
...           ...    ...                   ...                        ...
18_Job      15717  17647                 -1930                 -10.936703
45_Rom       8941  10893                 -1952                 -17.919765
42_Luke     23962  25941                 -1979                  -7.628850
44_Acts     22513  24886                 -2373                  -9.535482
23_Isa      29399  32268                 -2869          

In [None]:
## Vocabulary Richness


In [None]:
# Perform the lemmatization using Danish Spacy model
# First, you need to install the large model of Danish Spacy, running
# python -m spacy download da_core_news_lg
# in the terminal
#restart the notebook after installing the model
# Run the notebook from the cell below to continue

In [3]:
import spacy
import pandas as pd
from glob import glob

books = []


for book in glob("./data1_nopunct/books/*/*/*.txt"):
    book_type = book.split("/")[-2].strip()
    book_translator = book.split("/")[-3].strip()
    book_name = book.split("/")[-1].split(".")[0].replace(book_translator, "").strip()
    with open(book, "r") as f:
        book_content = f.read().replace("\n", " ").strip()
    books.append({
        "type": book_type,
        "name": book_name,
        "translator": book_translator,
        "content": book_content
    })

book_df = pd.DataFrame(books)



nlp = spacy.load("da_core_news_lg")

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

book_df["lemmatized_content"] = book_df["content"].apply(lemmatize_text)

In [None]:
## Unique number of lemmas per book

In [4]:
book_df = book_df.assign(count_lemmas = book_df.lemmatized_content.apply(lambda x: len(set(x.split(" ")))))
print(book_df[["name", "translator", "count_lemmas"]])

         name translator  count_lemmas
0     21_Eccl       1992           820
1     26_Ezek       1992          2474
2     12_2Kgs       1992          1982
3      04_Num       1992          2085
4      39_Mal       1992           419
..        ...        ...           ...
127  63_2John       2020           135
128   41_Mark       2020          1509
129   55_2Tim       2020           506
130  62_1John       2020           344
131   57_Phlm       2020           186

[132 rows x 3 columns]


In [5]:
## Calculation for Table 5
# Unique number of lemmas across all the books
print("===== 1992 total number of lemmas =====")
print(len(set(" ".join(book_df[book_df.translator == "1992"].lemmatized_content).split(" "))))

print("===== 2020 total number of lemmas =====")
print(len(set(" ".join(book_df[book_df.translator == "2020"].lemmatized_content).split(" "))))

print("===")
print("Difference in unique lemmas between 1992 and 2020 over all the bible")
print(len(set(" ".join(book_df[book_df.translator == "1992"].lemmatized_content).split(" "))) - len(set(" ".join(book_df[book_df.translator == "2020"].lemmatized_content).split(" "))))

===== 1992 total number of lemmas =====
16194
===== 2020 total number of lemmas =====
14329
===
Difference in unique lemmas between 1992 and 2020 over all the bible
1865


In [None]:
#word types OT1992, NT1992, OT2020, NT2020
def all_unique_lemmas(series):
    # Collects all lemmas across the books
    lemma_set = set()
    for text in series:
        for lemma in text.split():
            lemma_set.add(lemma.lower())
    return len(lemma_set)

groups = {
    "GT1992":  book_df.query('type == "GT" and translator == "1992"')["lemmatized_content"],
    "NT1992":  book_df.query('type == "NT" and translator == "1992"')["lemmatized_content"],
    "GT2020":  book_df.query('type == "GT" and translator == "2020"')["lemmatized_content"],
    "NT2020":  book_df.query('type == "NT" and translator == "2020"')["lemmatized_content"],
}

for k, s in groups.items():
    print(f"{k}: {all_unique_lemmas(s)}")

GT1992: 13668
NT1992: 6274
GT2020: 12507
NT2020: 5503


In [8]:
#calculations for Table 6 and Table 7



OT1992 = book_df[(book_df["type"]=="GT") & (book_df["translator"]=="1992")]["count_lemmas"].sum()
NT1992 = book_df[(book_df["type"]=="NT") & (book_df["translator"]=="1992")]["count_lemmas"].sum()
OT2020 = book_df[(book_df["type"]=="GT") & (book_df["translator"]=="2020")]["count_lemmas"].sum()
NT2020 = book_df[(book_df["type"]=="NT") & (book_df["translator"]=="2020")]["count_lemmas"].sum()

print("OT1992")
print("NT1992")
print("OT2020")
print("NT2020")
print(OT1992)
print(NT1992)
print(OT2020)
print(NT2020)

token_pivot = book_df.pivot(index="name", columns="translator", values="count_lemmas")
token_pivot["diff_1992_minus_2020"] = token_pivot["1992"] - token_pivot["2020"]
token_pivot["diff_percent_1992_vs_2020"] = 100 * token_pivot["diff_1992_minus_2020"] / token_pivot["2020"]
token_pivot_sorted = token_pivot.sort_values(by="diff_1992_minus_2020", ascending=False)
# Rund procentkolonnen til 2 decimaler
token_pivot["diff_percent_1992_vs_2020"] = token_pivot["diff_percent_1992_vs_2020"].round(2)
print(token_pivot_sorted)
token_pivot_sorted.to_csv("lemma_counts_sorted.csv")



OT1992
NT1992
OT2020
NT2020
55813
22389
53296
21952
translator  1992  2020  diff_1992_minus_2020  diff_percent_1992_vs_2020
name                                                                   
05_Deut     2310  1985                   325                  16.372796
18_Job      2156  1897                   259                  13.653137
10_2Sam     1947  1743                   204                  11.703959
44_Acts     2260  2068                   192                   9.284333
11_1Kgs     2090  1900                   190                  10.000000
...          ...   ...                   ...                        ...
32_Jonah     310   361                   -51                 -14.127424
07_Judg     1760  1833                   -73                  -3.982542
66_Rev      1221  1310                   -89                  -6.793893
28_Hos       904   996                   -92                  -9.236948
17_Esth      758   855                   -97                 -11.345029

[66 rows x 

In [None]:
token_pivot_sorted_perc = token_pivot.sort_values(by="diff_percent_1992_vs_2020", ascending=False)
print(token_pivot_sorted_perc)
token_pivot_sorted_perc.to_csv("lemma_counts_sorted_perc.csv")