## 🎓 Library

In [1]:
from googletrans import Translator
from tqdm import tqdm
import pandas as pd

# Directory that holds the parquet datasets. Paths are built by plain string
# concatenation (DATA + "file.parquet"), so the trailing slash is required.
DATA = "data/"


## Initial setup

We start with 71,570 German poems.

In [2]:
# Load the German poem corpus from the data directory.
poems_df = pd.read_parquet(f"{DATA}de_poems.parquet")

In [3]:
# Preview the text of the first three poems.
poems_df["text"].head(3)

0    Gebohrn, und wiederumb, o Mensch, gebohren wer...
1    Es suchte niemand Gott, liegt er gleich unterm...
2    Die ist in Gott, und Gott der ist zugleich in ...
Name: text, dtype: object

In [4]:
# Independent (deep) copy of the loaded poems; `poems_df` itself is left untouched.
new_poems_df = poems_df.copy(deep=True)

In [58]:
# Upper bound on the characters (texts plus one separator each) packed into a
# single translation request; a poem that exceeds it on its own is skipped.
# NOTE(review): presumably chosen to stay under the translation service's
# request size limit — confirm.
MAX_CHARS = 8500
# Delimiter used to join several poems into one request and to split the
# translated response back into individual poems.
SEPARATOR = "\t"

async def _translate_texts(translator, texts: list, src: str, dest: str) -> list:
    """
    Translates a list of texts, returning exactly one translation per input, in order.

    The texts are sent as one SEPARATOR-joined request when that is reversible.
    If the response cannot be split back into the same number of pieces, every
    text is translated in its own request instead, so rows never get misaligned.
    """
    async def translate_one(text: str) -> str:
        # Blank texts have nothing to translate; hand them back unchanged.
        if not text.strip():
            return text
        return (await translator.translate(text, src=src, dest=dest)).text

    # Joining is only reversible when no text already contains the separator.
    can_batch = len(texts) > 1 and not any(SEPARATOR in text for text in texts)
    if can_batch:
        joined_text = SEPARATOR.join(texts)
        translated = (await translator.translate(joined_text, src=src, dest=dest)).text
        split_texts = translated.split(SEPARATOR)
        if len(split_texts) == len(texts):
            return split_texts
        # The response lost or gained separators: pairing the pieces with the
        # poems would attach translations to the wrong rows, so fall through
        # and translate the poems one at a time.

    return [await translate_one(text) for text in texts]


async def _translate_pending(translator, pending_rows: list, src: str, dest: str) -> list:
    """Translates the buffered rows and returns one output record (dict) per row."""
    texts = [row["text"] for row in pending_rows]
    translations = await _translate_texts(translator, texts, src, dest)
    return [
        {
            "title": row["title"],
            "text": translated_text,
            "author": row["author"],
            "creation": row["creation"],
        }
        for row, translated_text in zip(pending_rows, translations)
    ]


async def translate_df(df: pd.DataFrame, src: str = "de", dest: str = "en") -> pd.DataFrame:
    """
    Translates the text of a DataFrame from src to dest language.

    Poems are packed into batches of at most MAX_CHARS characters (joined with
    SEPARATOR) to keep the number of translation requests low. Rows are skipped,
    with a printed message, when their text is not a string or is longer than
    MAX_CHARS on its own.

    Args:
        df (pd.DataFrame): DataFrame containing the text to be translated.
            Must provide the columns "title", "text", "author" and "creation".
        src (str): Source language code.
        dest (str): Destination language code.

    Returns:
        pd.DataFrame: DataFrame with the columns "title", "text", "author" and
        "creation", where "text" holds the translation. Rows keep the input
        order (minus skipped rows) and the index is a fresh 0..n-1 range.
    """
    translator = Translator()
    translated_rows = []

    pending = []      # rows waiting to be sent with the next request
    pending_len = 0   # characters (texts + separators) currently buffered

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Translating", colour="green"):
        text = row["text"]

        # Missing values (None / NaN) cannot be measured or translated.
        if not isinstance(text, str):
            print(f"Skipping poem {idx} (text is not a string: {type(text).__name__})")
            continue

        text_len = len(text) + len(SEPARATOR)

        # A poem that can never fit is skipped before touching the buffer, so
        # it does not force a needlessly small batch to be sent.
        if text_len > MAX_CHARS:
            print(f"Skipping poem {idx} (too long: {text_len} chars)")
            continue

        # If this text would overflow the buffer, translate the current batch first.
        if pending and pending_len + text_len > MAX_CHARS:
            translated_rows.extend(await _translate_pending(translator, pending, src, dest))
            pending = []
            pending_len = 0

        pending.append(row)
        pending_len += text_len

    # Handle the final, partially filled batch.
    if pending:
        translated_rows.extend(await _translate_pending(translator, pending, src, dest))

    # Explicit columns keep the schema stable even when nothing was translated.
    return pd.DataFrame(translated_rows, columns=["title", "text", "author", "creation"])

# Data merge (Generation)

Merge two datasets into one. This requires the translation of one of the datasets into the language of the other dataset.


## German to English

English poems do not have a creation date, but this will not matter for the generation training.

In [24]:
# Load the English poem corpus from the data directory.
english_poems = pd.read_parquet(f"{DATA}en_poems.parquet")

In [66]:
sample = poems_df.head(10000)
de_translated = await translate_df(sample, src="de", dest="en")

Translating:   0%|[32m          [0m| 0/10000 [00:00<?, ?it/s]

Translating:   2%|[32m▏         [0m| 164/10000 [00:01<01:13, 134.10it/s]

Skipping poem 157 (too long: 23508 chars)


Translating:   9%|[32m▉         [0m| 885/10000 [00:12<02:37, 57.79it/s] 

Skipping poem 871 (too long: 14655 chars)


Translating:  14%|[32m█▍        [0m| 1409/10000 [00:36<05:24, 26.46it/s]

Skipping poem 1408 (too long: 8803 chars)


Translating:  14%|[32m█▍        [0m| 1413/10000 [00:37<10:12, 14.03it/s]

Skipping poem 1411 (too long: 10370 chars)


Translating:  14%|[32m█▍        [0m| 1424/10000 [00:38<08:03, 17.74it/s]

Skipping poem 1423 (too long: 9714 chars)


Translating:  15%|[32m█▍        [0m| 1470/10000 [00:43<16:15,  8.75it/s]

Skipping poem 1468 (too long: 17859 chars)


Translating:  15%|[32m█▍        [0m| 1487/10000 [00:46<20:07,  7.05it/s]

Skipping poem 1486 (too long: 9083 chars)


Translating:  15%|[32m█▌        [0m| 1505/10000 [00:47<07:58, 17.76it/s]

Skipping poem 1498 (too long: 20027 chars)


Translating:  15%|[32m█▌        [0m| 1539/10000 [00:52<19:57,  7.07it/s]

Skipping poem 1537 (too long: 14440 chars)


Translating:  16%|[32m█▌        [0m| 1557/10000 [00:53<13:01, 10.80it/s]

Skipping poem 1555 (too long: 12715 chars)


Translating:  16%|[32m█▌        [0m| 1595/10000 [00:55<06:47, 20.64it/s]

Skipping poem 1589 (too long: 8650 chars)
Skipping poem 1590 (too long: 8581 chars)


Translating:  16%|[32m█▌        [0m| 1619/10000 [00:56<05:59, 23.31it/s]

Skipping poem 1617 (too long: 17574 chars)


Translating:  17%|[32m█▋        [0m| 1651/10000 [00:59<11:46, 11.81it/s]

Skipping poem 1650 (too long: 12667 chars)


Translating:  17%|[32m█▋        [0m| 1677/10000 [01:03<32:00,  4.33it/s]

Skipping poem 1675 (too long: 8971 chars)


Translating:  18%|[32m█▊        [0m| 1753/10000 [01:12<14:44,  9.33it/s]

Skipping poem 1748 (too long: 11759 chars)


Translating:  18%|[32m█▊        [0m| 1755/10000 [01:12<16:53,  8.13it/s]

Skipping poem 1753 (too long: 14541 chars)


Translating:  18%|[32m█▊        [0m| 1765/10000 [01:13<13:30, 10.17it/s]

Skipping poem 1764 (too long: 25584 chars)


Translating:  18%|[32m█▊        [0m| 1773/10000 [01:15<27:45,  4.94it/s]

Skipping poem 1771 (too long: 9041 chars)


Translating:  18%|[32m█▊        [0m| 1787/10000 [01:16<16:41,  8.20it/s]

Skipping poem 1786 (too long: 13611 chars)


Translating:  18%|[32m█▊        [0m| 1805/10000 [01:18<16:37,  8.21it/s]

Skipping poem 1801 (too long: 8773 chars)
Skipping poem 1804 (too long: 16589 chars)


Translating:  18%|[32m█▊        [0m| 1826/10000 [01:19<08:45, 15.55it/s]

Skipping poem 1822 (too long: 12006 chars)
Skipping poem 1825 (too long: 11833 chars)
Skipping poem 1826 (too long: 25068 chars)


Translating:  18%|[32m█▊        [0m| 1829/10000 [01:20<08:22, 16.27it/s]

Skipping poem 1828 (too long: 13839 chars)


Translating:  18%|[32m█▊        [0m| 1835/10000 [01:20<11:15, 12.08it/s]

Skipping poem 1835 (too long: 9873 chars)


Translating:  19%|[32m█▊        [0m| 1867/10000 [01:23<08:42, 15.56it/s]

Skipping poem 1866 (too long: 16469 chars)
Skipping poem 1867 (too long: 24163 chars)
Skipping poem 1868 (too long: 19512 chars)
Skipping poem 1869 (too long: 23122 chars)
Skipping poem 1870 (too long: 25193 chars)
Skipping poem 1871 (too long: 31001 chars)


Translating:  23%|[32m██▎       [0m| 2288/10000 [01:49<17:38,  7.29it/s]

Skipping poem 2287 (too long: 19369 chars)


Translating:  24%|[32m██▍       [0m| 2376/10000 [01:55<12:29, 10.17it/s]

Skipping poem 2375 (too long: 18343 chars)


Translating:  24%|[32m██▍       [0m| 2449/10000 [02:08<30:40,  4.10it/s]

Skipping poem 2448 (too long: 9671 chars)


Translating:  29%|[32m██▉       [0m| 2891/10000 [02:44<03:36, 32.88it/s]

Skipping poem 2890 (too long: 8741 chars)


Translating:  29%|[32m██▉       [0m| 2933/10000 [02:45<03:51, 30.55it/s]

Skipping poem 2931 (too long: 31111 chars)
Skipping poem 2932 (too long: 34158 chars)
Skipping poem 2933 (too long: 26583 chars)
Skipping poem 2934 (too long: 33443 chars)
Skipping poem 2935 (too long: 32220 chars)
Skipping poem 2936 (too long: 39776 chars)
Skipping poem 2937 (too long: 42972 chars)
Skipping poem 2938 (too long: 29773 chars)
Skipping poem 2939 (too long: 35199 chars)
Skipping poem 2940 (too long: 27302 chars)
Skipping poem 2941 (too long: 30130 chars)
Skipping poem 2942 (too long: 37159 chars)
Skipping poem 2943 (too long: 28102 chars)
Skipping poem 2944 (too long: 27473 chars)
Skipping poem 2945 (too long: 22787 chars)
Skipping poem 2946 (too long: 26306 chars)
Skipping poem 2947 (too long: 33550 chars)
Skipping poem 2948 (too long: 36764 chars)
Skipping poem 2949 (too long: 46349 chars)
Skipping poem 2950 (too long: 50908 chars)
Skipping poem 2951 (too long: 17665 chars)


Translating:  30%|[32m██▉       [0m| 2955/10000 [02:45<02:36, 45.03it/s]

Skipping poem 2955 (too long: 12926 chars)


Translating:  30%|[32m██▉       [0m| 2960/10000 [02:47<07:04, 16.57it/s]

Skipping poem 2959 (too long: 9746 chars)
Skipping poem 2960 (too long: 10319 chars)


Translating:  30%|[32m██▉       [0m| 2977/10000 [02:49<15:55,  7.35it/s]

Skipping poem 2973 (too long: 13660 chars)


Translating:  30%|[32m██▉       [0m| 2979/10000 [02:50<16:51,  6.94it/s]

Skipping poem 2978 (too long: 14142 chars)
Skipping poem 2979 (too long: 13231 chars)


Translating:  30%|[32m██▉       [0m| 2991/10000 [02:51<13:07,  8.90it/s]

Skipping poem 2989 (too long: 12435 chars)


Translating:  30%|[32m██▉       [0m| 2999/10000 [02:52<15:15,  7.65it/s]

Skipping poem 2998 (too long: 8733 chars)


Translating:  30%|[32m███       [0m| 3006/10000 [02:53<29:21,  3.97it/s]

Skipping poem 3005 (too long: 11038 chars)


Translating:  30%|[32m███       [0m| 3013/10000 [02:54<21:57,  5.30it/s]

Skipping poem 3012 (too long: 10681 chars)


Translating:  30%|[32m███       [0m| 3037/10000 [03:00<11:38,  9.97it/s]

Skipping poem 3031 (too long: 19236 chars)


Translating:  30%|[32m███       [0m| 3041/10000 [03:01<10:55, 10.62it/s]

Skipping poem 3037 (too long: 10052 chars)


Translating:  30%|[32m███       [0m| 3045/10000 [03:03<31:29,  3.68it/s]

Skipping poem 3044 (too long: 18153 chars)


Translating:  31%|[32m███       [0m| 3053/10000 [03:05<26:36,  4.35it/s]

Skipping poem 3052 (too long: 10744 chars)


Translating:  31%|[32m███       [0m| 3075/10000 [03:08<19:58,  5.78it/s]

Skipping poem 3072 (too long: 10073 chars)
Skipping poem 3074 (too long: 13574 chars)
Skipping poem 3075 (too long: 19510 chars)


Translating:  31%|[32m███       [0m| 3093/10000 [03:09<09:23, 12.25it/s]

Skipping poem 3088 (too long: 9266 chars)


Translating:  31%|[32m███       [0m| 3098/10000 [03:10<12:27,  9.23it/s]

Skipping poem 3094 (too long: 11568 chars)
Skipping poem 3097 (too long: 9881 chars)


Translating:  31%|[32m███       [0m| 3103/10000 [03:11<11:09, 10.30it/s]

Skipping poem 3100 (too long: 10721 chars)


Translating:  31%|[32m███       [0m| 3108/10000 [03:12<16:08,  7.12it/s]

Skipping poem 3107 (too long: 13255 chars)


Translating:  31%|[32m███       [0m| 3116/10000 [03:12<09:16, 12.38it/s]

Skipping poem 3110 (too long: 11074 chars)
Skipping poem 3115 (too long: 17360 chars)


Translating:  31%|[32m███       [0m| 3123/10000 [03:12<07:52, 14.57it/s]

Skipping poem 3119 (too long: 18101 chars)
Skipping poem 3122 (too long: 8525 chars)


Translating:  31%|[32m███▏      [0m| 3133/10000 [03:13<09:27, 12.11it/s]

Skipping poem 3129 (too long: 8746 chars)


Translating:  31%|[32m███▏      [0m| 3139/10000 [03:14<12:29,  9.16it/s]

Skipping poem 3136 (too long: 17523 chars)
Skipping poem 3138 (too long: 11678 chars)
Skipping poem 3139 (too long: 34525 chars)


Translating:  31%|[32m███▏      [0m| 3145/10000 [03:15<08:45, 13.05it/s]

Skipping poem 3141 (too long: 11904 chars)
Skipping poem 3144 (too long: 9245 chars)


Translating:  32%|[32m███▏      [0m| 3151/10000 [03:15<11:06, 10.28it/s]

Skipping poem 3148 (too long: 14638 chars)


Translating:  34%|[32m███▍      [0m| 3399/10000 [03:28<08:15, 13.33it/s]

Skipping poem 3396 (too long: 13407 chars)


Translating:  34%|[32m███▍      [0m| 3443/10000 [03:34<12:53,  8.47it/s]

Skipping poem 3439 (too long: 9614 chars)


Translating:  35%|[32m███▍      [0m| 3457/10000 [03:35<09:32, 11.42it/s]

Skipping poem 3451 (too long: 9043 chars)


Translating:  35%|[32m███▌      [0m| 3545/10000 [03:45<10:34, 10.17it/s]

Skipping poem 3544 (too long: 12830 chars)
Skipping poem 3545 (too long: 9496 chars)


Translating:  36%|[32m███▌      [0m| 3606/10000 [03:49<06:25, 16.60it/s]

Skipping poem 3605 (too long: 10075 chars)


Translating:  36%|[32m███▋      [0m| 3632/10000 [03:50<03:15, 32.56it/s]

Skipping poem 3631 (too long: 10182 chars)


Translating:  37%|[32m███▋      [0m| 3697/10000 [03:53<04:13, 24.85it/s]

Skipping poem 3696 (too long: 11396 chars)


Translating:  38%|[32m███▊      [0m| 3839/10000 [03:59<03:45, 27.36it/s]

Skipping poem 3835 (too long: 8978 chars)


Translating:  39%|[32m███▉      [0m| 3909/10000 [04:06<09:33, 10.63it/s]

Skipping poem 3902 (too long: 9444 chars)
Skipping poem 3903 (too long: 10810 chars)


Translating:  48%|[32m████▊     [0m| 4848/10000 [04:24<02:10, 39.43it/s]

Skipping poem 4842 (too long: 15144 chars)


Translating:  49%|[32m████▊     [0m| 4868/10000 [04:25<04:51, 17.59it/s]

Skipping poem 4867 (too long: 10494 chars)


Translating:  49%|[32m████▉     [0m| 4882/10000 [04:26<04:18, 19.78it/s]

Skipping poem 4880 (too long: 16012 chars)


Translating:  49%|[32m████▉     [0m| 4894/10000 [04:28<08:24, 10.11it/s]

Skipping poem 4893 (too long: 12289 chars)


Translating:  50%|[32m█████     [0m| 5022/10000 [04:39<15:12,  5.46it/s]

Skipping poem 5021 (too long: 10906 chars)


Translating:  52%|[32m█████▏    [0m| 5211/10000 [04:48<03:58, 20.09it/s]

Skipping poem 5209 (too long: 40546 chars)


Translating:  52%|[32m█████▏    [0m| 5222/10000 [04:50<09:23,  8.48it/s]

Skipping poem 5221 (too long: 26068 chars)


Translating:  57%|[32m█████▋    [0m| 5743/10000 [05:07<06:05, 11.64it/s]

Skipping poem 5742 (too long: 20767 chars)


Translating:  58%|[32m█████▊    [0m| 5754/10000 [05:08<05:08, 13.78it/s]

Skipping poem 5753 (too long: 12891 chars)


Translating:  58%|[32m█████▊    [0m| 5814/10000 [05:13<07:45,  8.98it/s]

Skipping poem 5813 (too long: 11725 chars)


Translating:  59%|[32m█████▊    [0m| 5851/10000 [05:14<02:53, 23.96it/s]

Skipping poem 5850 (too long: 12995 chars)


Translating:  59%|[32m█████▉    [0m| 5896/10000 [05:16<04:04, 16.76it/s]

Skipping poem 5895 (too long: 8844 chars)


Translating:  59%|[32m█████▉    [0m| 5909/10000 [05:17<02:45, 24.71it/s]

Skipping poem 5908 (too long: 18800 chars)


Translating:  59%|[32m█████▉    [0m| 5927/10000 [05:17<02:50, 23.95it/s]

Skipping poem 5926 (too long: 14147 chars)


Translating:  60%|[32m█████▉    [0m| 5999/10000 [05:22<03:07, 21.37it/s]

Skipping poem 5998 (too long: 8841 chars)


Translating:  70%|[32m███████   [0m| 7037/10000 [06:03<00:47, 62.78it/s] 

Skipping poem 7035 (too long: 8750 chars)


Translating:  88%|[32m████████▊ [0m| 8797/10000 [09:08<01:49, 11.01it/s]

Skipping poem 8796 (too long: 8926 chars)
Skipping poem 8797 (too long: 19904 chars)
Skipping poem 8798 (too long: 10106 chars)
Skipping poem 8799 (too long: 9404 chars)
Skipping poem 8800 (too long: 9038 chars)
Skipping poem 8801 (too long: 16666 chars)
Skipping poem 8802 (too long: 10729 chars)
Skipping poem 8803 (too long: 10003 chars)
Skipping poem 8804 (too long: 9245 chars)
Skipping poem 8805 (too long: 14444 chars)
Skipping poem 8806 (too long: 19683 chars)
Skipping poem 8807 (too long: 15605 chars)
Skipping poem 8808 (too long: 8969 chars)
Skipping poem 8809 (too long: 9054 chars)
Skipping poem 8810 (too long: 8646 chars)
Skipping poem 8811 (too long: 9834 chars)


Translating:  90%|[32m█████████ [0m| 9011/10000 [09:49<02:02,  8.06it/s]

Skipping poem 9010 (too long: 9141 chars)


Translating:  93%|[32m█████████▎[0m| 9263/10000 [10:22<01:59,  6.18it/s]

Skipping poem 9262 (too long: 9416 chars)


Translating:  93%|[32m█████████▎[0m| 9269/10000 [10:23<02:06,  5.79it/s]

Skipping poem 9268 (too long: 12179 chars)


Translating:  93%|[32m█████████▎[0m| 9331/10000 [10:38<02:45,  4.03it/s]

Skipping poem 9330 (too long: 10096 chars)


Translating:  94%|[32m█████████▍[0m| 9392/10000 [10:52<02:20,  4.32it/s]

Skipping poem 9391 (too long: 9233 chars)


Translating:  96%|[32m█████████▌[0m| 9569/10000 [11:52<01:41,  4.26it/s]

Skipping poem 9568 (too long: 15617 chars)


Translating:  96%|[32m█████████▌[0m| 9579/10000 [11:54<01:25,  4.91it/s]

Skipping poem 9578 (too long: 10433 chars)


Translating:  97%|[32m█████████▋[0m| 9687/10000 [12:18<01:08,  4.56it/s]

Skipping poem 9686 (too long: 13875 chars)


Translating:  97%|[32m█████████▋[0m| 9692/10000 [12:18<01:00,  5.13it/s]

Skipping poem 9691 (too long: 14070 chars)


Translating: 100%|[32m██████████[0m| 10000/10000 [13:26<00:00, 12.40it/s]


In [71]:
# Original German text of the first sampled poem.
sample["text"].iat[0]

'Gebohrn, und wiederumb, o Mensch, gebohren werden,\nErrettet dich vom Tod und hilfft dir in Beschwerden.'

In [70]:
# English translation stored under index label 0.
de_translated.loc[0, "text"]

'Drum, and again, o human, being born,\nSalvents you from death and help you in symptoms.'

### Saving

In [None]:
# Stack the English poems and the translated German poems row-wise,
# renumbering the index from 0.
new_poems_df = pd.concat(objs=[english_poems, de_translated], axis=0, ignore_index=True)

In [76]:
# Persist the merged dataset without writing the index.
new_poems_df.to_parquet(f"{DATA}en_de.parquet", index=False)

# Data augmentation (Classification)

Extend the original dataset with new samples generated through different methods. Most libraries, such as textattack and nlpaug, are not designed for German sentences.

Source: [Text data augmentations: Permutation, antonyms and negation](https://www.sciencedirect.com/science/article/abs/pii/S0957417421002104)

## Line permutation

Check the file `classification/utils.py` for the function `data_augment`. This function shuffles the lines of poems in the same century.


The model `classification/logistic_regression/tfidf.ipynb` can be trained with the augmented data.

## Translation to English, textattack augmentation, and back to German

Augmenting with textattack changes proper names and quantities.

In [None]:
# googletrans client reused by the per-poem translation loop that follows.
translator = Translator()

In [None]:
transformed_poems = []
for index, row in new_poems_df.iterrows():
    translated = await translator.translate(row["text"], src="de", dest="en")
    translated_row = {
        "title": row["title"],
        "text": translated.text,
        "author": row["author"],
        "creation": row["creation"]
    }
    transformed_poems.append(translated_row)

translated_df = pd.DataFrame(transformed_poems)

## GermaNet synonyms

GermaNet is a semantic network for German, similar to WordNet. It contains information about the meaning of words and their relationships to each other. It can be used to find semantic relations between words, such as synonyms and antonyms.

Sadly, the dataset is not open source, and authorization is required to use it.