In [1]:
from transformers import pipeline, AutoTokenizer
import pandas as pd
import json
from random import Random
from torch.utils.data import Dataset
from tqdm import tqdm
import transformers
import traceback
from Annotation_forced import do_annotation
from copy import deepcopy
import itertools
from itertools import product

# Restrict transformers log output to errors for the rest of the notebook.
transformers.logging.set_verbosity_error()

# Number of accepted items to collect per condition; the generation loop keeps
# drawing prompts until this many annotated rows match the forced referent.
ITEMS_PER_CONDITION = 1



In [15]:
class PromptDataset(Dataset):
    """Thin ``Dataset`` wrapper around an indexable collection of prompts."""

    def __init__(self, prompts):
        """Keep a reference to ``prompts`` (anything supporting ``len`` and indexing)."""
        self.prompts = prompts

    def __len__(self):
        """Return how many prompts are stored."""
        return len(self.prompts)

    def __getitem__(self, idx):
        """Return the prompt stored at position ``idx``."""
        return self.prompts[idx]
        
# Models to evaluate, one tuple per model:
# (model name, batch size, device, device map, is_sentencepiece)
models = [
    ("stefan-it/german-gpt2-larger", 1, 0, None, False),
]

# Name inventory; the "male" and "female" entries are used as NP fillers.
with open("../items/names.json", encoding="utf-8") as names_file:
    namedict = json.load(names_file)
male_names = list(namedict["male"])
female_names = list(namedict["female"])

# Verb entries, grouped under the keys "es" and "se".
with open("../items/verbs_forced_reference.json", encoding="utf-8") as verbs_file:
    verbdict = json.load(verbs_file)
es_verbs, se_verbs = verbdict["es"], verbdict["se"]

# All mixed-gender (NP1, NP2, NP1-is-female) triples, in the same order that
# itertools.product would yield them, then shuffled with fixed seeds.
male_pairing = [(np1, np2, False) for np1 in male_names for np2 in female_names]
female_pairing = [(np1, np2, True) for np1 in female_names for np2 in male_names]
Random(42).shuffle(male_pairing)
Random(84).shuffle(female_pairing)

# Active conditions: (condition id, verb list, name pairing, forced referent).
# Commented-out entries are the remaining conditions, currently disabled.
conditions = [
    (2,  es_verbs, female_pairing, "NP1"),
    # (3,  es_verbs,   male_pairing, "NP1"),
    # (6,  se_verbs, female_pairing, "NP1"),
    # (7,  se_verbs,   male_pairing, "NP1"),
    # (10, es_verbs, female_pairing, "NP2"),
    # (11, es_verbs,   male_pairing, "NP2"),
    # (14, se_verbs, female_pairing, "NP2"),
    # (15, se_verbs,   male_pairing, "NP2")
]

In [3]:

# One shuffled list of item dicts per entry in `conditions`.
items_per_condition = []

for condition, verbs, pairing, forced_reference in conditions:
    rows = []
    # The loop variable is deliberately NOT called `verbdict`: that name holds
    # the module-level dict loaded from verbs_forced_reference.json, and
    # rebinding it here would silently replace it with the last verb entry.
    for verb_entry in verbs:
        verb = verb_entry["verb"]
        filler = verb_entry["filler"]
        verbclass = verb_entry["verbclass"]
        for np1, np2, female in pairing:
            # Key order matters: downstream code converts rows to plain lists
            # and re-labels them with a fixed column list.
            rows.append({
                "condition": condition,
                "type": "Experiment",
                "prompt": f"{np1} {verb} {np2}{filler}, weil",
                "NP1": np1,
                "NP2": np2,
                "NP1gender": "f" if female else "m",
                "verb": verb,
                "verbclass": verbclass,
                "forced": forced_reference,
            })
    # Fixed seed so the item order is reproducible across runs.
    Random(168).shuffle(rows)
    items_per_condition.append(rows)

In [4]:
def make_constraint_function(tokenizer, female, is_sentencepiece):
    """Build a ``prefix_allowed_tokens_fn`` forcing the referent after ", weil".

    Directly after the two tokens that end ``", weil"`` only the first token of
    one of the allowed forms (a name of the requested gender or one of the
    pronouns) may be generated. If that first token starts a multi-token form,
    the following step is restricted to the matching second tokens. Everywhere
    else the full vocabulary is allowed. Only the first two tokens of each
    form are constrained.

    Args:
        tokenizer: object with ``encode(str) -> list[int]`` and ``vocab_size``.
        female: if true use ``female_names`` and feminine pronouns, otherwise
            ``male_names`` and masculine pronouns.
        is_sentencepiece: if true, forms are encoded without a leading space
            and the first id of every encoding is dropped (presumably a
            special start token - verify against the tokenizer in use).

    Returns:
        A function ``(batch_id, input_tokens) -> list[int]`` of allowed ids.
    """
    weil = tokenizer.encode(", weil")[-2:]
    names = female_names if female else male_names
    # "jene" is the feminine counterpart of "jener" (was misspelled "jense").
    pronouns = ["sie", "diese", "jene"] if female else ["er", "dieser", "jener"]
    if is_sentencepiece:
        sequences = [tokenizer.encode(form)[1:] for form in names + pronouns]
    else:
        sequences = [tokenizer.encode(" " + form) for form in names + pronouns]
    # Guard against forms that encode to nothing after stripping.
    sequences = [seq for seq in sequences if seq]

    # Distinct first tokens, in first-seen order.
    first_tokens = list(dict.fromkeys(seq[0] for seq in sequences))
    # first token -> every second token that may follow it. Several forms can
    # share a first token, so all their continuations must stay reachable.
    second_tokens = {}
    for seq in sequences:
        if len(seq) > 1:
            allowed = second_tokens.setdefault(seq[0], [])
            if seq[1] not in allowed:
                allowed.append(seq[1])
    all_tokens = list(range(tokenizer.vocab_size))

    def constrainer(batch_id, input_tokens):
        length = len(input_tokens)
        # Length checks keep short inputs from raising IndexError.
        if length >= 2 and input_tokens[-2] == weil[-2] and input_tokens[-1] == weil[-1]:
            return first_tokens
        if length >= 3 and input_tokens[-3] == weil[-2] and input_tokens[-2] == weil[-1]:
            # int() so tensor elements can be used as dict keys.
            continuation = second_tokens.get(int(input_tokens[-1]))
            if continuation:
                return continuation
        return all_tokens

    return constrainer

In [20]:
# For every model: generate constrained continuations per condition until
# ITEMS_PER_CONDITION annotated rows match the forced referent, then write all
# accepted rows to one CSV per model.
# NOTE(review): `device` and `device_map` are unpacked but not used in this cell.
for model_name, batch_size, device, device_map, is_sentencepiece in models:
    # Load the pipeline here so the cell also works on a fresh kernel instead
    # of depending on a `model` left over from an earlier execution.
    print(f"now loading: {model_name}")
    model = pipeline("text-generation", model = model_name)
    model.tokenizer.pad_token_id = model.model.config.eos_token_id
    model.tokenizer.padding_side = "left"

    data = []
    for condition in items_per_condition:
        if not condition:
            # Nothing to generate for an empty condition.
            continue
        condition_id = condition[0]["condition"]
        forced = condition[0]["forced"]
        np1_gender = condition[0]["NP1gender"]
        # Pairings are mixed-gender, so the forced referent is female when NP1
        # is forced and female, or when NP2 is forced and NP1 is male.
        forced_is_female = ((forced == "NP1") and (np1_gender == "f")) or ((forced == "NP2") and (np1_gender == "m"))
        constraint_function = make_constraint_function(model.tokenizer, forced_is_female, is_sentencepiece)

        items = deepcopy(condition)
        bar = tqdm(total = ITEMS_PER_CONDITION)
        bar.set_description(f"Condition {condition_id}")
        result = []
        counter = 0  # number of prompts actually sent to the model
        while bar.n < ITEMS_PER_CONDITION:
            if len(items) < batch_size:
                print(f"Run out of data in condition {condition_id}")
                break
            rows = pd.DataFrame(items[:batch_size])
            items = items[batch_size:]
            prompts = rows["prompt"].to_list()
            counter += len(prompts)
            continuations = model(
                prompts,
                batch_size=batch_size,
                remove_invalid_values=True,
                early_stopping = True,
                prefix_allowed_tokens_fn=constraint_function,
                do_sample = False,
                diversity_penalty = .6,
                num_beam_groups = 5,
                num_beams = 10,
                num_return_sequences=10,
                max_new_tokens = 5,
            )
            # Kept at module level because a later cell displays `cs`.
            cs = continuations
            # Only the first returned sequence per prompt is used.
            continuations = [cont[0]["generated_text"] for cont in continuations]
            # Drop the prompt plus the single separating character.
            continuations = [text[len(prompt) + 1:] for prompt, text in zip(prompts, continuations)]
            rows["cont"] = continuations
            res = do_annotation(rows, True)
            # Keep only rows whose annotated coreference is the forced one.
            res = res[res["Koreferenz"] == res["forced"]]
            result += res.values.tolist()
            bar.update(len(res))
        bar.close()
        # Use the saved id: `items` may be empty here, so items[0] could raise.
        print(f"Generated {counter} sentences for condition {condition_id}")
        data += result

    exp3 = pd.DataFrame(data, columns = ["condition", "type", "prompt", "NP1", "NP2", "NP1gender", "verb", "verbclass", "forced", "cont", "Koreferenz", "Anaphorische Form"])
    exp3.to_csv(f"../data/forced_coreference--{model_name.replace('/', '--')}.csv", sep=";", index=False)

    # Release the model before the next one is loaded. `rows` / `items` are not
    # deleted: they are unbound when no batch ran, and `del` would then raise.
    del model
    del exp3


Condition 3:   0%|                                                                               | 0/1 [02:17<?, ?it/s][A

Condition 2:   0%|                                                                               | 0/1 [00:00<?, ?it/s][A

1



Condition 2: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:14<00:00, 14.16s/it][A


Generated 1 sentences for condition 2


Condition 3:   0%|                                                                               | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [45]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Standalone tokenizer/model pair for the manual beam-search probe below.
# The stray undefined positional argument (`trun`) was removed, and the model
# is loaded again because the next cell calls `model_model.generate`.
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/german-gpt2")
model_model = AutoModelForCausalLM.from_pretrained("dbmdz/german-gpt2")

Downloading:   0%|          | 0.00/865 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/487M [00:00<?, ?B/s]

In [48]:
generation_list = []

# Encode one hand-written prompt as PyTorch tensors.
toks = tokenizer("Peter hasste Maria, weil", return_tensors="pt")

# Plain (non-sampling) beam search. Group-beam options are switched off:
#   num_beam_groups=3   (num_beam_groups must be <= num_beams)
#   diversity_penalty=.7
gens = model_model.generate(
    **toks,
    max_length=10,
    num_beams=7,
    do_sample=False,
    early_stopping=True,
)

# Keep the decoded top sequence.
generation_list.append(tokenizer.decode(gens[0], skip_special_tokens=True))

In [47]:
# Display the encoded test prompt (input ids and attention mask).
toks

{'input_ids': tensor([[8877,  315, 4565, 4999,   16, 1690]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [21]:
# Display the raw pipeline output that the generation loop stored in `cs`.
cs

[[{'generated_text': 'Lara beneidete Sven, ohne es offen zu zeigen, weil sie nicht wusste, dass'},
  {'generated_text': 'Lara beneidete Sven, ohne es offen zu zeigen, weil sie ihn nicht mochte.'},
  {'generated_text': 'Lara beneidete Sven, ohne es offen zu zeigen, weil sie nicht wusste, was'},
  {'generated_text': 'Lara beneidete Sven, ohne es offen zu zeigen, weil sie ihn für einen Verräter'},
  {'generated_text': 'Lara beneidete Sven, ohne es offen zu zeigen, weil sie nicht wusste, dass'},
  {'generated_text': 'Lara beneidete Sven, ohne es offen zu zeigen, weil sie ihn für einen Lügner'},
  {'generated_text': 'Lara beneidete Sven, ohne es offen zu zeigen, weil sie sich weigerte, sich'},
  {'generated_text': 'Lara beneidete Sven, ohne es offen zu zeigen, weil sie sich von ihm nicht'},
  {'generated_text': 'Lara beneidete Sven, ohne es offen zu zeigen, weil sie ihn für einen Verräter'},
  {'generated_text': 'Lara beneidete Sven, ohne es offen zu zeigen, weil sie nicht wusste, was'}]]

In [20]:
# Token ids for every female name.
# NOTE(review): `tok` is not defined in the cells shown here - confirm which tokenizer it refers to.
[tok.encode(name) for name in female_names]

[[2, 6869],
 [2, 26344],
 [2, 46831],
 [2, 30602],
 [2, 26520],
 [2, 78501],
 [2, 141274],
 [2, 72241, 24278],
 [2, 26986],
 [2, 885, 3932],
 [2, 64414],
 [2, 16224],
 [2, 480, 9],
 [2, 61753, 10006],
 [2, 60354],
 [2, 37126],
 [2, 80206],
 [2, 2138, 101],
 [2, 99545],
 [2, 74834],
 [2, 237331, 2202],
 [2, 108027],
 [2, 37961, 95],
 [2, 63209, 186],
 [2, 346, 23252],
 [2, 202555],
 [2, 18232, 9],
 [2, 82856],
 [2, 97354],
 [2, 61646],
 [2, 2578, 1438],
 [2, 238714],
 [2, 166882, 12],
 [2, 33537, 452],
 [2, 38158, 9],
 [2, 207485],
 [2, 192062],
 [2, 83966, 101],
 [2, 69207, 36],
 [2, 103983]]

In [22]:
# Decode a single token id to see which string piece it stands for.
# NOTE(review): `tok` is not defined in the cells shown here - confirm which tokenizer it refers to.
tok.decode(101)

'na'

In [16]:
# Last two token ids of ", weil" - the same slice make_constraint_function uses as its trigger.
# NOTE(review): `tok` is not defined in the cells shown here - confirm which tokenizer it refers to.
tok.encode(", weil")[-2:]

[4, 17675]

In [None]:
def make_constraint_function(tokenizer, female, is_sentencepiece=False):
    """Build a ``prefix_allowed_tokens_fn`` forcing the referent after ", weil".

    Directly after the two tokens that end ``", weil"`` only the first token of
    one of the allowed forms (a name of the requested gender or one of the
    pronouns) may be generated. If that first token starts a multi-token form,
    the following step is restricted to the matching second tokens. Everywhere
    else the full vocabulary is allowed. Only the first two tokens of each
    form are constrained.

    Args:
        tokenizer: object with ``encode(str) -> list[int]`` and ``vocab_size``.
            Used for all encoding; no global ``model`` is read.
        female: if true use ``female_names`` and feminine pronouns, otherwise
            ``male_names`` and masculine pronouns.
        is_sentencepiece: optional, defaults to ``False`` so two-argument calls
            keep working. If true, forms are encoded without a leading space
            and the first id of every encoding is dropped (presumably a
            special start token - verify against the tokenizer in use).

    Returns:
        A function ``(batch_id, input_tokens) -> list[int]`` of allowed ids.
    """
    weil = tokenizer.encode(", weil")[-2:]
    names = female_names if female else male_names
    # "jene" is the feminine counterpart of "jener" (was misspelled "jense").
    pronouns = ["sie", "diese", "jene"] if female else ["er", "dieser", "jener"]
    if is_sentencepiece:
        sequences = [tokenizer.encode(form)[1:] for form in names + pronouns]
    else:
        sequences = [tokenizer.encode(" " + form) for form in names + pronouns]
    # Guard against forms that encode to nothing after stripping.
    sequences = [seq for seq in sequences if seq]

    # Distinct first tokens, in first-seen order.
    first_tokens = list(dict.fromkeys(seq[0] for seq in sequences))
    # first token -> every second token that may follow it. Several forms can
    # share a first token, so all their continuations must stay reachable.
    second_tokens = {}
    for seq in sequences:
        if len(seq) > 1:
            allowed = second_tokens.setdefault(seq[0], [])
            if seq[1] not in allowed:
                allowed.append(seq[1])
    all_tokens = list(range(tokenizer.vocab_size))

    def constrainer(batch_id, input_tokens):
        length = len(input_tokens)
        # Length checks keep short inputs from raising IndexError.
        if length >= 2 and input_tokens[-2] == weil[-2] and input_tokens[-1] == weil[-1]:
            return first_tokens
        if length >= 3 and input_tokens[-3] == weil[-2] and input_tokens[-2] == weil[-1]:
            # int() so tensor elements can be used as dict keys.
            continuation = second_tokens.get(int(input_tokens[-1]))
            if continuation:
                return continuation
        return all_tokens

    return constrainer

In [20]:
import pandas as pd
# NOTE(review): this rebinds `do_annotation`, which the first cell imported from
# `Annotation_forced`; any cell run after this one uses the `Annotation` version.
from Annotation import do_annotation

In [4]:
# Load the semicolon-separated mGPT coreference results.
df = pd.read_csv("../data/coreference--ai-forever--mGPT.csv", sep=";")

In [9]:
# Rename columns to the names used by the forced-reference items (NP1, NP2, verbclass, cont).
df = df.rename(columns={"np1":"NP1", "np2":"NP2", "cat":"verbclass", "continuation":"cont"})

In [10]:
# Every row is an experimental item; a scalar is broadcast to all rows.
df["type"] = "Experiment"

In [14]:
# Remove the prompt (plus the single separating character) from the front of
# each continuation. The startswith() guard makes the cell safe to re-run:
# without it a second execution would cut another len(prompt) + 1 characters
# off text that has already been stripped.
df["cont"] = [
    cont[len(prompt) + 1:] if cont.startswith(prompt) else cont
    for prompt, cont in zip(df["prompt"], df["cont"])
]

In [16]:
# Turn the truthy/falsy `female` flag into the "f" / "m" gender code.
df["NP1gender"] = ["f" if is_female else "m" for is_female in df["female"]]

In [29]:
# Preview the output path: drop the 4-character ".csv" suffix and append "_annotated.csv".
"../data/coreference--ai-forever--mGPT.csv"[:-4] + "_annotated.csv"

'../data/coreference--ai-forever--mGPT_annotated.csv'

In [26]:
# Annotate the first 100 rows and write them to "hi.csv" (semicolon-separated, no index).
# NOTE(review): this writes to a scratch file, not the "_annotated.csv" path previewed
# in the previous cell, and covers only a 100-row slice - confirm whether that is intended.
do_annotation(df[:100], True).to_csv("hi.csv", index=False, sep=";") 