In [1]:
from datasets import DatasetDict
from src.gpt2 import Generator
from random import randrange
import pandas as pd


# Project GPT-2 wrapper loaded from the local model directory.
generator = Generator(MODEL_PATH="./models/gpt2/final")
# Pad on the left so that, in a padded batch, every prompt ends at the last
# position (presumably so batched generation continues right after the
# prompt -- confirm against the Generator implementation).
generator.tokenizer.padding_side = "left"

# The CSVs store their saved index in the 'Unnamed: 0' column; only the
# 'text' column is kept.
train = pd.read_csv("./data/preprocessed/train.csv", index_col='Unnamed: 0')[['text']]
validation = pd.read_csv("./data/preprocessed/validation.csv", index_col='Unnamed: 0')[['text']]

# Every row loaded from disk starts out labelled fake=False.
train['fake'] = False
validation['fake'] = False

train_texts = train['text'].values.tolist()
validation_texts = validation['text'].values.tolist()

# Preview a few samples only; echoing the whole list floods the cell output
# and bloats the saved notebook.
validation_texts[:3]

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


["<|endoftext|> We don't care about characters because they are good people.  We care about people because they are struggling to achieve something that they desperately want and the world isn't letting them have it.  The more you stomp on them, the more we will care about them.  They don't even have to be a good person.  In fact making them a bit more relatable and open to making serious mistakes in character, is something that we actually respond positively to as long as they ultimately have a good heart.  \n\nIf you are not putting this character through so much that you would hate to actually be here and feel badly for her, then you are not going far enough.  Step 1 in a good story is to ruin your MC's life. <|reply|> What does your MC want? And what obstacles is she battling to overcome those obstacles? There's a good [Youtube video](https://youtu.be/oFaaxc1f1-M) about making audiences empathize with screenplay characters. Show the character unjustly mistreated, neglected, betraye

In [2]:
def create_fakes(input_texts, max_new_tokens=40):
    """Sample a generated continuation for the prompt part of each input text.

    Each text is cut at the first occurrence of ``generator.SEP_TOKEN``; the
    part before it, with the separator appended again, is used as the prompt.

    Args:
        input_texts: Sequence of strings to derive prompts from.
        max_new_tokens: Maximum number of tokens sampled per prompt. Prompts
            are truncated to ``1024 - max_new_tokens`` tokens so prompt plus
            continuation stay within 1024 tokens (presumably the model's
            context size -- confirm).

    Returns:
        A list with one decoded string per input, decoded from the full
        sequences returned by ``generate`` with special tokens skipped.
        An empty input yields an empty list without touching the model.
    """
    # Nothing to tokenize or generate for an empty batch.
    if len(input_texts) == 0:
        return []

    sep_token = generator.SEP_TOKEN
    prompt_texts = [text.split(sep_token)[0] + sep_token for text in input_texts]
    model_inputs = generator.tokenizer(
        prompt_texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=1024 - max_new_tokens,
    ).to(generator.torch_device)

    # `early_stopping` is intentionally not passed: it only controls when
    # beam search stops and has no effect on single-beam sampling.
    sample_outputs = generator.model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=50,
        top_p=0.8,
        temperature=0.95,
        pad_token_id=generator.tokenizer.pad_token_id,
    )
    return generator.tokenizer.batch_decode(sample_outputs, skip_special_tokens=True)

def batch_generations(texts, batch_size):
    """Lazily yield consecutive slices of ``texts``, each at most ``batch_size`` long.

    The final slice is shorter when ``len(texts)`` is not a multiple of
    ``batch_size``; an empty ``texts`` yields nothing.
    """
    total = len(texts)
    yield from (
        texts[start:start + batch_size]
        for start in range(0, total, batch_size)
    )

# Take the first half of each text list and split it into batches of
# `batch_size` texts; these batches are the inputs for fake generation.
batch_size = 5
num_train_fakes = len(train_texts) // 2
num_validation_fakes = len(validation_texts) // 2
train_batches = list(batch_generations(train_texts[:num_train_fakes], batch_size))
validation_batches = list(batch_generations(validation_texts[:num_validation_fakes], batch_size))


In [4]:
def batch_generate_fakes(batches):
    """Run ``create_fakes`` on every batch and collect all generated texts.

    Args:
        batches: Sized sequence of batches; each batch is passed unchanged to
            ``create_fakes``.

    Returns:
        A dict with two keys: ``'text'`` holds the generated strings of all
        batches in order, and ``'fake'`` is the scalar ``True``. The caller
        feeds this dict to ``pd.DataFrame``.
    """
    fakes_dict = {
        'text': [],
        'fake': True,
    }

    num_batches = len(batches)
    for i, batch in enumerate(batches):
        fake_batch = create_fakes(batch)
        fakes_dict['text'].extend(fake_batch)
        if i % 50 == 49:
            # Progress report only. The loop must keep going: returning here
            # would drop this batch and every batch after it.
            print(f"Batch {i} of {num_batches}")
            print(fake_batch)

    return fakes_dict

def _overwrite_head_with_fakes(frame, fakes):
    """Replace the first ``len(fakes)`` rows of ``frame`` with ``fakes``, in place.

    Values are written column by column as plain arrays, matched by column
    name and by row position. This avoids relying on how pandas treats a
    DataFrame right-hand side in an ``.iloc`` assignment: ``frame`` is indexed
    by the CSV's saved index column while ``fakes`` has a default index, so
    any label-based alignment would misplace rows.
    """
    n_fakes = len(fakes)
    for position, column in enumerate(frame.columns):
        frame.iloc[:n_fakes, position] = fakes[column].to_numpy()


# Generate fakes for the validation batches, overwrite the leading rows of
# the validation frame with them, and save the result.
validation_fakes = pd.DataFrame(batch_generate_fakes(validation_batches))
_overwrite_head_with_fakes(validation, validation_fakes)
validation.to_csv(generator.DATA_PROC_DIR + "/validation_fakes.csv")

# Same procedure for the training frame.
train_fakes = pd.DataFrame(batch_generate_fakes(train_batches))
_overwrite_head_with_fakes(train, train_fakes)
train.to_csv(generator.DATA_PROC_DIR + "/train_fakes.csv")

Batch 49 of 1388
[' Is this a full sentence? Then Sheila, holding back tears,   uttered the  words that would ruin Sam\'s marriage: ""wubalubadubdub" \n\n&#x200B;\n\nOr is it a fragment? The context is:\n\nAfter a seven hour flight, Sheila and her father were standing on the   porch,  asking Sam if he was ok. Then Sheila, holding back tears,   uttered the  words that would ruin Sam\'s marriage: ""wubalubadubdub" <|reply|> I have no idea. I’ve heard of this in the comments. I can’t find it, but I would like to know how the ending felt to me.   ', " Once you get into several hundred you might start to run into trouble. <|reply|> &gt; Once you get into several hundred you might start to run into trouble.\n\nI think I've seen this before.  \n\nIt's also really interesting to see how this", " Have you ever seen a counselor or therapist? I would suggest that. Depression takes away passion and motivation, if you haven't been bothered before that's great but it sounds like it's getting worse. 