In [33]:
from transformers import AutoModelForSequenceClassification, AutoModelForCausalLM, AutoTokenizer, set_seed
import torch
import numpy as np
import re
import html

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


# Load models

In [7]:
# Tokenizers are loaded from the public base checkpoints.
tokenizer_gpt = AutoTokenizer.from_pretrained('distilgpt2')
# NOTE: lower-casing is enabled on a *cased* checkpoint. Original author's note:
# need to retrain with do_lower_case=False.
tokenizer_bert = AutoTokenizer.from_pretrained(
    'distilbert-base-cased',
    do_lower_case=True,
)

# Fine-tuned generator; the GPT tokenizer's EOS id is reused as the pad id.
generator = AutoModelForCausalLM.from_pretrained(
    '../models/gpt2/final',
    pad_token_id=tokenizer_gpt.eos_token_id,
)
generator = generator.to(device)

# Both sequence-classification models are loaded the same way, in the same
# order as before: discriminator first, then predictor.
discriminator, predictor = (
    AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
    for checkpoint in (
        '../models/bert_discriminator/final',
        '../models/bert_predictor/final',
    )
)

# Generate candidates

In [17]:
# Fix the RNG state so the sampled candidates are reproducible on re-run.
# Change SEED to explore different generations.
SEED = 42
set_seed(SEED)

# Encode the context the generation is conditioned on: the comment followed by
# the separator after which the model is expected to write the reply.
sep_token = "<|reply|>"
input_text = "Oh nice! I got around this (kind of) by being a bio major with a minor in neuro research. My school had 2 neruo programs - bio based and psych based. My minor was in psych based neuro but I took my bio electives as bio based neuro courses. Still had to do calc 1, calc 2, chem 1, chem 2, orgo 1, orgo 2, physics 1, and physics 2. But I DIDNT have to take intro to pharma kinetics, inorganic chemistry and a few other higher level chem classes. I did this bc the psych based neuro courses had almost no bio and I love bio. But math and chem are my kryptonite."
model_inputs = tokenizer_gpt([" ".join([input_text, sep_token])], return_tensors='pt').to(device)

# Sample 20 candidate replies with top-k (50) + nucleus (0.95) sampling.
# `early_stopping` is intentionally not passed: it only applies to beam search
# and has no effect when sampling with a single beam.
sample_outputs = generator.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.8,
    num_return_sequences=20,
)

print("Output:\n" + 100 * '-')
texts = []
for i, sample_output in enumerate(sample_outputs):
    decoded = tokenizer_gpt.decode(sample_output, skip_special_tokens=False)
    # Keep only the first line that follows the separator. Leading whitespace
    # is stripped explicitly instead of dropping the first character, which
    # would eat a real character when the reply does not start with a space.
    text = decoded.split(sep_token)[1].split('\n')[0].lstrip()
    texts.append(text)
    print(f"{i}: {text}\n")



Output:
----------------------------------------------------------------------------------------------------
0: This is what I'm trying to say. 

1: &gt;My minor was in psych based neuro but I took my bio electives as bio based neuro courses. Still had to do calc 1, calc 2, chem 1, orgo 2,

2: I was in grad school. A while back I did some math and bio because it was good   

3: This is one of my favorites!  

4: How was bio based? It's basically a combination of bio based and psych based and psych based, and all bio based. 

5: It is still my best time to study chemistry and bio at my university. I would love to study bio as well.     

6: That's what I did! I got around 1, calc 2, chem 2, chem 1, chem 2, orgo 2, physics 1, and physics 2. But I didn't get around

7: Also the 2 major programs taught me how to draw attention to the physical structure and structure of the brain.  I had no idea what a neuroscience major was.  I am very interested in neuroscience but

8: So what is the bio?

#### Clean candidates

In [18]:
# Patterns are compiled once and shared by every call to regex_text.
_ESCAPED_APOSTROPHE = re.compile(r"\\'")
_TRAILING_WHITESPACE = re.compile(r"\s+$")


def regex_text(text):
    """Normalize one generated candidate.

    Steps, in order:
      1. decode HTML entities (e.g. ``&gt;`` becomes ``>``);
      2. turn backslash-escaped apostrophes into plain apostrophes;
      3. remove trailing whitespace.

    Leading whitespace is left untouched.
    """
    unescaped = html.unescape(text)
    unquoted = _ESCAPED_APOSTROPHE.sub("'", unescaped)
    return _TRAILING_WHITESPACE.sub('', unquoted)

# Clean every generated candidate; the result is a new list bound to `texts`.
texts = list(map(regex_text, texts))
texts

["This is what I'm trying to say.",
 '>My minor was in psych based neuro but I took my bio electives as bio based neuro courses. Still had to do calc 1, calc 2, chem 1, orgo 2,',
 'I was in grad school. A while back I did some math and bio because it was good',
 'This is one of my favorites!',
 "How was bio based? It's basically a combination of bio based and psych based and psych based, and all bio based.",
 'It is still my best time to study chemistry and bio at my university. I would love to study bio as well.',
 "That's what I did! I got around 1, calc 2, chem 2, chem 1, chem 2, orgo 2, physics 1, and physics 2. But I didn't get around",
 'Also the 2 major programs taught me how to draw attention to the physical structure and structure of the brain.  I had no idea what a neuroscience major was.  I am very interested in neuroscience but',
 'So what is the bio?',
 'Can you help me explain to me how bio based chemistry courses are considered a no go?',
 'This is excellent.',
 "Not sur

# Discriminate candidates
Discriminator labels:
- 1 = Fake
- 0 = Real

In [21]:
# Keep only the candidates the discriminator assigns to class 0 ("Real").
realistic_texts = []
with torch.no_grad():
    for candidate in texts:
        encoded = tokenizer_bert(candidate, return_tensors='pt').to(device)
        class_id = discriminator(**encoded).logits.argmax().item()
        if class_id == 0:
            realistic_texts.append(candidate)
realistic_texts

['I was in grad school. A while back I did some math and bio because it was good',
 'Can you help me explain to me how bio based chemistry courses are considered a no go?',
 'There are many bio based neuroscience courses out there. My math class is pretty fun too.']

# Predict best candidate

In [43]:
# Score every candidate that survived the discriminator and keep the one with
# the highest predictor score.
if not realistic_texts:
    # np.argmax on an empty list fails with an opaque "empty sequence" error;
    # raise the same exception type with a message that says what to do.
    raise ValueError(
        "No candidate passed the discriminator; re-run generation to sample new candidates."
    )

scores = []
for text in realistic_texts:
    test_input = tokenizer_bert(text, return_tensors='pt').to(device)
    with torch.no_grad():
        output = predictor(**test_input)

    # The first logit of the single sequence in the batch is used as the score
    # (presumably a single-output regression head -- TODO confirm).
    scores.append(output.logits[0][0].item())

output_text = realistic_texts[np.argmax(scores)]
output_text

'There are many bio based neuroscience courses out there. My math class is pretty fun too.'

# Result

In [49]:
# Pair the original comment with the selected reply.
result = dict(comment=input_text, reply=output_text)
result

{'comment': 'Oh nice! I got around this (kind of) by being a bio major with a minor in neuro research. My school had 2 neruo programs - bio based and psych based. My minor was in psych based neuro but I took my bio electives as bio based neuro courses. Still had to do calc 1, calc 2, chem 1, chem 2, orgo 1, orgo 2, physics 1, and physics 2. But I DIDNT have to take intro to pharma kinetics, inorganic chemistry and a few other higher level chem classes. I did this bc the psych based neuro courses had almost no bio and I love bio. But math and chem are my kryptonite.',
 'reply': 'There are many bio based neuroscience courses out there. My math class is pretty fun too.'}