In [2]:
from transformers import AutoModelForSequenceClassification, AutoModelForCausalLM, AutoTokenizer, set_seed
import torch
import numpy as np
import re
import html

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


# Load models

In [3]:
# Tokenizers: one for the GPT-2 generator, one shared by the two BERT classifiers.
tokenizer_gpt = AutoTokenizer.from_pretrained('distilgpt2')
# Need to retrain with do_lower_case=False
tokenizer_bert = AutoTokenizer.from_pretrained('distilbert-base-cased', do_lower_case=True)

# Locally saved checkpoints. Each model is loaded first and then moved to `device`.
# The generator pads with the GPT-2 end-of-sequence token id.
generator = AutoModelForCausalLM.from_pretrained(
    './models/gpt2/final',
    pad_token_id=tokenizer_gpt.eos_token_id,
)
generator = generator.to(device)

discriminator = AutoModelForSequenceClassification.from_pretrained('./models/bert_discriminator/final')
discriminator = discriminator.to(device)

predictor = AutoModelForSequenceClassification.from_pretrained('./models/bert_predictor/final')
predictor = predictor.to(device)

# Generate candidates

In [4]:
# Seed the random number generators so the sampled candidates are reproducible.
# Feel free to change the seed to get a different set of candidates.
set_seed(42)

# Encode the context the generation is conditioned on: the comment followed by
# the separator string that marks where the reply starts.
sep_token = "<|reply|>"
input_text = "Do you have any experience with neural networks?"
model_inputs = tokenizer_gpt([" ".join([input_text, sep_token])], return_tensors='pt').to(device)

# Sample 250 candidate replies with top_k = 50, top_p = 0.95 and temperature = 0.8.
# `early_stopping` is intentionally not passed: it only controls beam-based
# generation, and no beam search is used here.
sample_outputs = generator.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.8,
    num_return_sequences=250)
print("Output:\n" + 100 * '-')
texts = []
for i, sample_output in enumerate(sample_outputs):
    decoded = tokenizer_gpt.decode(sample_output, skip_special_tokens=False)
    # Keep only the first line of whatever follows the separator, i.e. the reply.
    text = decoded.split(sep_token)[1].split('\n')[0]
    # Drop the single space that follows the separator, but never eat a real
    # character when the reply does not start with a space.
    if text.startswith(' '):
        text = text[1:]
    texts.append(text)
    print(f"{i}: {text}\n")





Output:
----------------------------------------------------------------------------------------------------
0: I'm a neural network expert and I've seen some papers that show how to use neural networks to improve their performance.  

1: [deleted]   

2: [deleted]  

3: You're describing a problem. You could imagine a human sitting in a chair, wondering if they could use a neural network to explain why the human is sitting there. 

4: [deleted]  

5: I've seen this in a lot of different ways.  

6: I'm not really sure what you mean by Neural Networks. 

7: I’ve been in the same situation for some time, but I’m not sure if it’s the same with deep learning. It is an area where people can try to

8: I think there are some things that can be learnt from these kinds of approaches in this post.

9: There is some deep learning on reddit that does this.   

10: I think it depends on what you are writing. 

11: I am not.

12: I think there is an [IMHO](https://en.wikipedia.org/wiki/IMHO) that 

# Clean candidates

In [5]:
def regex_text(text):
    """Clean one generated candidate string.

    Decodes HTML entities, turns every backslash-escaped apostrophe (``\\'``)
    into a plain apostrophe, and removes trailing whitespace. Leading
    whitespace is left untouched.

    Args:
        text: The raw candidate string.

    Returns:
        The cleaned string.
    """
    cleaned = html.unescape(text)
    # (pattern, replacement) pairs, applied in order.
    substitutions = (
        (r"\\'", r"'"),   # escaped apostrophe -> apostrophe
        (r"\s+$", ''),    # trailing whitespace -> nothing
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Replace the raw candidates with their cleaned versions, keeping the order.
texts = list(map(regex_text, texts))
texts

["I'm a neural network expert and I've seen some papers that show how to use neural networks to improve their performance.",
 '[deleted]',
 '[deleted]',
 "You're describing a problem. You could imagine a human sitting in a chair, wondering if they could use a neural network to explain why the human is sitting there.",
 '[deleted]',
 "I've seen this in a lot of different ways.",
 "I'm not really sure what you mean by Neural Networks.",
 'I’ve been in the same situation for some time, but I’m not sure if it’s the same with deep learning. It is an area where people can try to',
 'I think there are some things that can be learnt from these kinds of approaches in this post.',
 'There is some deep learning on reddit that does this.',
 'I think it depends on what you are writing.',
 'I am not.',
 'I think there is an [IMHO](https://en.wikipedia.org/wiki/IMHO) that has an algorithm that learns a classification algorithm and then learns a function in the domain of',
 "I can't help but think you

# Discriminate candidates

Discriminator labels:

- 1 = Fake
- 0 = Real

In [6]:
# Keep only the candidates the discriminator classifies as real (class 0).
realistic_texts = []
for candidate in texts:
    encoded = tokenizer_bert(candidate, return_tensors='pt').to(device)
    # Inference only, so no gradients are needed.
    with torch.no_grad():
        class_logits = discriminator(**encoded).logits

    # The index of the largest logit is the predicted class: 0 = real, 1 = fake.
    if class_logits.argmax().item() == 0:
        realistic_texts.append(candidate)
realistic_texts

['Neural networks are more like computers than computers',
 'yes',
 'yes',
 'Are you interested in deep learning? I have no idea.',
 'This, right here.',
 'This is very close to the answer']

# Predict best candidate

In [7]:
# Score every realistic candidate with the predictor and keep the highest-scoring one.
if not realistic_texts:
    # np.argmax fails with an opaque ValueError on an empty list, so report
    # the actual problem instead.
    raise ValueError(
        "The discriminator rejected every candidate; "
        "generate more candidates (or change the seed) and retry."
    )

scores = []
for text in realistic_texts:
    test_input = tokenizer_bert(text, return_tensors='pt').to(device)
    # Inference only, so no gradients are needed.
    with torch.no_grad():
        output = predictor(**test_input)

    # The first logit of the single sequence in the batch is used as the score.
    # NOTE(review): presumably the predictor has one regression-style output — confirm.
    scores.append(output.logits[0][0].item())

output_text = realistic_texts[np.argmax(scores)]
output_text

'This, right here.'

# Result

In [8]:
# Pair the original comment with the reply that was selected for it.
result = dict(comment=input_text, reply=output_text)
result

{'comment': 'Do you have any experience with neural networks?',
 'reply': 'This, right here.'}