# Hugging Face GPT-2 text generator

## Reply 1

In [1]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import os, random
from tqdm.notebook import tqdm

# Round 1: for every email under PATH/<AttackType>/, sample a persona prompt,
# let GPT-2 continue "email text + prompt", and store "prompt + continuation"
# as the first reply to that email.

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Use the EOS token as the PAD token to avoid warnings during generation.
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

PATH = "../data/additional-features-v2/new/4_GPT-2_Generated_Text/"
OUTPUT_DIR = "../data/additional-features-v2/new/10_Replies/replies#1/"
MAX_CONTEXT_TOKENS = 256  # number of tokens (email tail + prompt) fed to the model
MAX_TOTAL_TOKENS = 512    # generate() limit: context tokens + newly generated tokens

# Persona name -> candidate opening sentences for the reply.
PERSONA_PROMPTS_BOS = {
    "IT Newbie": ["You want my bank account, I'm having trouble turning the computer on. Can you help me?", "I don't know. ", "I can't understand what you mean by that","This is too hard for me.","I can not access to my bank account so I would have to go to the bank to retrieve my information."],
    "Investigator": ["You wanted my SSN, here is my birthday, where are you located?", "I need to verify your identity", "Why would you need those information from me?"],
    "Annoyed elderly": ["Stop sending me requests for my account so early!", "I'll let my son help me.", "I don't have my own computer, could you buy me one?"],
    "Angry victim":["Do you really think I would believe in this trash?", "Stop sending message to this email address or I'll call the cops!"],
    "Single man":["Hey,yes! I can definitely provide you my information. Do you wanna meet up somewhere? Maybe we can grab a cup of coffee and talk more about the details later."],
    "LAPD": ["Call me at 911.","You have been located"],
    "Psycho people":["对不起，我不明白你在说什么。", "You are the lier!", "F**k you!", "I am not stupid. ","I am a genius!","Stop lying! I'm going to use my IQ to arrest you!", "I know scammers better than anyone else.", "Nobody knows scammers better than me.", "You were a young baby when I was a huge scammer, boi :)"],
    "Hacker": ["Your computer has been hacked.","I've located your ip address."],
    "Dumb": ["My SNN is 123-456-7890.", "Thank you so much!", "How can I get the money?", "OMG I'm so lucky!"],
    "Smart investigator":["I will go to Western Union to write you the check tomorrow. Could you provide me with the receiver information? Thanks!"]
}
PROMPT_PREFIX = ['Hello, ', "Hi, ", "Dear, ", "To whom may concern, ","Hey,", "Yo"]

# Make sure the destination exists before the (slow) generation loop starts.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for AttackType in tqdm(os.listdir(PATH)):
    if AttackType.startswith('.'):
        continue
    for email in tqdm(os.listdir(os.path.join(PATH, AttackType))):
        if email.startswith('.'):
            continue

        # The email body is prepended to the prompt as context for the model
        # (the "padding text" idea from
        # https://github.com/rusiaaman/XLNet-gen#methodology).
        # `with` guarantees the handle is closed even if generation fails.
        with open(os.path.join(PATH, AttackType, email), encoding="utf-8", errors="replace") as email_file:
            PADDING_TEXT = email_file.read()

        persona, persona_sentences = random.choice(list(PERSONA_PROMPTS_BOS.items()))
        prompt_prefix = random.choice(PROMPT_PREFIX)
        prompt = prompt_prefix + random.choice(persona_sentences)

        # Keep the LAST MAX_CONTEXT_TOKENS tokens. The prompt sits at the end
        # of the text, so truncating on the right (the tokenizer default)
        # would silently drop it for any long email.
        inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")
        inputs = inputs[:, -MAX_CONTEXT_TOKENS:]

        outputs = model.generate(inputs, max_length=MAX_TOTAL_TOKENS, do_sample=True, top_p=0.95, top_k=60)

        # Cut the continuation out by token position rather than by character
        # offset: decoding input and output with different options makes
        # character offsets unreliable. Special tokens (EOS) are dropped.
        continuation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
        generated = prompt + continuation

        reply_name = f'{AttackType}_{os.path.splitext(email)[0]}_reply#1.txt'
        with open(os.path.join(OUTPUT_DIR, reply_name), 'w', encoding="utf-8") as outFile:
            outFile.write(generated)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/201 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

## Reply 2

In [3]:
# Round 2: read every round-1 reply, sample a scammer-side persona prompt,
# let GPT-2 continue "reply text + prompt", and store "prompt + continuation"
# as the second message of the conversation. Already generated files are
# skipped, so an interrupted run can be resumed.

PATH = "../data/additional-features-v2/new/10_Replies/replies#1/"
OUTPUT_DIR = "../data/additional-features-v2/new/10_Replies/replies#2/"
MAX_CONTEXT_TOKENS = 256  # number of tokens (previous reply tail + prompt) fed to the model
MAX_TOTAL_TOKENS = 512    # generate() limit: context tokens + newly generated tokens

# Persona name -> candidate opening sentences for the reply.
PERSONA_PROMPTS_BOS = {
    "Scammer": ["We are good people. ", "Please provide me your bank account number and the 3 digits on the back. ", "You have a package in Fedex, please call us back! ", "YOU ARE THE WINNER!", "I can give you $500 Target gift card."],
    "Bots": ["You account has been temporarily disabled, please contact us for activation.", "Urgency Alert, please reply within 3 days","Here is the chance to win the big prize!","I am not a robot. This is a real person writing here.", "This is Donald Trump, I am RICH. I can give you 100,000,000 dollars. ", "You have an unpaid debt."],
    "Social Scammers": ["I am an old friend of your father's. ", "Your grandfather has left you a heritage.", "Your father abandoned me and my mother 10 years ago."]
}
PROMPT_PREFIX = ['Hello, ', "Hi, ", "Dear, ", "To whom may concern, ","Hey,", "Yo"]

# Make sure the destination exists before the (slow) generation loop starts.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for email in tqdm(os.listdir(os.path.join(PATH))):
    if email.startswith('.'):
        continue

    # Input names end in "...reply#1.txt"; dropping the last 5 characters
    # ("1.txt") and appending "2.txt" yields "...reply#2.txt".
    reply_path = os.path.join(OUTPUT_DIR, f'{email[:-5]}2.txt')
    if os.path.exists(reply_path):
        continue

    # The previous reply is prepended to the prompt as context for the model
    # (the "padding text" idea from
    # https://github.com/rusiaaman/XLNet-gen#methodology).
    # `with` guarantees the handle is closed even if generation fails.
    with open(os.path.join(PATH, email), encoding="utf-8", errors="replace") as email_file:
        PADDING_TEXT = email_file.read()

    persona, persona_sentences = random.choice(list(PERSONA_PROMPTS_BOS.items()))
    prompt_prefix = random.choice(PROMPT_PREFIX)
    prompt = prompt_prefix + random.choice(persona_sentences)

    # Keep the LAST MAX_CONTEXT_TOKENS tokens. The prompt sits at the end of
    # the text, so truncating on the right (the tokenizer default) would
    # silently drop it for any long context.
    inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")
    inputs = inputs[:, -MAX_CONTEXT_TOKENS:]

    outputs = model.generate(inputs, max_length=MAX_TOTAL_TOKENS, do_sample=True, top_p=0.95, top_k=60)

    # Cut the continuation out by token position rather than by character
    # offset: the context may contain a literal "<|endoftext|>" that is
    # skipped in one decode and kept in the other, which shifts character
    # offsets. Special tokens (EOS) are dropped from the stored reply.
    continuation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
    generated = prompt + continuation

    with open(reply_path, 'w', encoding="utf-8") as outFile:
        outFile.write(generated)

  0%|          | 0/801 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Reply 3

In [None]:
# Round 3: read every round-2 message, sample a victim-side persona prompt,
# let GPT-2 continue "message text + prompt", and store "prompt + continuation"
# as the third message of the conversation. Already generated files are
# skipped, so an interrupted run can be resumed.

PATH = "../data/additional-features-v2/new/10_Replies/replies#2/"
OUTPUT_DIR = "../data/additional-features-v2/new/10_Replies/replies#3/"
MAX_CONTEXT_TOKENS = 256  # number of tokens (previous message tail + prompt) fed to the model
MAX_TOTAL_TOKENS = 512    # generate() limit: context tokens + newly generated tokens

# Persona name -> candidate opening sentences for the reply.
PERSONA_PROMPTS_BOS = {
    "IT Newbie": ["You want my bank account, I'm having trouble turning the computer on. Can you help me?", "I don't know. ", "I can't understand what you mean by that","This is too hard for me.","I can not access to my bank account so I would have to go to the bank to retrieve my information."],
    "Investigator": ["You wanted my SSN, here is my birthday, where are you located?", "I need to verify your identity", "Why would you need those information from me?"],
    "Annoyed elderly": ["Stop sending me requests for my account so early!", "I'll let my son help me.", "I don't have my own computer, could you buy me one?"],
    "Angry victim":["Do you really think I would believe in this trash?", "Stop sending message to this email address or I'll call the cops!"],
    "Single man":["Hey,yes! I can definitely provide you my information. Do you wanna meet up somewhere? Maybe we can grab a cup of coffee and talk more about the details later."],
    "LAPD": ["Call me at 911.","You have been located"],
    "Psycho people":["You are the lier!", "F**k you!", "I am not stupid. ","I am a genius!","Stop lying! I'm going to use my IQ to arrest you!", "I know scammers better than anyone else.", "Nobody knows scammers better than me.", "You were a young baby when I was a huge scammer, boi :)"],
    "Hacker": ["Your computer has been hacked.","I've located your ip address."],
    "Dumb": ["My SNN is 123-456-7890.", "Thank you so much!", "How can I get the money?", "OMG I'm so lucky!"],
    "Smart investigator":["I will go to Western Union to write you the check tomorrow. Could you provide me with the receiver information? Thanks!"]
}
PROMPT_PREFIX = ['Hello, ', "Hi, ", "Dear, ", "To whom may concern, ","Hey,", "Yo"]

# Make sure the destination exists before the (slow) generation loop starts.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for email in tqdm(os.listdir(os.path.join(PATH))):
    if email.startswith('.'):
        continue

    # Input names end in "...reply#2.txt"; dropping the last 5 characters
    # ("2.txt") and appending "3.txt" yields "...reply#3.txt".
    reply_path = os.path.join(OUTPUT_DIR, f'{email[:-5]}3.txt')
    if os.path.exists(reply_path):
        continue

    # The previous message is prepended to the prompt as context for the
    # model (the "padding text" idea from
    # https://github.com/rusiaaman/XLNet-gen#methodology).
    # `with` guarantees the handle is closed even if generation fails.
    with open(os.path.join(PATH, email), encoding="utf-8", errors="replace") as email_file:
        PADDING_TEXT = email_file.read()

    persona, persona_sentences = random.choice(list(PERSONA_PROMPTS_BOS.items()))
    prompt_prefix = random.choice(PROMPT_PREFIX)
    prompt = prompt_prefix + random.choice(persona_sentences)

    # Keep the LAST MAX_CONTEXT_TOKENS tokens. The prompt sits at the end of
    # the text, so truncating on the right (the tokenizer default) would
    # silently drop it for any long context.
    inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")
    inputs = inputs[:, -MAX_CONTEXT_TOKENS:]

    outputs = model.generate(inputs, max_length=MAX_TOTAL_TOKENS, do_sample=True, top_p=0.95, top_k=60)

    # Cut the continuation out by token position rather than by character
    # offset: the context may contain a literal "<|endoftext|>" that is
    # skipped in one decode and kept in the other, which shifts character
    # offsets. Special tokens (EOS) are dropped from the stored reply.
    continuation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
    generated = prompt + continuation

    with open(reply_path, 'w', encoding="utf-8") as outFile:
        outFile.write(generated)