In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets



In [3]:
from datasets import load_dataset
dataset = load_dataset("fka/awesome-chatgpt-prompts")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
#Dataset Information
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['act', 'prompt'],
        num_rows: 203
    })
})


In [5]:
#Split into train dataset
train_data = dataset['train']

#print split data
print(train_data)

Dataset({
    features: ['act', 'prompt'],
    num_rows: 203
})


In [6]:
#Display first ten train data
print(train_data[:10])

{'act': ['An Ethereum Developer', 'SEO Prompt', 'Linux Terminal', 'English Translator and Improver', '`position` Interviewer', 'JavaScript Console', 'Excel Sheet', 'English Pronunciation Helper', 'Spoken English Teacher and Improver', 'Travel Guide'], 'prompt': ['Imagine you are an experienced Ethereum developer tasked with creating a smart contract for a blockchain messenger. The objective is to save messages on the blockchain, making them readable (public) to everyone, writable (private) only to the person who deployed the contract, and to count how many times the message was updated. Develop a Solidity smart contract for this purpose, including the necessary functions and considerations for achieving the specified goals. Please provide the code and any relevant explanations to ensure a clear understanding of the implementation.', "Using WebPilot, create an outline for an article that will be 2,000 words on the keyword 'Best SEO prompts' based on the top 10 results from Google. Inclu

In [7]:
#Convert split data to csv file
csv_filename = "/content/drive/MyDrive/Prompt Generation/data.csv"
train_data.to_csv(csv_filename, index=False)
print(f"CSV file '{csv_filename}' has been created.")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CSV file '/content/drive/MyDrive/Prompt Generation/data.csv' has been created.


In [8]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/Prompt Generation/data.csv")
df.head()

Unnamed: 0,act,prompt
0,An Ethereum Developer,Imagine you are an experienced Ethereum develo...
1,SEO Prompt,"Using WebPilot, create an outline for an artic..."
2,Linux Terminal,I want you to act as a linux terminal. I will ...
3,English Translator and Improver,"I want you to act as an English translator, sp..."
4,`position` Interviewer,I want you to act as an interviewer. I will be...


In [9]:
#length of the main dataset
len(df)

203

In [10]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the dataset from the CSV file
data = pd.read_csv("/content/drive/MyDrive/Prompt Generation/data.csv")

def build_text_files(data_csv, dest_path):
    f = open(dest_path, 'w')
    data = ''
    for index, row in data_csv.iterrows():
        summary = str(row['prompt']).strip()
        # Correct the pattern in re.sub to remove square brackets and quotation marks
        summary = re.sub(r"[\[\]\"]", "", summary)  # Remove square brackets and quotation marks
        summary = re.sub(r"\s+", " ", summary)  # Replace multiple spaces with a single space
        data += summary + "  "
    f.write(data)
    f.close()

# Split the data into train and test datasets
train, test = train_test_split(data, test_size=0.10)

# Build text files for train and test datasets
build_text_files(train, '/content/drive/MyDrive/Prompt Generation/train_dataset.txt')
build_text_files(test, '/content/drive/MyDrive/Prompt Generation/test_dataset.txt')

print("Train dataset length:", len(train))
print("Test dataset length:", len(test))


Train dataset length: 182
Test dataset length: 21


In [11]:
#Preprocessing of Text
def text_file_read(file_path, num_examples=10):
  with open(file_path, 'r') as file:
    content = file.read()
    examples = content.split(' ') # '  ' is the separator
    for i, examples in enumerate(examples[:num_examples]):
      print(f"Example {i+1}: {examples}")


train_file_path = '/content/drive/MyDrive/Prompt Generation/train_dataset.txt'
test_file_path = '/content/drive/MyDrive/Prompt Generation/test_dataset.txt'

print("Representation of train Dataset: ")
text_file_read(train_file_path)

print("Representation of test Dataset: ")
text_file_read(test_file_path)

Representation of train Dataset: 
Example 1: I
Example 2: want
Example 3: you
Example 4: to
Example 5: act
Example 6: as
Example 7: a
Example 8: software
Example 9: developer.
Example 10: I
Representation of test Dataset: 
Example 1: I
Example 2: want
Example 3: you
Example 4: to
Example 5: act
Example 6: as
Example 7: a
Example 8: travel
Example 9: guide.
Example 10: I


# **GPT-2 and Tokenizer**

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')

In [13]:
train_path = '/content/drive/MyDrive/Prompt Generation/train_dataset.txt'
test_path = '/content/drive/MyDrive/Prompt Generation/test_dataset.txt'

# **Preparing the data and build a Text Dataset**

In [14]:
from transformers import TextDataset, DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer):
  train_dataset = TextDataset(
      tokenizer = tokenizer,
      file_path = train_path,
      block_size = 128
  )

  test_dataset = TextDataset(
      tokenizer = tokenizer,
      file_path = test_path,
      block_size = 128
  )
  data_collator = DataCollatorForLanguageModeling(
      tokenizer = tokenizer, mlm= False,
  )
  return train_dataset, test_dataset, data_collator

train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)



In [15]:
!pip install transformers[torch]



In [16]:
!pip install accelerate -U



In [17]:
!pip install --upgrade transformers



In [18]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2-promptgeneration", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=10, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    eval_steps = 500, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [19]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss


TrainOutput(global_step=50, training_loss=3.9143807983398435, metrics={'train_runtime': 308.7031, 'train_samples_per_second': 4.47, 'train_steps_per_second': 0.162, 'total_flos': 90145751040000.0, 'train_loss': 3.9143807983398435, 'epoch': 10.0})

# **Save Model**

In [20]:
trainer.save_model()

# **Model Test**

In [21]:
from transformers import pipeline

In [22]:
prompt = pipeline('text-generation', model= './gpt2-promptgeneration', tokenizer = 'gpt2')

Device set to use cuda:0


In [24]:
prompt('I want to act as a police officer')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'I want to act as a police officer as I serve the public," he said. "On August 14, I got an anonymous tip about a threat directed at me. It was an anonymous Facebook message saying there was no information about any of my employees'}]

In [25]:
prompt('I want to act as a doctor')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'I want to act as a doctor for her. If someone asked me, I would look her up, answer as succinctly as possible, and then tell them how it happened.\n\nI always want to get this done. Please let me know'}]

In [27]:
prompt('I want to act as banker')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "I want to act as banker in this country, I want to act as secretary of the Treasury to the Bank of England for over fifteen years and I want to pay my debts. But I'm afraid it comes back to the point of no return."}]

In [28]:
prompt('I want to act as student')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'I want to act as student body president and help to build the college with leadership, leadership, and accountability. We want all students to experience a true transformation as a result of the efforts of others. I want to build a college with diverse and talented'}]

In [30]:
prompt('I want to act as army')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'I want to act as army chief in your own country."\n\n"I am afraid you must give permission for a war against Israel," said Abu Hanif, adding that he intended to seize the capital of the enemy.\n\n"I have'}]

# **Evaluate with perplexity for test dataset**

In [31]:
# Evaluate perplexity on the test dataset
eval_result = trainer.evaluate(eval_dataset=test_dataset)
print("Perplexity:", eval_result['eval_loss'])

Perplexity: 3.5347306728363037


# **First 20 example of the Test Dataset**

In [50]:
def read_text_file(file_path, num_examples=5):
    with open(file_path, 'r') as file:
        content = file.read()
        examples = content.split('  ')  # Assuming '  ' is the separator between examples
        return examples[:num_examples]

test_file_path = '/content/drive/MyDrive/Prompt Generation/test_dataset.txt'

print("\nExamples from the Test Dataset:")
first_20_examples = read_text_file(test_file_path, num_examples=20)

# Display the first 20 examples
for i, example in enumerate(first_20_examples, start=1):
    print(f"Example {i}: {example}")

# Store the first 20 examples in a list
list_of_20_examples = first_20_examples


Examples from the Test Dataset:
Example 1: I want you to act as a travel guide. I will write you my location and you will suggest a place to visit near my location. In some cases, I will also give you the type of places I will visit. You will also suggest me places of similar type that are close to my first location. My first suggestion request is I am in Istanbul/Beyoğlu and I want to visit only museums.
Example 2: I want you to act as a Socrat. You must use the Socratic method to continue questioning my beliefs. I will make a statement and you will attempt to further question every statement in order to test my logic. You will respond with one line at a time. My first claim is justice is neccessary in a society
Example 3: I want you to act as an astrologer. You will learn about the zodiac signs and their meanings, understand planetary positions and how they affect human lives, be able to interpret horoscopes accurately, and share your insights with those seeking guidance or advice. 

# **Generating Prompts without any prompt**

In [41]:
from transformers import pipeline

test_dataset_file = '/content/drive/MyDrive/Prompt Generation/test_dataset.txt'

# Initialize the text generation pipeline
prompt_pipeline = pipeline('text-generation', model='./gpt2-promptgeneration', tokenizer='gpt2')

# Read your test dataset
with open(test_dataset_file, 'r') as file:
    test_content = file.read()

# Split the content into individual prompts based on the delimiter ('  ')
test_prompt = test_content.split('  ')

# Function to generate prompts based on the test dataset
def generate_prompts_from_test_dataset(prompt_pipeline, test_dataset, num_prompts: int):
    generated_prompts = []
    for i in range(num_prompts):
        # Extract the first 6 words from each prompt as the input prompt
        words = test_dataset[i].split()[:6] if i < len(test_dataset) else test_dataset[-1].split()[:6]
        input_prompt = ' '.join(words)
        generated = prompt_pipeline(input_prompt, max_length=128)  # Adjust max_length as needed
        generated_prompts.append(generated[0]['generated_text'])
    return generated_prompts

# Generate prompts based on the test dataset
num_prompts_to_generate = 20  # Define the number of prompts to generate
generated_prompts_from_test = generate_prompts_from_test_dataset(prompt_pipeline, test_prompt, num_prompts_to_generate)

# Display the generated prompts
for i, generated_prompt in enumerate(generated_prompts_from_test, start=1):
    print(f"Generated Prompt {i}:\n{generated_prompt}\n")

# Save the generated prompts in a list
list_of_generated_prompts = generated_prompts_from_test


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to 

Generated Prompt 1:
I want you to act as an agent.

If you want to do something with someone, I want you to do it voluntarily.

If you want to act in a threatening manner, I want you to act with the person you want to engage with.

When you commit an act of violent harassment, you have not committed an unreasonable violation of law. If you commit such an act and you intend to respond with a threateningly abusive response, I would like you to cooperate with me in trying to gain permission to leave the house. At least one of you is a member of my organization and you are required to sign

Generated Prompt 2:
I want you to act as one, and show us to you how you act," she wrote.

"I want you to speak at our public events," the letter continued. "Please make these speeches along with your messages that I will use (such as my personal history), my email address and other important information about you. Be concise, strong and clear about who I am and what I want you to accomplish."

"I am yo

In [42]:
print(len(list_of_generated_prompts))

20


In [43]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=0db808fde26c6b6989f04a7bd3fd6a944383682d064e0be048ae6c0e2df24588
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [44]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [45]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading jiwer-3.0.5-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.5 rapidfuzz-3.11.0


In [46]:
from evaluate import load
# Load the evaluation metric for Character Error Rate (CER) and Word Error Rate (WER) and Exact Match(em)
cer_metric = load("cer")
wer_metric = load("wer")
meteor = load('meteor')
exact_match_metric = load("exact_match")

# Load BLEU and ROUGE metrics
bleu_metric = load("bleu")
rouge_metric = load('rouge')


Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/5.67k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [51]:
# Calculate Character Error Rate (CER) and Word Error Rate (WER)
results_CER = cer_metric.compute(predictions=list_of_generated_prompts, references=list_of_20_examples)
results_WER = wer_metric.compute(predictions=list_of_generated_prompts, references=list_of_20_examples)

# Calculate Exact Match (EM) and METEOR(M)
results_em = exact_match_metric.compute(predictions=list_of_generated_prompts, references=list_of_20_examples)
results_met = meteor.compute(predictions=list_of_generated_prompts, references=list_of_20_examples)

# Calculate Bilingual Evaluation Understudy (BLEU)
results_bleu = bleu_metric.compute(predictions=list_of_generated_prompts, references=list_of_20_examples)

# **Evaluation Metrics**

In [52]:
print("Character Error Rate for Story Generation:", results_CER)
print("Word Error Rate for Story Generation:",results_WER)
print("Exact Match for Story Generation:",results_em)
print("BLEU Score for Story Generation:",results_bleu)
print("METEOR for Story Generation:",results_met)

Character Error Rate for Story Generation: 0.9005332462727174
Word Error Rate for Story Generation: 1.2075823492852704
Exact Match for Story Generation: {'exact_match': 0.0}
BLEU Score for Story Generation: {'bleu': 0.0679862208438653, 'precisions': [0.24371859296482412, 0.06630067567567567, 0.042163543441226574, 0.0313573883161512], 'brevity_penalty': 1.0, 'length_ratio': 1.3135313531353134, 'translation_length': 2388, 'reference_length': 1818}
METEOR for Story Generation: {'meteor': 0.21111723186922018}
