# 1. 🚀 **Supercharge your rewrite_prompt analysis with Sentence Transformers!**📚💡

- Install dependencies for Sentence Transformers from Kaggle.
- Load essential libraries: pandas, numpy, tqdm, TensorFlow, and Sentence Transformers.
- Dive into advanced text processing and analysis effortlessly!


In [1]:
# Ref : https://www.kaggle.com/code/cpmpml/sentence-transformers

!python -m pip install -q --no-index --find-links=../input/sentence-transformers -r ../input/sentence-transformers/requirements.txt

In [2]:
import pandas as pd
import numpy as np
from typing import Iterable
import enum
from tqdm.autonotebook import tqdm
from sentence_transformers import SentenceTransformer, util

import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text  # Registers the ops.

# https://github.com/tensorflow/tensorflow/issues/35264
# Turn on memory growth for every visible GPU instead of indexing gpus[0]:
# the loop is a no-op on a CPU-only runtime (where gpus[0] would raise
# IndexError) and keeps the setting identical across all devices.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

  from tqdm.autonotebook import tqdm
2024-03-15 10:15:00.193034: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-15 10:15:00.193169: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-15 10:15:00.314793: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# 2. 📊 Collect some public prompt data. Embed it and find which parts of the embedding space are well represented.📑🔍
1. Load the aggregated prompts dataset from '/kaggle/input/concat-prompts/prompts.csv' which is a collection of prompts from the source datasets 1->7 [Here's where to find them!](https://www.kaggle.com/competitions/llm-prompt-recovery/discussion/481811) (Check and upvote them for further insights and context.)
2. Utilize advanced embedding techniques to map prompts into a high-dimensional space.
3. Analyze the distribution of prompts within the embedding space to identify regions with dense representation.
4. Gain valuable insights into the diversity and coverage of prompt data for text generation tasks.


In [3]:
# Load the aggregated prompt collection; the trailing bare expression renders
# the frame as the cell output.
prompt_df = pd.read_csv('/kaggle/input/concat-prompts/prompts.csv')
prompt_df

Unnamed: 0,prompt
0,Convert the text into a vintage circus poster ...
1,Convert the text into a social media platform'...
2,Rewrite this as a college course description.
3,"Rephrase this as a debate on furniture rights,..."
4,Make the text into a home improvement expert's...
...,...
8258,Recalibrate the text to reflect the cryptic an...
8259,Remake the material to imitate the intricate a...
8260,Realign the content to mirror the detailed and...
8261,Recompose the document to display the dynamic ...


In [4]:
class Backend(enum.Enum):
    """Embedding backends that ``embed_text`` can run."""
    # TensorFlow Hub KerasLayer.
    TF = "TF"
    # PyTorch SentenceTransformer checkpoint.
    TORCH = "TORCH"
    

def embed_text(texts: Iterable[str], backend: Backend = Backend("TORCH")) -> np.ndarray:
    """
    Embed texts with a Sentence-T5 base model.

    Args:
        texts: The strings to embed. Any iterable is accepted (list,
            generator, pandas Series with any index); it is materialised
            into a list before encoding.
        backend: ``Backend.TORCH`` (default) encodes with the
            SentenceTransformer checkpoint and L2-normalises the output.
            ``Backend.TF`` runs the TensorFlow Hub layer in batches of 32.
            The raw values ``"TORCH"`` / ``"TF"`` are accepted as well.

    Returns:
        The embeddings, one row per input text, in input order.

    Raises:
        ValueError: If ``backend`` is not a valid ``Backend`` value.
    """
    # Coerce first: a plain "TORCH" string never equals Backend.TORCH and
    # would otherwise silently fall through to the TensorFlow branch.
    backend = Backend(backend)
    # Materialise once so both branches see a sized, positionally indexed
    # sequence regardless of what kind of iterable was passed in.
    texts = list(texts)

    if backend is Backend.TORCH:
        embedder = SentenceTransformer("/kaggle/input/sentence-t5-base-hf/sentence-t5-base")
        return embedder.encode(texts, normalize_embeddings=True)

    # NOTE(review): this branch does not normalise explicitly - confirm the hub
    # model already returns unit-length vectors before treating dot products
    # of its output as cosine similarities.
    text_tensor = tf.constant(texts)
    embedder = hub.KerasLayer("/kaggle/input/sentence-t5/tensorflow2/st5-base/1")

    # Embed in fixed-size batches rather than in a single call.
    batch_size = 32
    embedded_texts = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = text_tensor[start:start + batch_size]
        # The layer returns a sequence; its first element holds the embeddings.
        embedded_texts.append(embedder(batch_texts)[0].numpy())

    return np.concatenate(embedded_texts, axis=0)
    
    

def cosine_cube_similarity(embeddings1: np.ndarray, embeddings2: np.ndarray) -> np.ndarray:
    """
    Return the cubed pairwise dot products of two sets of embeddings.

    Entry ``[i, j]`` is ``dot(embeddings1[i], embeddings2[j]) ** 3``. When the
    rows are unit-length this is the cube of their cosine similarity; no
    normalisation is performed here.
    """
    pairwise_dot = np.dot(embeddings1, embeddings2.T)
    # An odd power keeps the sign of each dot product.
    return pairwise_dot ** 3

def topk_cosine_similarity(
    query_embeddings: np.ndarray, 
    corpus_embeddings: np.ndarray, 
    k: int = 5
) -> np.ndarray:
    """
    Return the indices of the k corpus embeddings most similar to each query.

    Similarity is the cubed dot product computed by ``cosine_cube_similarity``.

    Args:
        query_embeddings: One row per query.
        corpus_embeddings: One row per corpus item.
        k: Number of indices to keep per query (default 5).

    Returns:
        Integer index array with one row per query and ``min(k, n_corpus)``
        columns, ordered by ascending similarity (the best match is last).

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    cosine_cube_matrix = cosine_cube_similarity(query_embeddings, corpus_embeddings)
    ranking = np.argsort(cosine_cube_matrix, axis=1)
    # Slice from an explicit start column: ``ranking[:, -k:]`` would return
    # every column for k == 0 because ``-0`` is just ``0``.
    start = max(ranking.shape[1] - k, 0)
    return ranking[:, start:]

def botk_cosine_similarity(
    query_embeddings: np.ndarray,
    corpus_embeddings: np.ndarray,
    k: int = 5
) -> np.ndarray:
    """
    Return the indices of the k corpus embeddings least similar to each query.

    Similarity is the cubed dot product from ``cosine_cube_similarity``; each
    row of the result is ordered by ascending similarity (least similar first).
    """
    scores = cosine_cube_similarity(query_embeddings, corpus_embeddings)
    ascending_order = scores.argsort(axis=1)
    # The first k columns of an ascending sort are the weakest matches.
    return ascending_order[:, :k]


def create_similarity_df(
    texts: Iterable[str],
    embeded_texts = None,
    k: int = 5
) -> pd.DataFrame:
    """
    Create a DataFrame listing, for every prompt, its k most similar prompts.

    Args:
        texts: The prompts. Any iterable is accepted; items are addressed by
            position, so a pandas Series may carry any index.
        embeded_texts: Optional precomputed embeddings, one row per text in
            the same order. Computed with ``embed_text`` when omitted.
        k: Number of neighbours to report per prompt (default 5). Clamped to
            ``len(texts) - 1`` when fewer other prompts exist.

    Returns:
        A DataFrame with a ``prompt`` column followed by ``similar_1`` ..
        ``similar_k`` and ``score_1`` .. ``score_k``. Neighbours are ordered
        by ascending score (``similar_k`` is the closest); scores are the
        cubed dot products from ``cosine_cube_similarity``.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Object array gives positional scalar and fancy indexing for any input
    # (``texts[i]`` on a Series is label-based and fails for lists/generators).
    text_array = np.asarray(list(texts), dtype=object)
    k = min(k, max(len(text_array) - 1, 0))
    columns = (
        ["prompt"]
        + [f"similar_{j}" for j in range(1, k + 1)]
        + [f"score_{j}" for j in range(1, k + 1)]
    )
    if len(text_array) == 0:
        return pd.DataFrame(columns=columns)

    if embeded_texts is None:
        embeded_texts = embed_text(text_array.tolist())

    # Compute the similarity matrix once and rank it here, rather than letting
    # a top-k helper rebuild the same n x n matrix a second time.
    cosine_cube_matrix: np.ndarray = cosine_cube_similarity(embeded_texts, embeded_texts)
    # k + 1 candidates per row because a prompt is normally its own best match.
    candidates: np.ndarray = np.argsort(cosine_cube_matrix, axis=1)[:, -(k + 1):]

    similarities = []
    for i, similar_indices in enumerate(candidates):
        sim_indices = list(similar_indices)
        if i in sim_indices:
            sim_indices.remove(i)
        else:
            # Ties (e.g. exact duplicates) can push a prompt out of its own
            # candidate list; drop the weakest candidate to keep k neighbours.
            sim_indices = sim_indices[1:]
        similarities.append(
            [text_array[i], *text_array[sim_indices], *cosine_cube_matrix[i, sim_indices]]
        )

    return pd.DataFrame(similarities, columns=columns)

# Embed every prompt once and reuse the embeddings for the similarity table,
# so create_similarity_df does not have to embed the prompts again.
embeded_texts = embed_text(prompt_df.prompt)
similarity_df = create_similarity_df(prompt_df.prompt, embeded_texts = embeded_texts)

  return self.fget.__get__(instance, owner)()


Batches:   0%|          | 0/259 [00:00<?, ?it/s]

In [5]:
# Let rendered frames show up to 200 characters per cell.
pd.set_option('display.max_colwidth', 200)

# min_score is the weakest of a prompt's five neighbour scores: a high value
# means even its fifth-closest neighbour is still very similar.
score_columns = [f"score_{i}" for i in range(1, 6)]
similarity_df['min_score'] = similarity_df[score_columns].min(axis=1)
similarity_df.sort_values(by='min_score', ascending=False)

Unnamed: 0,prompt,similar_1,similar_2,similar_3,similar_4,similar_5,score_1,score_2,score_3,score_4,score_5,min_score
7297,Restyle this text as if it were written by a knight from medieval knight.,Restyle this text as if it were written by a medieval knight from medieval Europe.,Restyle this text as if it were written by a knight from medieval Europe.,Restyle this text as if it were written by a knight from knight.,Restyle this text as if it were written by a medieval knight from medieval knight.,Restyle this text as if it were written by a medieval knight from knight.,0.960317,0.960541,0.978379,0.991623,0.993673,0.960317
7474,Restyle this text as if it were written by a medieval knight from medieval Europe.,Restyle this text as if it were written by a knight from medieval knight.,Restyle this text as if it were written by a medieval knight from knight.,Restyle this text as if it were written by a medieval knight from medieval knight.,Restyle this text as if it were written by a medieval Europe from medieval knight.,Restyle this text as if it were written by a knight from medieval Europe.,0.960317,0.961244,0.963048,0.990104,0.994954,0.960317
7604,Restyle this text as if it were written by a medieval knight from knight.,Restyle this text as if it were written by a knight from medieval Europe.,Restyle this text as if it were written by a medieval knight from medieval Europe.,Restyle this text as if it were written by a knight from knight.,Restyle this text as if it were written by a medieval knight from medieval knight.,Restyle this text as if it were written by a knight from medieval knight.,0.959693,0.961244,0.974554,0.991717,0.993673,0.959693
6477,"Imagine this text was a medieval knight in the world of futuristic AI, how would it be written?","Imagine this text was a knight in the world of AI, how would it be written?","Imagine this text was a futuristic AI in the world of knight, how would it be written?","Imagine this text was a medieval knight in the world of sci-fi robot, how would it be written?","Imagine this text was a AI in the world of medieval knight, how would it be written?","Imagine this text was a medieval knight in the world of AI, how would it be written?",0.957715,0.959602,0.960880,0.971138,0.984315,0.957715
7939,Revise the given text using a more formal tone and academic vocabulary.,Revise the given text using a more formal tone and professional vocabulary.,Rephrase the text using a more formal tone and academic vocabulary.,Revise the text using a more formal tone and vocabulary.,Revise the given text with a focus on incorporating a more formal tone and academic vocabulary.,Revise the given text using a more formal tone and academic language.,0.955892,0.956673,0.964502,0.969138,0.978841,0.955892
...,...,...,...,...,...,...,...,...,...,...,...,...
5650,"""Memo: Urgent! Wall at Jerome's house needs repair. Please schedule maintenance ASAP.""",2. Emphasize the importance of strong adhesives in repairing broken objects.,2. Transform the paragraph into an urgent notice demanding immediate action.,"3. Create a defiant and unconventional version of the following sentence: ""Please schedule maintenance ASAP.""","2. Generate a rebellious rewrite of the following phrase: ""Well at Jerome's house needs repair.""","5. Provide an alternative, rebellious interpretation of the following memo: ""Memo: Urgent! Well at Jerome's house needs repair. Please schedule maintenance ASAP.""",0.522563,0.525002,0.606238,0.658951,0.711895,0.522563
271,Rephrase this as a silent retreatâs daily schedule.,Rewrite the essay while highlighting the virtues of solitude and silent reflection,Explain this as if it were a schedule to be scheduled.,Convert this into a silent meditation guide.,Rephrase this as a routine to be routined.,Style it as a silent meditation retreat brochure.,0.521681,0.538566,0.566090,0.602229,0.603247,0.521681
79,"Turns the text to all caps, except for the last sentence",4. Modify a paragraph to include an element of surprise or unexpected detail at the end.,Transform the text using a different voice or tone while maintaining the original meaning.,Transform the content into a completely new text while maintaining the original message.,"Alternate the case of each letter, starting with lowercase.",Capitalize every letter that follows a punctuation mark.,0.520820,0.525490,0.528303,0.535774,0.576475,0.520820
4363,"Rewrite as if you were Waldo , but you 're from South America , and the person looking for you is your ex-girlfriend , Pippi Longstocking",Rewrite the prompt as if it's a break-up story with hilarious details,Rewrite the story from perspective of the last human alive searching for a place where they can be safe,Rewrite the story as if the finder is a professional detective.,Rewrite the story as a teenage girl pining for her crush that doesn't know she exist,Rewrite the essay from the perspective of a stalker who is trying to find Waldo for the wrong reasons,0.510762,0.511584,0.513991,0.526708,0.602770,0.510762


In [6]:
# PCA Analysis 
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Number of principal components to keep.
n_components = 10
# Perform PCA on the embeddings
pca = PCA(n_components=n_components)

# Fit on the prompt embeddings and project them in one step.
pca_embeddings = pca.fit_transform(embeded_texts)

# Create a DataFrame with the PCA coordinates (columns PC1..PC10) plus each
# prompt's min_score, i.e. the lowest score among its reported neighbours.
pca_df = pd.DataFrame(pca_embeddings, columns=[f'PC{k}' for k in range(1, n_components+1)])
pca_df['min_score'] = similarity_df['min_score']

In [None]:
import plotly.express as px

# Rebuild the PCA frame (same construction as in the PCA cell) and add the
# prompt text so it can be shown on hover.
pca_df = pd.DataFrame(pca_embeddings, columns=[f'PC{k}' for k in range(1, n_components+1)])
pca_df['min_score'] = similarity_df['min_score']
pca_df['prompt'] = similarity_df['prompt']

# Scatter the first two principal components, coloured by min_score.
fig = px.scatter(pca_df, x='PC1', y='PC2', color='min_score', hover_data=['prompt'], 
                 title='PCA Plot with Representativity')
fig.show()


# 3. 🔍🤖 Uncover Patterns in Embedded Texts through K-means Clustering! 📊
1. Utilize KMeans algorithm from sklearn.cluster to segment embedded texts into 5 clusters.
2. Enhance analysis by adding cluster labels to the similarity dataframe.
3. Visualize the clustered data in 2D scatter plots of the leading principal components (PC1 vs PC2, then PC2 vs PC3) using Plotly, highlighting clusters with different colors.
4. Explore and interpret the clustering results to gain deeper insights into the structure of the text data.


In [None]:
from sklearn.cluster import KMeans

n_clusters = 5

# Pin the seed so cluster ids are reproducible between runs: they are reused
# further down as keys of representant_dict, so they must not be reshuffled on
# every execution. n_init is set explicitly to scikit-learn's long-standing
# default of 10 so the result does not depend on the installed version's
# default.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
clusters = kmeans.fit_predict(embeded_texts)

# Attach each prompt's cluster label to the PCA frame (for plotting) and to
# prompt_df (for the per-cluster analysis).
pca_df['cluster'] = clusters
prompt_df['cluster'] = clusters

# Plot the clusters on the first two principal components using Plotly
fig = px.scatter(pca_df, x='PC1', y='PC2', color='cluster', hover_data=['prompt'],
                 title='K-means Clustering')
fig.show()





In [9]:
# Plot the same clusters on the second and third principal components.
fig = px.scatter(pca_df, x='PC2', y='PC3', color='cluster', hover_data=['prompt'],
                 title='K-means Clustering')
fig.show()

# 4. 🎯🔍 Choose a Representative Prompt for Each Cluster! 💡
1. Calculate the centroid for each cluster using the mean of embedded texts belonging to that cluster.
2. Normalize the cluster centroids to ensure consistent comparison.
3. Identify representative prompts for each cluster by finding the top cosine similarity between each centroid and all embedded texts.
4. Print and store the most representative prompt for each cluster in the `representant_dict`.
5. Gain insights into the characteristics and themes of each cluster through their representative prompts.


In [10]:
# Map each cluster id to the prompt whose embedding is closest to the
# cluster's normalised centroid.
representant_dict = {}
for cluster_id, sdf in prompt_df.groupby('cluster'):
    # sdf.index is used as row positions into embeded_texts; this relies on
    # prompt_df keeping the default 0..n-1 index it got from read_csv.
    cluster_centroid = np.mean(embeded_texts[sdf.index], axis= 0, keepdims=True)
    # Rescale the centroid to unit length before comparing it to the prompts.
    cluster_centroid = cluster_centroid / np.linalg.norm(cluster_centroid, axis=1, keepdims=True)
    representant = topk_cosine_similarity(cluster_centroid, embeded_texts, k=1)
    # Look the prompt up once and reuse it for both the log line and the dict.
    representant_prompt = prompt_df.loc[representant[0][0], "prompt"]
    print(f'Representant for cluster {cluster_id}: {representant_prompt}')
    representant_dict[cluster_id] = representant_prompt

Representant for cluster 0: Write the text as if it were a travel brochure
Representant for cluster 1: Rewrite the story from the perspective of the entity instead of the human
Representant for cluster 2: Restyle this text as if it were written by a noir detective from futuristic AI.
Representant for cluster 3: Rewrite the essay with a darker twist
Representant for cluster 4: 3. Reimagine the paragraph in a more playful and creative tone while maintaining the original meaning.


In [11]:
# Display the cluster id -> representative prompt mapping.
representant_dict

{0: 'Write the text as if it were a travel brochure',
 1: 'Rewrite the story from the perspective of the entity instead of the human',
 2: 'Restyle this text as if it were written by a noir detective from futuristic AI.',
 3: 'Rewrite the essay with a darker twist',
 4: '3. Reimagine the paragraph in a more playful and creative tone while maintaining the original meaning.'}

# 5. 🔮📝 Predict Prompt Clusters from Output Texts using a BERT Model!
1. Load the necessary datasets: sample_submission, test, and train.
2. Set up the device for inference, leveraging GPU if available.
3. Initialize the tokenizer and the pre-trained model for sequence classification.
4. Define a function `apply_model` to predict the prompt cluster for a given rewritten text.
5. Tokenize the rewritten text, pass it through the model, and predict the prompt cluster.
6. Handle exceptions gracefully and provide a fallback prompt if prediction fails.
7. Apply the model to each row of the test dataset, generating predicted prompt clusters.
8. Save the predictions to a CSV file named 'submission.csv' containing IDs and corresponding prompt clusters.


In [12]:
# Infer a model that predict -- prompt_cluster = f(output_text)
import pandas as pd
from pathlib import Path
import numpy as np
import torch

# Locations of the competition files and of the Kaggle input root.
COMPETITION_PATH = Path(r"/kaggle/input/llm-prompt-recovery/")
INPUT_PATH = Path(r"/kaggle/input")

# Only `test` is used further down in this notebook; `sample_submission`,
# `train` and INPUT_PATH are loaded/defined but not referenced again here.
sample_submission = pd.read_csv(COMPETITION_PATH / 'sample_submission.csv')
test = pd.read_csv(COMPETITION_PATH / 'test.csv')
train = pd.read_csv(COMPETITION_PATH / 'train.csv')

# Run inference on the first CUDA device when one is available, else on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the fine-tuned sequence classifier and its tokenizer from the attached
# dataset, and move the model to the selected device.
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/bert-base-promptclusterclassification/bertbase-rewrite-classif")
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/input/bert-base-promptclusterclassification/bertbase-rewrite-classif").to(device)

# Translate predicted class ids into the representative prompt of the cluster
# with the same id.
# NOTE(review): this assumes the classifier's label ids line up with the
# K-means cluster ids computed in this run - confirm against how the
# classifier was trained.
model.config.id2label = representant_dict

def apply_model(rewrite_text: str) -> str:
    """
    Predict the representative rewrite prompt for one rewritten text.

    The text is tokenized and classified with the module-level ``tokenizer``
    and ``model``; the predicted class id is mapped to a prompt through
    ``model.config.id2label``.

    Args:
        rewrite_text: The rewritten text to classify.

    Returns:
        The prompt mapped to the predicted class, or the generic fallback
        ``"Rewrite this text."`` when prediction fails for any reason.
    """
    try:
        # Truncate over-long inputs to the model's maximum length so they are
        # still classified instead of raising and silently getting the fallback.
        inputs = tokenizer(rewrite_text, return_tensors="pt", truncation=True).to(device)
        # Get the model's prediction
        model.eval()
        with torch.no_grad():
            outputs = model(**inputs)
        # Single-text batch: the arg-max over the label axis is one scalar id.
        predicted_class_idx = int(outputs.logits.argmax(dim=-1).item())
        return model.config.id2label[predicted_class_idx]
    except Exception:
        # Deliberate best-effort: one bad row must not abort the whole
        # submission. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return "Rewrite this text."

from tqdm.notebook import tqdm
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Predict a prompt for every rewritten text in the test set.
test['rewrite_prompt'] = test['rewritten_text'].progress_apply(apply_model)

# Write only the id and predicted prompt columns to the submission file.
test[['id', "rewrite_prompt"]].to_csv('submission.csv', index=False)

  0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
# Display the test frame with the predicted rewrite_prompt column.
test

Unnamed: 0,id,original_text,rewritten_text,rewrite_prompt
0,-1,The competition dataset comprises text passages that have been rewritten by the Gemma LLM according to some rewrite_prompt instruction. The goal of the competition is to determine what prompt was ...,"Here is your shanty: (Verse 1) The text is rewritten, the LLM has spun, With prompts so clever, they've been outrun. The goal is to find, the prompt so bright, To crack the code, and shine the lig...",Rewrite the essay with a darker twist
