# Mock user data

Suppose we want to mock some data for testing purposes, and we know the schema, but don't have any data we can use. 

How can we use LLMs to generate this data for us?

## Set up + imports

In [1]:
import json
import random

import pandas as pd
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI

from creds import openai_key # api keys etc.

## Our use case

For our use case, let's generate customer feedback and NPS data.

In [2]:
# Use case config
# Every value below is substituted into the prompt template by name.

# What kind of records to mock, and the kind of business they belong to
data_type = "customer feedback"
business_type = "coffee shop"

# Output schema, passed to the model verbatim.
# Text after each '#' is an instruction to the model, not a Python comment.
data_output_fields = """
id: #numeric
customer_name: #first and last. Pick a name that has the favourite number of the id
score: #out of 10
comment: #can be empty, can be up to 2 sentences
date: #yyyy-mm-dd
"""

# Batch settings: id of the first generated row, and rows requested per call
start_id = 1
num_rows_per_iteration = 3

# Random integer in [1, 100000] that is written into the prompt text
seed = random.randint(1, 100000)


In [3]:
# Prompt template for one generation request.
# Each {placeholder} in the text is listed in input_variables and must be
# supplied when the template is formatted or run through a chain.
prompt = PromptTemplate(
    template="""
    Generate some {data_type} data for a {business_type}. Use lots of different personas. 
    
    Generate data randomly with a seed of {seed}. 
    
    Generate {num_rows_per_iteration} rows. Start from an ID of {start_id}.
    
    Your output should be json with the following fields:
    ```
    {data_output_fields}
    ```
    """,
    input_variables=[
        "data_type",
        "business_type",
        "data_output_fields",
        "start_id",
        "num_rows_per_iteration",
        "seed",
    ],
)

In [4]:
# Set up the LLM chain

# Chat model client, authenticated with the key imported from creds;
# temperature 0.7 leaves some sampling randomness in the responses
llm = ChatOpenAI(openai_api_key=openai_key, temperature=0.7)

# Chain that pairs the prompt template with the chat model
data_generation_chain = LLMChain(prompt=prompt, llm=llm)

In [5]:
# Preview the prompt

# Gather the current config values under the template's variable names
preview_inputs = {
    "data_type": data_type,
    "business_type": business_type,
    "data_output_fields": data_output_fields,
    "start_id": start_id,
    "num_rows_per_iteration": num_rows_per_iteration,
    "seed": seed,
}

# Fill in the template so the exact prompt text can be inspected
final_prompt = prompt.format(**preview_inputs)

print(final_prompt)


    Generate some customer feedback data for a coffee shop. Use lots of different personas. 
    
    Generate data randomly with a seed of 44813. 
    
    Generate 3 rows. Start from an ID of 1.
    
    Your output should be json with the following fields:
    ```
    
id: #numeric
customer_name: #first and last. Pick a name that has the favourite number of the id
score: #out of 10
comment: #can be empty, can be up to 2 sentences
date: #yyyy-mm-dd

    ```
    


In [6]:
# Generate data

# Run the chain once with the current config; the result is the model's
# reply as a single string, printed below as-is
mock_data = data_generation_chain.run(
    **dict(
        data_type=data_type,
        business_type=business_type,
        data_output_fields=data_output_fields,
        start_id=start_id,
        num_rows_per_iteration=num_rows_per_iteration,
        seed=seed,
    )
)
print(mock_data)

[
    {
        "id": 1,
        "customer_name": "Oliver Wilson",
        "score": 8,
        "comment": "The latte was great, but the music was a bit too loud for my liking.",
        "date": "2022-08-01"
    },
    {
        "id": 2,
        "customer_name": "Emma Jones",
        "score": 9,
        "comment": "The cappuccino was perfect and the atmosphere was cozy and inviting.",
        "date": "2022-08-02"
    },
    {
        "id": 3,
        "customer_name": "William Taylor",
        "score": 6,
        "comment": "The coffee tasted burnt and the service was slow, but the pastries were delicious.",
        "date": "2022-08-03"
    }
]


In [7]:
# Convert json to dataframe
# (json and pandas are imported in the notebook's import cell, so every
# dependency is visible in one place.)

# Parse the model's reply and build a DataFrame from the parsed records.
# NOTE: this assumes the reply is bare JSON; json.loads raises
# json.JSONDecodeError if the model adds any surrounding text.
mock_df = pd.DataFrame(json.loads(mock_data))

# Display the DataFrame
mock_df

Unnamed: 0,id,customer_name,score,comment,date
0,1,Oliver Wilson,8,"The latte was great, but the music was a bit t...",2022-08-01
1,2,Emma Jones,9,The cappuccino was perfect and the atmosphere ...,2022-08-02
2,3,William Taylor,6,The coffee tasted burnt and the service was sl...,2022-08-03


## We can use the same chain to generate more rows (but beware of duplicates)

In [8]:
# Reset start id to continue where we left off.
# The id column is selected by name, not by position: the model decides the
# key order of its JSON, so the first column is not guaranteed to be the id.
max_value = mock_df["id"].max()
# Cast to a plain int so the prompt shows e.g. "4" even if pandas stored the
# ids as a numpy integer or float dtype.
start_id = int(max_value) + 1
# Draw a new seed so this request's prompt text differs from the previous one
seed = random.randint(1, 100000)

# Generate a new set of rows
mock_data = data_generation_chain.run(data_type=data_type,
                                      business_type=business_type,
                                      data_output_fields=data_output_fields,
                                      start_id=start_id,
                                      num_rows_per_iteration=num_rows_per_iteration,
                                      seed=seed)

# Turn the rows into a dataframe (assumes the reply is bare JSON)
temp_df = pd.DataFrame(json.loads(mock_data))

# Append to mock_df, renumbering the index so it stays 0..n-1.
# Re-running this cell appends another batch each time.
mock_df = pd.concat([mock_df, temp_df], ignore_index=True)

In [9]:
# Display the accumulated rows so the batches can be compared side by side
mock_df

Unnamed: 0,id,customer_name,score,comment,date
0,1,Oliver Wilson,8,"The latte was great, but the music was a bit t...",2022-08-01
1,2,Emma Jones,9,The cappuccino was perfect and the atmosphere ...,2022-08-02
2,3,William Taylor,6,The coffee tasted burnt and the service was sl...,2022-08-03
3,4,Ethan Smith,8,"Great coffee, but the seating could be more co...",2022-05-01
4,5,Lila Rodriguez,9,I love the ambiance of this coffee shop! The m...,2022-05-02
5,6,Oliver Jackson,7,,2022-05-03


Note that we can get duplicate names or comments, because the model is not told which values were already generated in earlier batches.

Passing a new random seed in the prompt on each call seems to help with this.

We can run it a few more times to see whether the seed has truly helped.

In [None]:
# Number of additional batches to request
num_iterations = 5

for iteration in range(num_iterations):
    # Reset start id to continue where we left off.
    # The id column is selected by name, not by position: the model decides
    # the key order of its JSON, so the first column may not be the id.
    max_value = mock_df["id"].max() if not mock_df.empty else 0
    # Cast to a plain int so the prompt never shows a float such as "7.0"
    start_id = int(max_value) + 1
    # New seed per request so each prompt's text differs
    seed = random.randint(1, 100000)

    # Generate a new set of rows
    mock_data = data_generation_chain.run(data_type=data_type,
                                          business_type=business_type,
                                          data_output_fields=data_output_fields,
                                          start_id=start_id,
                                          num_rows_per_iteration=num_rows_per_iteration,
                                          seed=seed)

    # Turn the rows into a DataFrame (assumes the reply is bare JSON)
    temp_df = pd.DataFrame(json.loads(mock_data))

    # Append inside the loop on purpose: if a later request or parse fails,
    # the rows already collected stay in mock_df. The frames are tiny and
    # there are only a handful of iterations, so repeated concat is cheap.
    mock_df = pd.concat([mock_df, temp_df], ignore_index=True)

In [None]:
# Inspect all accumulated rows to check for repeated names or comments
mock_df