# Synthetic Data Generation


In [1]:
import json
import sys
import csv
sys.path.append('..')


import tinytroupe
from tinytroupe.agent import TinyPerson
from tinytroupe.environment import TinyWorld, TinySocialNetwork
from tinytroupe.factory import TinyPersonFactory
from tinytroupe.extraction import default_extractor as extractor
from tinytroupe.extraction import ResultsReducer
import tinytroupe.control as control


!!!!
DISCLAIMER: TinyTroupe relies on Artificial Intelligence (AI) models to generate content. 
The AI models are not perfect and may produce inappropriate or inacurate results. 
For any serious or consequential use, please review the generated content before using it.
!!!!

Looking for default config on: /home/dariast/agent_simulation/TinyTroupeOllama/examples/../tinytroupe/config.ini
Found custom config on: /home/dariast/agent_simulation/TinyTroupeOllama/examples/config.ini

Current TinyTroupe configuration 
[OpenAI]
api_type = ollama
azure_api_version = 2023-05-15
model = gpt-4o
max_tokens = 4000
temperature = 0.3
freq_penalty = 0.0
presence_penalty = 0.0
timeout = 60
max_attempts = 5
waiting_time = 1
exponential_backoff_factor = 5
embedding_model = text-embedding-3-small
cache_api_calls = False
cache_file_name = openai_api_cache.pickle
max_content_display_length = 1024

[Simulation]
rai_harmful_content_prevention = True
rai_copyright_infringement_prevention = True

[Logging]
logle

Let's create the kinds of agents whose conversations we want to collect as data.

In [2]:
# Factory used below to generate the agents; the string is the free-text
# description handed to TinyPersonFactory (presumably the context every
# generated persona is drawn from — confirm against the TinyTroupe docs).
factory = TinyPersonFactory("A random knowledge worker in a company providing marketing services.")

In [3]:
# Ask the factory for two agents, printing each mini-bio as soon as the
# agent exists and only then adding the agent to the list.
people = []
for _ in range(2):
    person = factory.generate_person(temperature=1.6)
    print(person.minibio())
    people += [person]

# Final expression of the cell: displays how many agents were collected.
len(people)

Astrid is a 42 year old Marketing Specialist, French, currently living in France.
Lorenzo is a 29 year old Marketing Specialist, Italian, currently living in Italy.


2

In [5]:
# World named "Some Corp Inc." built from the list of generated agents.
company = TinyWorld("Some Corp Inc.", people)

In [6]:
# NOTE(review): presumably lets every agent in the world reach every other
# agent so they can message each other — confirm against the TinyWorld API.
company.make_everyone_accessible()

In [7]:
# Send one instruction to the world. In the recorded extraction output this
# message appears as the entry authored by "USER".
company.broadcast("Message each other to get work done.")

In [8]:
# Run the simulation with an argument of 2 (presumably the number of steps).
# NOTE(review): the recorded execution of this cell ended in KeyboardInterrupt,
# so outputs recorded further down may come from a different session.
company.run(2)

KeyboardInterrupt: 

We can now extract the conversations, which form the synthetic corpus we wanted.

In [8]:
# Inspect the first agent's interactions (presumably pretty-printed to the
# cell output — confirm against TinyPerson.pp_current_interactions).
people[0].pp_current_interactions()

In [None]:
# Reducer that receives one rule per event name (registered further down in
# this cell) and is later used to turn an agent into a dataframe.
reducer = ResultsReducer()

def aux_extract_content(focus_agent: TinyPerson, source_agent:TinyPerson, target_agent:TinyPerson, kind:str, event: str, content: str, timestamp:str):
    """Reduce one recorded event to an ``(author, content)`` pair.

    For a "TALK" event the author is ``focus_agent.name``. For a
    "CONVERSATION" event the author is ``source_agent.name``, or the literal
    "USER" when ``source_agent`` is None. The pair is printed and returned.

    ``target_agent``, ``kind`` and ``timestamp`` are accepted but not used.

    Raises:
        ValueError: if ``event`` is neither "TALK" nor "CONVERSATION".
    """
    # Reject unsupported events up front.
    if event not in ("TALK", "CONVERSATION"):
        raise ValueError(f"Unknown event: {event}")

    if event == "TALK":
        speaker = focus_agent.name
    else:
        # A CONVERSATION entry without a source agent is attributed to the user.
        speaker = "USER" if source_agent is None else source_agent.name

    record = (speaker, content)
    print(record)
    return record
    


# Both event names are reduced by the same extractor function.
for event_name in ("TALK", "CONVERSATION"):
    reducer.add_reduction_rule(event_name, aux_extract_content)

Finally, we reduce the first agent's interactions to a dataframe and save it as a `.csv` file for later use in other applications.

In [10]:
# Apply the registered rules to the first agent; the two column names match
# the (author, content) pairs returned by aux_extract_content.
df = reducer.reduce_agent_to_dataframe(people[0], column_names=["author", "content"])
# Final expression of the cell: display the resulting dataframe.
df

('USER', 'Message each other to get work done.')
('Samantha', "Messaging can be a great tool for quick updates and clarifications. It's important to ensure everyone is on the same page and has access to the information they need.")
('Liam', "Absolutely, Samantha. Messaging helps us stay connected and informed, which is crucial for our team's success.")
('Samantha', 'I completely agree, Liam. Consistent communication through messaging can really streamline our processes and ensure everyone is aligned.')
('Liam', "I'm glad we're on the same page, Samantha. Let's continue to use messaging effectively to keep our team aligned and informed.")


Unnamed: 0,author,content
0,USER,Message each other to get work done.
1,Samantha,Messaging can be a great tool for quick update...
2,Liam,"Absolutely, Samantha. Messaging helps us stay ..."
3,Samantha,"I completely agree, Liam. Consistent communica..."
4,Liam,"I'm glad we're on the same page, Samantha. Let..."


In [11]:
# Write the dataframe to disk; index=False leaves the row index out of the file.
# NOTE(review): the path is relative to the notebook's working directory and
# pandas does not create missing directories — ../data/extractions must exist.
df.to_csv("../data/extractions/synthetic_data_generation.out.csv", index=False)