# Data Generation

We are going to use this repository to generate training data as mentioned in the document using OpenAI API and prompt engineering

In [51]:
# Importing libraries and keys
import openai
import yaml
from yaml.loader import SafeLoader
with open('data/env.yml') as f:
    data = yaml.load(f, Loader=SafeLoader)
openai.organization = data["OPEN_API_ORG"]
openai.api_key = data["OPENAI_API_KEY"]

In [47]:
def get_completion(prompt, model="gpt-3.5-turbo",temperature=0.0):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message["content"]

In [16]:
example_row = """
“01:00:00”,“01:00:20”||“Alice”||“Hello everyone!”
"01:00:00"||"01:01:20","Bob","Today we are going to discuss about overall product metrics"
"12:01:34","12:01:50","Tay","Awesome, thanks for informing about that!"
"""
prompt = f"""
Generate an audio transcript of a multi turn conversation. \
The generated text should have similar structure to the example delimited with triple backticks

Example Transcription: ```{example_row}```
The conversation would have only 5 turns.
"""
repsonse = get_completion(prompt)
print(repsonse)

```
"00:00:00","00:00:05","John","Hey guys, how's everyone doing today?"
"00:00:05","00:00:10","Sarah","I'm doing pretty well, thanks for asking."
"00:00:10","00:00:15","Mark","Same here, just trying to stay productive."
"00:00:15","00:00:20","Emily","I'm feeling a bit overwhelmed, to be honest."
"00:00:20","00:00:25","John","Oh no, what's going on Emily?"


So We can generate a good multi-turn conversation here and in the desired format, but the transcripts do not contain any good information or action items, that canc be extracted.<br>
So we will now try to modify the prompt to describe the environment and if we want something to be discussed in the conversation

In [19]:
example_row = """
“01:00:00”,“01:00:20”,“Alice”,“Hello everyone!”
"01:00:00","01:01:20","Bob","Today we are going to discuss about overall product metrics"
"12:01:34","12:01:50","Tay","Awesome, thanks for informing about that!"
"""
conv_env = "weekly standup meeting of an app development team"
conv_topics = "bugs and next steps."
prompt = f"""
Generate an audio transcript of a multi turn conversation of a {conv_env}  \
discussing about  {conv_topics} \
The generated text should have similar structure to the example delimited with triple backticks

Example Transcription: ```{example_row}```
The conversation would have only 5 turns.
"""
repsonse = get_completion(prompt)
print(repsonse)

```
"09:00:00","09:00:10","John","Good morning everyone, let's start the weekly standup meeting."
"09:00:10","09:00:20","Alice","Hi John, I have an update on the new feature we are working on. We have completed the design and are now moving to the development phase."
"09:00:20","09:00:30","John","Great to hear that Alice. Bob, do you have any updates on the product metrics?"
"09:00:30","09:00:40","Bob","Yes John, we have seen a 10% increase in user engagement since the last update. However, we still need to work on reducing the app's loading time."
"09:00:40","09:00:50","John","Thanks for the update Bob. Tay, have you found any new bugs in the app?"
"09:00:50","09:01:00","Tay","Yes John, we have identified a few bugs in the payment gateway. We are working on fixing them as soon as possible."
"09:01:00","09:01:10","John","Thanks for letting us know Tay. So, what are the next steps for the team?"
"09:01:10","09:01:20","Alice","We will continue working on the new feature and aim to comple

So above we can see that we can generate audio transcript pretty effectively for a given setting and also what type of things it should include.<br><br>
But still there is a lot of noise and right now we just want to train a model to find action items and link them to the person mentioned in the same text. We should  modify our prompt accordingly nad also we can create labels as well

In [5]:
conv_env = "weekly standup meeting of a tech company"
conv_topics = "progress,issues and next steps."
transcript = """
start_time,end_time,speaker,text,labels
“10:00:00”, “10:01:50”, “Bob“, “... Alice, can you take the UX bug? ...”,{"text": "UX bug", "assignee": "Alice"}""
“12:25:00”, “12:25:30”, “Alice”, ”... We need to plan for offsite next month ...”,"{"text": "plan for offsite next month", "assignee": "UNKNOWN"}"
"12:01:34","12:01:50","Tay","Awesome, thanks for informing about that!","{text:"","assignee":"N/A"}"
"""
prompt = f"""
Create a csv containing 10 rows of audio transcript of multi turn conversation of a {conv_env} \
discussing about  {conv_topics}\
The csv should have a similar struture to the example csv \
delimited with triple backticks.

Make sure each entry has a valid name.

Example Transcription: ```{transcript}```
"""
response = get_completion(prompt)
print(response)


start_time,end_time,speaker,text,labels
"09:00:00","09:01:30","John","Good morning everyone, let's start with the progress update.","{"text": "progress update", "assignee": "UNKNOWN"}"
"09:01:31","09:02:45","Sarah","I have completed the backend development for the new feature.","{"text": "backend development for new feature", "assignee": "UNKNOWN"}"
"09:02:46","09:03:30","John","Great job, Sarah. Any issues or roadblocks?","{"text": "issues or roadblocks", "assignee": "UNKNOWN"}"
"09:03:31","09:04:15","Sarah","No, everything went smoothly.","{"text": "N/A", "assignee": "N/A"}"
"09:04:16","09:05:30","Tom","I am facing some issues with the API integration. I need some help.","{"text": "API integration issues", "assignee": "UNKNOWN"}"
"09:05:31","09:06:15","John","Sure, I will assign someone to help you. Next steps?","{"text": "next steps", "assignee": "UNKNOWN"}"
"09:06:16","09:07:30","Alice","We need to finalize the UI design for the new feature.","{"text": "finalize UI design for new f

As of now it is able data in some form but is treating every action as as task and labeling it we need to maybe give it a few more examples of data, and also the LLM model is trying to make all the transcripts related to each other, but we are not concerned with that

In [7]:
conv_env = "weekly standup meeting of a dev team building a machine learning"
# conv_topics = "progress,issues and next steps."
records = """
start_time,end_time,speaker,text,labels
“10:00:00”, “10:01:50”, “Bob“, “Alice, can you take the UX bug? ”,{"text": "UX bug", "assignee": "Alice"}""
“12:25:00”, “12:25:30”, “Alice”, ”... We need to plan for offsite next month ...”,"{"text": "plan for offsite next month", "assignee": "UNKNOWN"}"
"12:01:34","12:01:50","Tay","Awesome, thanks for informing about that!","{text:"","assignee":"NONE"}"
"09:04:16","09:05:30","Tom","I am facing some issues with the API integration. I need some help.","{"text": "", "assignee": "NONE"}"
"09:04:16","09:05:30","Dan","Mathew can you finetune the model","{"text": "finetune the model", "assignee":"Mathew"}"
"16:00:00","16:01:30","Tony","We need to update the security protocols for the new release, Sam can you look into it?","{"text": "update the security protocols for the new release", "assignee": "Sam"}
"14:00:00","14:01:30","Emily","Jen can you please update the documentation for the new release","{"text": "update the documentation for the new release", "assignee": "Jen"}"
"""
inc_records = """
"09:00:00","09:01:30","John","Good morning everyone, let's start with the updates. Alice, how's the progress on the new feature?","{"text": "progress on the new feature", "assignee": "Alice"}"
"11:15:00","11:16:30","Sarah","I have completed the testing for the new API, it's ready for deployment.","{"text": "testing for the new API", "assignee": "UNKNOWN"}"

"""
prompt = f"""
Create a csv containing 10 rows of audio transcript of conversation of a {conv_env} \
All the rows are not required to be of the same conversation. \
The csv should have a similar struture to the example csv \
delimited with triple backticks.

Example Records: ```{records}```
Don't use the example records in the generated text
"""
response = get_completion(prompt)
print(response)

start_time,end_time,speaker,text,labels
"09:00:00","09:01:30","Tom","Good morning everyone, let's start the meeting","{"text": "start the meeting", "assignee": "UNKNOWN"}"
"09:02:00","09:03:20","Sarah","I have some updates on the data preprocessing, should I share my screen?","{"text": "share screen for data preprocessing updates", "assignee": "UNKNOWN"}"
"09:05:00","09:06:30","Mike","I noticed some anomalies in the dataset, should we remove them?","{"text": "remove anomalies from dataset", "assignee": "UNKNOWN"}"
"09:10:00","09:11:30","Jane","I have some suggestions on the model architecture, can we discuss it?","{"text": "discuss model architecture suggestions", "assignee": "UNKNOWN"}"
"09:15:00","09:16:30","David","I have completed the initial training of the model, should I share the results?","{"text": "share initial model training results", "assignee": "UNKNOWN"}"
"09:20:00","09:21:30","Karen","I have some concerns about the model's performance on the test set, can we investigate

In [5]:
conv_env = "meeting of a dev team building a machine learning model, team standup of the frontend team, weekly standup of marketing team, teacher giving tasks to students"
# conv_topics = "progress,issues and next steps."
records = """
“10:00:00”, “10:01:50”, “Bob“, “Alice, can you take the UX bug? ”,{"text": "UX bug", "assignee": "Alice"}""
"09:04:16","09:05:30","Dan","Mathew can you finetune the model","{"text": "finetune the model", "assignee":"Mathew"}"
"16:00:00","16:01:30","Tony","We need to update the security protocols for the new release, Sam can you look into it?","{"text": "update the security protocols for the new release", "assignee": "Sam"}
"14:00:00","14:01:30","Emily","Jen can you please update the documentation for the new release","{"text": "update the documentation for the new release", "assignee": "Jen"}"
"""
inc_records = """
"09:00:00","09:01:30","John","Good morning everyone, let's start with the updates. Alice, how's the progress on the new feature?","{"text": "progress on the new feature", "assignee": "Alice"}"
"11:15:00","11:16:30","Sarah","I have completed the testing for the new API, it's ready for deployment.","{"text": "testing for the new API", "assignee": "UNKNOWN"}"

"""
n = 20
prompt = f"""
Generate  {n} rows  for a csv dataset having the following columns: start_time, end_time, speaker, text, labels.
The data in the text column in each row is a part of a transcript of a multi-turn conversation. \
The conversation can be of any of the following {conv_env}. \           
The rows of the csv should have a similar struture to the example rows \
delimited with triple backticks.

Example Records: ```{records}```
Don't use the example records in the generated text
The data in text columns of each row should are not required to be related to each other. \
Try to use as many different conversations as possible. \
The data in the text column of the csv should have a part where one person is assigning a task to another person. \
Also the text should have the task being assigned mentioned clearly and name of the person to whom the task is being assigned. \
"""
response = get_completion(prompt)
print(response)

"09:30:00","09:31:20","Sarah","Hey John, can you please work on the login page design?","{"text": "work on the login page design", "assignee": "John"}"
"10:15:00","10:16:30","David","Hey guys, we need to fix the performance issue in the app, can someone take this up?","{"text": "fix the performance issue in the app", "assignee": "team"}"
"11:00:00","11:01:30","Rachel","Hey Tom, can you please create a new landing page for the website?","{"text": "create a new landing page for the website", "assignee": "Tom"}"
"14:30:00","14:31:50","Mark","Hey Lisa, can you please update the product roadmap for the next quarter?","{"text": "update the product roadmap for the next quarter", "assignee": "Lisa"}"
"15:00:00","15:01:30","Alex","Hey team, we need to finalize the marketing strategy for the new product launch, can someone take this up?","{"text": "finalize the marketing strategy for the new product launch", "assignee": "team"}"
"09:00:00","09:01:30","Emma","Hey Peter, can you please test the ne

In [8]:
conv_env = "standup meet of AI team"
# conv_topics = "progress,issues and next steps."
records = """
“10:00:00”, “10:01:50”, “Bob“, “Alice, can you take the UX bug? ”,{"text": "UX bug", "assignee": "Alice"}""
"09:04:16","09:05:30","Dan","Mathew can you finetune the model","{"text": "finetune the model", "assignee":"Mathew"}"
"16:00:00","16:01:30","Tony","We need to update the security protocols for the new release, Sam can you look into it?","{"text": "update the security protocols for the new release", "assignee": "Sam"}
"14:00:00","14:01:30","Emily","The current documentation does not look to be in sync with the latest version of the relase. Jen, can you please update the documentation for the new release","{"text": "update the documentation for the new release", "assignee": "Jen"}"
"""

n = 5
prompt = f"""
Generate  {n} rows  for a csv dataset having the following columns: start_time, end_time, speaker, text, labels.
The data in the text column in each row is a part of a transcript of a multi-turn conversation. \
The conversation can be of any of the following {conv_env}. \           
The rows of the csv should have a similar struture to the example rows \
delimited with triple backticks.

Example Records: ```{records}```
Don't use the example records in the generated text
The data in text columns of each row should are not required to be related to each other. \
Try to use as many different conversations as possible. \
The data in the text column of the csv should have a part where one person is assigning a task to another person. \
Also the text should have the task being assigned mentioned clearly and name of the person to whom the task is being assigned. \
"""
response = get_completion(prompt,temperature=0.7)
print(response)

KeyboardInterrupt: 

In [5]:
text = "The project manager opens the meeting by stating that they will address functional design and then going over the agenda. The industrial designer gives his presentation, explaining how remote controls function and giving personal preference to a clear, simple design that upgrades the technology as well as incorporates the latest features in chip design. The interface specialist gives her presentation next, addressing the main purpose of a remote control. She pinpoints the main functions of on/off, channel-switching, numbers for choosing particular channels, and volume; and also suggests adding a menu button to change settings such as brightness on the screen. She gives preference to a remote that is small, easy to use, and follows some conventions. The group briefly discusses the possibility of using an LCD screen if cost allows it, since it is fancy and fashionable. The marketing expert presents, giving statistical information from a survey of 100 subjects. She prefers a remote that is sleek, stylish, sophisticated, cool, beautiful, functional, solar-powered, has long battery life, and has a locator. They discuss the target group, deciding it should be 15-35 year olds. After they talk about features they might include, the project manager closes the meeting by allocating tasks."

prompt = f"""
Generate a transcription of a multiturn conversation between a team of five people \
using the following summary, which is delimited by \
triple backticks.
Format your Response as a  python list of dictionary \
and each dictionary has the following keys: \
start_time,end_time,speaker,text
Use names of people in the conversation like Alice, Bob.
Try to add texts in which some kind of task being assigned to another person
Summary : ```{text}```
"""
response = get_completion(prompt)
print(response)

[
    {
        "start_time": "00:00:00",
        "end_time": "00:00:10",
        "speaker": "Project Manager",
        "text": "Good morning everyone. Today, we will be addressing the functional design of the remote control. Let's start by going over the agenda."
    },
    {
        "start_time": "00:00:10",
        "end_time": "00:05:30",
        "speaker": "Industrial Designer",
        "text": "Thank you. As an industrial designer, I believe that the remote control should have a clear and simple design that upgrades the technology while incorporating the latest features in chip design. This will make it easier for the user to navigate and understand. "
    },
    {
        "start_time": "00:05:30",
        "end_time": "00:10:20",
        "speaker": "Interface Specialist",
        "text": "I agree with the industrial designer. The main purpose of a remote control is to turn the device on/off, switch channels, choose particular channels, and adjust the volume. I suggest adding a men

In [3]:
text = 

prompt = f"""
Your task is to recreate a multi-turn conversation between a team \
using the given summary of the conversation delimited by ```.
Format your Response as a  python list of dictionary \
and each dictionary has the following keys: \
start_time,end_time,speaker,text
The time should be in the format HH:MM:SS. \
The value in the speaker key have to be proper names like Alice, Harry, Simon. \
Summary : ```{text}```
"""
response = get_completion(prompt)
print(response)

[{'start_time': '00:00:00', 'end_time': '00:00:10', 'speaker': 'Project Manager', 'text': 'they will address functional design and then going over the agenda.'},
{'start_time': '00:00:10', 'end_time': '00:02:30', 'speaker': 'Industrial Designer', 'text': 'explaining how remote controls function and giving personal preference to a clear, simple design that upgrades the technology as well as incorporates the latest features in chip design.'},
{'start_time': '00:02:30', 'end_time': '00:05:20', 'speaker': 'Interface Specialist', 'text': 'addressing the main purpose of a remote control. She pinpoints the main functions of on/off, channel-switching, numbers for choosing particular channels, and volume; and also suggests adding a menu button to change settings such as brightness on the screen. She gives preference to a remote that is small, easy to use, and follows some conventions.'},
{'start_time': '00:05:20', 'end_time': '00:06:30', 'speaker': 'Group', 'text': 'discusses the possibility of

In [48]:
# scenario = "standup meet of AI team, where they discuss about the progress of the project."
scenario = "standup meet of a AI team, planning for the next sprint."
# conv_topics = "progress,issues and next steps."
# example = """
# "14:00:00","14:01:30","Emily","Amit can you please create a document for the new release",
# “10:00:00”, “10:01:50”, “Bob“, “The Testing team performed some tests on the latest realease and have found some issues in using the login page.Alice, can you take this UX bug? 
# "16:00:00","16:01:30","Tony","We need to update the security protocols for the new release, Sam can you look into it?",
# "11:00:00","11:01:30","Ken","The current documentation does not look to be in sync with the latest version of the relase. Jen, can you please update the documentation for the new release",
# """
example = """
“10:00:00”, “10:01:50”, “Bob“, “The QA team did some tests on the latest realease and have found some issues in using the login page.Alice, can you take this UX bug? 
"11:00:00","11:01:30","Ken","The current documentation does not look to be in sync with the latest version of the relase. Jen, can you please update the documentation for the new release",
"""

n = 10
prompt = f"""
You have to do the following task:
1 - Generate a transcript of a multi-turn conversation for the given scenario.
2 - The conversation should have at least {n} turns.
3 - The agenda of any of the conversations generated should have discussion of action items, which should get assigned to a person.
4 - Find all the action items discussed during the conversation and to whom it was assigned.
An Examples of how each turn of a conversation should look like is given below delimited by <>:
Example: <{example}>

Scenario: {scenario}
"""
response = get_completion(prompt,temperature=0.0)
print(response)

“09:00:00”, “09:01:30”, “John”, “Good morning everyone, let's start with our standup meet for the next sprint. Who wants to go first?”
“09:01:30”, “09:02:00”, “Sarah”, “I can go first. Last sprint, I worked on the chatbot's natural language processing and it's now ready for testing.”
“09:02:00”, “09:03:00”, “John”, “Great work, Sarah. What's your plan for this sprint?”
“09:03:00”, “09:04:00”, “Sarah”, “I plan to work on integrating the chatbot with our backend systems.”
“09:04:00”, “09:05:00”, “John”, “Sounds good. Anyone else?”
“09:05:00”, “09:06:00”, “Mike”, “I worked on the UI for the chatbot last sprint and it's ready for testing as well.”
“09:06:00”, “09:07:00”, “John”, “Awesome, Mike. What's your plan for this sprint?”
“09:07:00”, “09:08:00”, “Mike”, “I plan to work on improving the chatbot's error handling and adding more user-friendly error messages.”
“09:08:00”, “09:09:00”, “John”, “Great plan. Anyone else?”
“09:09:00”, “09:10:00”, “Jen”, “I worked on the backend systems last 

In [50]:
print(prompt)


You have to do the following task:
1 - Generate a transcript of a multi-turn conversation for the given scenario.
2 - The conversation should have at least 10 turns.
3 - The agenda of any of the conversations generated should have discussion of action items, which should get assigned to a person.
4 - Find all the action items discussed during the conversation and to whom it was assigned.
An Examples of how each turn of a conversation should look like is given below delimited by <>:
Example: <
“10:00:00”, “10:01:50”, “Bob“, “The QA team did some tests on the latest realease and have found some issues in using the login page.Alice, can you take this UX bug? 
"11:00:00","11:01:30","Ken","The current documentation does not look to be in sync with the latest version of the relase. Jen, can you please update the documentation for the new release",
>

Scenario: standup meet of a AI team, planning for the next sprint.



In [7]:
# conv_env = "standup meet of AI team, where they discuss about the progress of the project and the lead assigning tasks to each other."
conv_env = "standup meet of AI team, where they discuss about the progress of the project,."
# conv_topics = "progress,issues and next steps."
records = """
"14:00:00","14:01:30","Emily","Amit can you please create a document for the new release",
“10:00:00”, “10:01:50”, “Bob“, “The Testing team performed some tests on the latest realease and have found some issues in using the login page.Alice, can you take this UX bug? 
"16:00:00","16:01:30","Tony","We need to update the security protocols for the new release, Sam can you look into it?",
"11:00:00","11:01:30","Ken","The current documentation does not look to be in sync with the latest version of the relase. Jen, can you please update the documentation for the new release",
"""

n = 10
prompt = f"""
Your task is to generate a multi-turn conversation of a {conv_env}. \
The data in the text column in each row is a part of a transcript of a multi-turn conversation. \
Format your Response as a  python list of dictionary \
and each dictionary has the following keys: \
start_time,end_time,speaker,text
The time should be in the format HH:MM:SS. \
The value in the speaker key have to be proper names like Alice, Harry, Simon,etc. \
The result will have maximum of {n} rows. \
"""
response = get_completion(prompt,temperature=0.7)
print(response)

[
    {
        "start_time": "09:00:00",
        "end_time": "09:02:30",
        "speaker": "Alice",
        "text": "Good morning team, let's start the standup meeting."
    },
    {
        "start_time": "09:02:30",
        "end_time": "09:04:00",
        "speaker": "Harry",
        "text": "Morning Alice, how's the progress of the project?"
    },
    {
        "start_time": "09:04:00",
        "end_time": "09:06:00",
        "speaker": "Alice",
        "text": "We're making good progress, but there are still some tasks that need to be assigned. Simon, can you take the lead on the NLP module?"
    },
    {
        "start_time": "09:06:00",
        "end_time": "09:08:00",
        "speaker": "Simon",
        "text": "Sure Alice, I'll take care of it."
    },
    {
        "start_time": "09:08:00",
        "end_time": "09:10:00",
        "speaker": "Alice",
        "text": "Great, Harry can you work on the data preprocessing module?"
    },
    {
        "start_time": "09:10:00",
    

In [3]:
prompt = """
Generate a list of teams required to build an ecommerce website.
"""
response = get_completion(prompt,temperature=0.7)
print(response)

1. Project Manager
2. UX/UI Designer
3. Front-end Developer
4. Back-end Developer
5. Database Administrator
6. Quality Assurance Tester
7. Content Writer
8. Graphic Designer
9. Social Media Strategist
10. SEO Specialist
11. Marketing Manager
12. Customer Support Representative
13. Payment Gateway Integration Specialist
14. Shipping and Logistics Coordinator.


In [15]:
for obj in data:
    prompt = f"""
Find the span which describes the task and name of the person to which the task is being assigned in the text \
dellimited by <>.
Format your response as a JSON object with \
"task" and "assignee" as the keys. 
If a span containing a task isn't present, use "" \
as the value, and if the person to whom task is being \
assigned is not clear or is not mnetioned use "UNKNOWN" as the value.
Text : <{obj['text']}>
"""
    response = get_completion(prompt)
    print(obj['text'],response)
    obj['lables'] = response

Good morning team, let's start the standup meeting. {
  "task": "",
  "assignee": "UNKNOWN"
}
Morning Alice, how's the progress of the project? {
  "task": "",
  "assignee": "Alice"
}
We're making good progress, but there are still some tasks that need to be assigned. Simon, can you take the lead on the NLP module? {
  "task": "take the lead on the NLP module",
  "assignee": "Simon"
}
Sure Alice, I'll take care of it. {
  "task": "take care of it",
  "assignee": "Alice"
}
Great, Harry can you work on the data preprocessing module? {
  "task": "work on the data preprocessing module",
  "assignee": "Harry"
}
Yes, I can take care of that. {
  "task": "take care of that",
  "assignee": "I"
}
And finally, I'll work on the model training module. Any questions or concerns? {
  "task": "work on the model training module",
  "assignee": "UNKNOWN"
}
No, everything seems good to me. {
  "task": "",
  "assignee": "UNKNOWN"
}
Same here, let's get to work. {
  "task": "",
  "assignee": "UNKNOWN"
}
A

In [25]:
example = """
<Input> : Good morning team, let's start the standup meeting 
<Output>:  {"task": "","assignee": "None"}
<Input> : Great, Harry can you work on the data preprocessing module? 
<Output>: {"task": "work on the data preprocessing module","assignee": "Harry"}
<Input>: Yes, I can take care of that. 
<Output>: {"task": "","assignee": "None"}
<Input>: Agreed, let's have another meeting tomorrow to discuss the progress. 
<Output>: {"task": "have another meeting tomorrow to discuss the progress","assignee": "UNKNOWN"}

"""
for obj in data:
    prompt = f"""
Your task is to perform the following actions:
1 - Find the span of text describing a task explicitly in the given
piece of text delimited by <>.
2 - Find the name of the person to whom the task is being assigned in the text \
dellimited by <>.
3 - Output a JSON object that contains the following keys: "text", "assignee"
Use the following format:
"text": <span of text describing a task>
"assignee" : <name of the person to whom the task is being assigned>

Some example pairs of input and output are given below delimited by ```:

Examples : ```{example}```

<Input>: <{obj['text']}>
"""
    response = get_completion(prompt)
    print(obj['text'],response)
    obj['lables'] = response

KeyboardInterrupt: 

In [20]:
import pandas as pd
df = pd.read_csv("data.csv")
df

Unnamed: 0,start_time,end_time,speaker,text
0,10:00:00,10:01:30,Bob,"Good morning everyone, let's start with our st..."
1,10:01:30,10:02:30,Alice,"Good morning Bob, I have been working on the c..."
2,10:02:30,10:03:30,Ken,"That's great Alice, have you faced any challen..."
3,10:03:30,10:04:30,Alice,"Yes, I am facing some issues with the chatbot'..."
4,10:04:30,10:05:30,Jen,"I can help you with that Alice, I have some ex..."
5,10:05:30,10:06:30,Bob,"Great, Jen can you take that action item and w..."
6,10:06:30,10:07:30,Jen,"Sure Bob, I will work with Alice on that."
7,10:07:30,10:08:30,Bob,"Moving on, has anyone else made any progress o..."
8,10:08:30,10:09:30,Tom,"Yes, I have been working on the AI model for i..."
9,10:09:30,10:10:30,Bob,"That's great Tom, keep up the good work."


In [2]:
from datasets import load_dataset

# If the dataset is gated/private, make sure you have run huggingface-cli login
access_token="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"
dataset = load_dataset("knkarthick/AMI",use_auth_token=access_token)

Found cached dataset csv (/home/codespace/.cache/huggingface/datasets/knkarthick___csv/knkarthick--AMI-e0c25640a9a75f54/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
summary = dataset['train'][150]['summary']

In [None]:
example = """
10:00:00||10:01:50||Bob||The QA team did some tests on the latest realease and have found some issues in using the login page.Alice, can you take this UX bug? 
11:00:00||11:01:30||Nicole||The current documentation does not look to be in sync with the latest version of the relase. Jen, can you please update the documentation for the new release.
11:01:40||11:02:30||Walter||And finally, I'll work on the model training module. Any questions or concerns?
11:02:40||11:03:30||Ivan||We need to plan for offsite next month
[
{"text": "UX bug", "assignee": "Alice"},
{"text": "update the documentation for the new release", "assignee": "Jen"}
{"text": "model training module", "assignee": "Walter"}
{"text": "plan for offsite next month", "assignee": "Unkown"}

]

"""
prompt = f"""
Your task is to perform the following actions:
1 - Generate a transcript of a multi-turn conversation for the given summary \
delimited by <>.
2 - Generate names for the people in the conversation if not given in the summary.
3 - Use different names for speakers in the conversation, than the one given in the examples.
4 - Also generate a python list of json, of the action items you can extract from the conversation and to what person they are assigned to.
Format each json object as follows:
"text": <action item>
"assignee" : <name of the person to whom the task is being assigned>
An Examples of how each the generated text should look like is given below delimited by ```
Example: ```{example}```

Summary : <{summary}>
"""
response = get_completion(prompt)
print(response)

In [39]:
# example = """
# Scenario: meeting of an AI team
# 10:00:00||10:01:50||Bob||The QA team did some tests on the latest realease and have found some issues in using the login page.Alice, can you take this UX bug? 
# 11:00:00||11:01:30||Nicole||The current documentation does not look to be in sync with the latest version of the relase. Jen, can you please update the documentation for the new release.
# 11:02:40||11:03:30||Ivan||Can someone please resolve the data pipline?
# """
example = """
Scenario: standup meet of backend team for a payment gateway.
Participants: John, Sarah, David, Emily
Turns: 10
10:00:00||10:01:50||John||Good morning everyone, let's start with our daily standup meeting.
10:02:00||10:03:30||Sarah||I have been working on the transaction module and have completed the integration with the payment gateway. 
10:03:40||10:05:00||David||I have been working on the user authentication module and have found some issues with the password reset functionality. 
10:05:10||10:06:30||John||David can you please create a ticket for this issue and assign it to yourself?
10:06:40||10:08:00||David||Sure John, I will do that.
10:08:10||10:09:30||Emily||I have been working on the account balance module and have found some performance issues. 
10:09:40||10:11:00||John||Emily can you please investigate the issue and provide a solution by the end of the day? And Sarah please ghelp Emily if she has any issues.
10:11:10||10:12:30||Emily||Sure John, I will do that.
10:12:40||10:14:00||Sarah||I have also found some issues with the transaction history module. 
10:14:10||10:15:30||John||Sure Sarah, I can pick that up? And can someone please create minutes of the meetings.
Action Items:
password reset functionality - David
account balance module - Emily, Sarah
transaction history module - John
create minutes of the meetings - UNKOWN
"""
# Action Items:
# UX bug - Alice
# update the documentation for the new release - Jen
# resolve the data pipline - UNKNOWN
# """
# 11:01:40||11:02:30||Walter||And finally, I'll work on the model training module. Any questions or concerns?
# 11:02:40||11:03:30||Ivan||We need to plan for offsite next month

# [
# {"text": "UX bug", "assignee": "Alice"},
# {"text": "update the documentation for the new release", "assignee": "Jen"}
# {"text": "model training module", "assignee": "Walter"}
# {"text": "plan for offsite next month", "assignee": "Unkown"}

# ]

# Format each json object as follows:
# "text": <action item>
# "assignee" : <name of the person to whom the task is being assigned>

scenario = "Design review meeting in a Car Company"
n = 15
prompt = f"""
Answer in a consistent manner 
{example}

Scenario: {scenario}
Participants : Mike, Debal, Daniyal, Najmi, Joe
Turns: {n}
"""
response = get_completion(prompt)
print(response)

10:00:00||10:01:50||Mike||Good morning everyone, let's start with our design review meeting.
10:02:00||10:03:30||Debal||I have been working on the exterior design of the new car model and have made some changes to the front grille.
10:03:40||10:05:00||Daniyal||I have been working on the interior design and have finalized the dashboard layout.
10:05:10||10:06:30||Najmi||I have been working on the engine specifications and have made some improvements to the fuel efficiency.
10:06:40||10:08:00||Joe||I have been working on the safety features and have added a new airbag system.
10:08:10||10:09:30||Mike||Great work everyone. Debal, can you please share the updated design with the team?
10:09:40||10:11:00||Debal||Sure Mike, I will do that.
10:11:10||10:12:30||Mike||Daniyal, can you please share the dashboard layout with the team as well?
10:12:40||10:14:00||Daniyal||Yes, Mike. I will share it with everyone.
10:14:10||10:15:30||Mike||Najmi, can you please provide more details on the improveme

In [9]:
n = 5
scene = "AI first software company"
example = """

"""
prompt = f"""
Generate {n} Scenarios for which a group of people would meet in a {scene}.
Keep it short
"""
response = get_completion(prompt)
print(response)

1. Brainstorming session for developing a new AI-powered product.
2. Team meeting to discuss progress and challenges in implementing machine learning algorithms.
3. Presentation of a new AI-based solution to potential clients.
4. Training session for employees on how to use the company's AI software.
5. Hackathon to develop innovative AI applications for the company's platform.


In [19]:
import re
responses = response.split('\n')
responses = [re.sub(r'\d+\.\s+','',x) for x in responses]
responses

['Brainstorming session for developing a new AI-powered product.',
 'Team meeting to discuss progress and challenges in implementing machine learning algorithms.',
 'Presentation of a new AI-based solution to potential clients.',
 "Training session for employees on how to use the company's AI software.",
 "Hackathon to develop innovative AI applications for the company's platform."]

In [34]:
example = """Scenario: standup meet of backend team for a payment gateway.
Participants: John, Sarah, David, Emily
Turns: 10
10:00:00||10:01:50||John||Good morning everyone, let's start with our daily standup meeting.
10:02:00||10:03:30||Sarah||I have been working on the transaction module and have completed the integration with the payment gateway. 
10:03:40||10:05:00||David||I have been working on the user authentication module and have found some issues with the password reset functionality. 
10:05:10||10:06:30||John||David can you please create a ticket for this issue and assign it to yourself?
10:06:40||10:08:00||David||Sure John, I will do that.
10:08:10||10:09:30||Emily||I have been working on the account balance module and have found some performance issues. 
10:09:40||10:11:00||John||Emily can you please investigate the issue and provide a solution by the end of the day? And Sarah please ghelp Emily if she has any issues.
10:11:10||10:12:30||Emily||Sure John, I will do that.
10:12:40||10:14:00||Sarah||I have also found some issues with the transaction history module. 
10:14:10||10:15:30||John||Sure Sarah, I can pick that up? And can someone please create minutes of the meetings.
Action Items:
password reset functionality||David
account balance module||Emily, Sarah
transaction history module||John
create minutes of the meetings||UNKOWN
"""
scenario = f"{responses[0]},{scene}"
n = 10
prompt = f"""
{example}

Scenario: {scenario}
Participants : Mike, Debal
Turns: {n}
"""
response = get_completion(prompt)
print(response)

10:00:00||10:01:50||Mike||Good morning Debal, let's start with our brainstorming session for developing a new AI-powered product.
10:02:00||10:03:30||Debal||Sure Mike, I was thinking about developing an AI-powered chatbot for customer service.
10:03:40||10:05:00||Mike||That's a great idea Debal. We can use natural language processing to make the chatbot more efficient.
10:05:10||10:06:30||Debal||Yes, and we can also use machine learning to train the chatbot to understand customer queries better.
10:06:40||10:08:00||Mike||We can also integrate the chatbot with our existing CRM system to provide personalized responses to customers.
10:08:10||10:09:30||Debal||That's a good point Mike. We can also use sentiment analysis to understand the customer's mood and respond accordingly.
10:09:40||10:11:00||Mike||We can also add a voice recognition feature to the chatbot to make it more accessible for visually impaired customers.
10:11:10||10:12:30||Debal||That's a great idea Mike. We can also use t

In [41]:
dialogue, action_items = response.split('\nAction Items:\n')

In [44]:
for item in action_items.split('\n'):
    x,y = item.split('||')
    print(x,y)

Develop AI-powered chatbot for customer service Mike, Debal
Integrate chatbot with existing CRM system Mike
Add voice recognition feature to the chatbot Debal
Use chatbot to collect customer feedback Mike, Debal
Use chatbot to cross-sell and upsell products Mike
Provide product recommendations based on purchase history Debal


In [40]:
x,y = 'Develop AI-powered chatbot for customer service||Mike, Debal'.split('||')
x,y

('Develop AI-powered chatbot for customer service', 'Mike, Debal')

In [26]:
example

"\nScenario: standup meet of backend team for a payment gateway.\nParticipants: John, Sarah, David, Emily\nTurns: 10\n10:00:00||10:01:50||John||Good morning everyone, let's start with our daily standup meeting.\n10:02:00||10:03:30||Sarah||I have been working on the transaction module and have completed the integration with the payment gateway. \n10:03:40||10:05:00||David||I have been working on the user authentication module and have found some issues with the password reset functionality. \n10:05:10||10:06:30||John||David can you please create a ticket for this issue and assign it to yourself?\n10:06:40||10:08:00||David||Sure John, I will do that.\n10:08:10||10:09:30||Emily||I have been working on the account balance module and have found some performance issues. \n10:09:40||10:11:00||John||Emily can you please investigate the issue and provide a solution by the end of the day? And Sarah please ghelp Emily if she has any issues.\n10:11:10||10:12:30||Emily||Sure John, I will do that.\n

In [52]:
def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

In [59]:
result = "Action Items:\npassword reset functionality - David\naccount balance module - Emily, Sarah\ntransaction history module - John\ncreate minutes of the meetings - UNKOWN"
action_list = result.split('\n')[1:]
action_list
action_list = [{'text':x.split(' - ')[0],'assignee':x.split(' - ')[1]} for x in action_list]
action_list

[{'text': 'password reset functionality', 'assignee': 'David'},
 {'text': 'account balance module', 'assignee': 'Emily, Sarah'},
 {'text': 'transaction history module', 'assignee': 'John'},
 {'text': 'create minutes of the meetings', 'assignee': 'UNKOWN'}]

In [60]:
import json
with open('result.json','r') as f:
    pred = json.load(f)

In [65]:
# for item in action_list:
#     item['embed'] = get_embedding(item['text'])

for item in pred:
    item['embed'] = get_embedding(item['text'])

In [83]:
import torch
def cos_sim(a, b):
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    """
    if not isinstance(a, torch.Tensor):
        a = torch.tensor(a)

    if not isinstance(b, torch.Tensor):
        b = torch.tensor(b)

    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
    b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
    return torch.mm(a_norm, b_norm.transpose(0, 1))

true_val = [x['embed'] for x in action_list]
pred_val = [x['embed'] for x in pred]

scores = cos_sim(true_val,pred_val)

In [102]:
scores

tensor([[0.8681, 0.7324, 0.7295, 0.7194, 0.7420],
        [0.7421, 0.9002, 0.8216, 0.7384, 0.7759],
        [0.7186, 0.8128, 0.9231, 0.7366, 0.7653],
        [0.7594, 0.7292, 0.7492, 0.9885, 0.7538]])

In [100]:
from fuzzywuzzy import fuzz
for i,item in enumerate(action_list):
    match_idx = torch.argmax(scores[i]).item()
    if fuzz.partial_ratio(item['assignee'],pred[match_idx]['assignee']) == 100:
        print(item['text'],pred[match_idx]['text'], item['assignee'],pred[match_idx]['assignee'])
    else:
        print(fuzz.partial_ratio(item['assignee'],pred[match_idx]['assignee']),item['assignee'],pred[match_idx]['assignee'])



password reset functionality Create ticket for password reset functionality David David
account balance module Investigate account balance module performance issues Emily, Sarah Emily
29 John Sarah
33 UNKOWN UNCreate security report


In [96]:


fuzz.partial_ratio('Emily', 'Emily, Sarah')

100

In [68]:
torch.Tensor(action_list[0]['embed']).shape

torch.Size([1536])

In [69]:
torch.Tensor(action_list[0]['embed'])

tensor([-0.0158, -0.0191, -0.0262,  ...,  0.0261, -0.0089,  0.0051])

In [104]:
a = [10,20,30,40]
tuple(x/10 for x in a)

(1.0, 2.0, 3.0, 4.0)

In [105]:
def evaluate(action_true,action_pred):
    
    embed_1 = [get_embedding(item['text']) for item in action_true]
    embed_2 = [get_embedding(item['text']) for item in action_pred]

    scores = cos_sim(embed_1,embed_2)
    top_idx = torch.argmax(scores,dim=1)
    exact_match = 0
    wrong_assignee = 0
    not_found = 0
    extra_generated = len(action_true) - len(action_pred)
    for i,idx in enumerate(top_idx):
        if scores[i][idx] > 0.85:
            if fuzz.partial_ratio(action_true[i]['assignee'],action_pred[idx]['assignee']) == 100:
                exact_match += 1
            else:
                wrong_assignee += 1
        else:
            not_found += 1
    metrics = [exact_match,wrong_assignee,not_found]
    metrics = [x/len(action_true) for x in metrics] [extra_generated]
    return tuple(metrics)

