# Minimal edits to get started with data processing

In [1]:
import random
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd

In [2]:
# Load the raw export and shorten the two column names used in the rest of the notebook.
column_aliases = {"conversation_text": "text", "conversation_tag": "tag"}
df = pd.read_csv("assignment.csv").rename(columns=column_aliases)
print(df.shape[0])

df.head(5)

2347


Unnamed: 0,text,tag
0,"Agent: Hi, this is Kevin from Holland and Barr...",store closing hours
1,"Agent: Good morning, Holland and Barrett custo...",repeat purchases
2,"Agent: Hi, thank you for contacting Holland an...",supplement advice
3,"Agent: Hello, Holland & Barrett customer suppo...",staff interaction
4,Agent: Thank you for calling Holland & Barrett...,hair products


In [3]:
# Give every conversation a sequential integer id and keep only id, text and tag.
df = df.assign(id=range(len(df)))[["id", "text", "tag"]]

In [4]:
# Create the output folder first: to_csv does not create missing directories,
# so on a fresh checkout without data/ the write would fail.
Path("data").mkdir(parents=True, exist_ok=True)
df.to_csv("data/data-v1.csv", index=False)

# Data Enrichment

## Final sentiment of the user

In [1]:
import pandas as pd
import google.generativeai as genai
import os
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [2]:
# Authenticate with the key held in the GEMINI_API_KEY environment variable
# (a missing variable raises KeyError) and create the model handle that the
# prompt helper below calls through the global name `model`.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = genai.GenerativeModel("gemini-1.5-flash")

In [3]:
# Reload the id-tagged conversations, print how many there are and preview them.
df = pd.read_csv("data/data-v1.csv")
print(df.shape[0])
df.head(5)

2347


Unnamed: 0,id,text,tag
0,0,"Agent: Hi, this is Kevin from Holland and Barr...",store closing hours
1,1,"Agent: Good morning, Holland and Barrett custo...",repeat purchases
2,2,"Agent: Hi, thank you for contacting Holland an...",supplement advice
3,3,"Agent: Hello, Holland & Barrett customer suppo...",staff interaction
4,4,Agent: Thank you for calling Holland & Barrett...,hair products


In [4]:
def get_user_sentiment(text):
    """Ask the Gemini model for the customer's final sentiment in one conversation.

    Args:
        text: Transcript of the conversation between the customer and the agent.

    Returns:
        The model's reply with surrounding whitespace stripped. The prompt asks
        for "Positive", "Negative", "Neutral" or "Not Enough information", but
        the reply is returned verbatim and is not validated here.
    """
    # The reply is returned verbatim, so the option names in the prompt must be
    # spelled exactly as they should appear in the resulting category values.
    prompt = f"Given the following conversation between a customer and a support agent classify the user's final sentiment as Positive, Negative or Neutral. If a classification can't be made then state Not Enough information. Only respond with these four options. \nText: {text}"
    response = model.generate_content(prompt)
    return response.text.strip()

In [5]:
# One delayed call per conversation, fanned out over 16 loky workers.
sentiment_jobs = (delayed(get_user_sentiment)(conversation) for conversation in tqdm(df.text))
user_final_sentiments = Parallel(n_jobs=16, backend="loky")(sentiment_jobs)

  0%|          | 0/2347 [00:00<?, ?it/s]

In [6]:
# Store the model replies next to the conversations they were generated from
# (the list was built by iterating df.text, one reply per row).
df["user_final_sentiment"] = user_final_sentiments

In [7]:
# Feature table: one row per conversation, keyed by conversation_id.
df_feature = df[["id", "user_final_sentiment"]].rename(columns={"id": "conversation_id"})

In [8]:
# Persist the conversation_id / user_final_sentiment table without the index column.
df_feature.to_csv("data/sentiment.csv", index=False)

## Conversation distilled down to Key Issues and Key Resolution

In [1]:
import pandas as pd
import google.generativeai as genai
import os
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [2]:
# Authenticate with the key held in the GEMINI_API_KEY environment variable
# (a missing variable raises KeyError) and create the model handle that the
# prompt helpers below call through the global name `model`.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = genai.GenerativeModel("gemini-1.5-flash")

In [3]:
# Reload the id-tagged conversations, print how many there are and preview them.
df = pd.read_csv("data/data-v1.csv")
print(df.shape[0])
df.head(5)

2347


Unnamed: 0,id,text,tag
0,0,"Agent: Hi, this is Kevin from Holland and Barr...",store closing hours
1,1,"Agent: Good morning, Holland and Barrett custo...",repeat purchases
2,2,"Agent: Hi, thank you for contacting Holland an...",supplement advice
3,3,"Agent: Hello, Holland & Barrett customer suppo...",staff interaction
4,4,Agent: Thank you for calling Holland & Barrett...,hair products


In [6]:
def get_key_issue(text):
    """Summarise the customer's main issue in one conversation via the Gemini model.

    Args:
        text: Transcript of the conversation between the customer and the agent.

    Returns:
        The model's reply with leading and trailing whitespace removed.
    """
    instruction = "Given the following conversation between a customer and a support agent, distill down the key issue that the user had into very few words. If the user didn't have any issues, specify the reason for contacting support staff."
    reply = model.generate_content(f"{instruction}\n{text}")
    return reply.text.strip()

def get_key_resolution(text):
    """Summarise the agent's resolution in one conversation via the Gemini model.

    Args:
        text: Transcript of the conversation between the customer and the agent.

    Returns:
        The model's reply with leading and trailing whitespace removed.
    """
    instruction = "Given the following conversation between a customer and a support agent, distill down the key resolution provided by the agent into very few words. If the user didn't have any issues, specify the reason for contacting support staff. If resolution couldn't be provided, state the reason for that"
    reply = model.generate_content(f"{instruction}\n{text}")
    return reply.text.strip()

In [7]:
# One delayed call per conversation, fanned out over 8 loky workers.
issue_jobs = (delayed(get_key_issue)(conversation) for conversation in tqdm(df.text))
key_issues = Parallel(n_jobs=8, backend="loky")(issue_jobs)

  0%|          | 0/2347 [00:00<?, ?it/s]

In [9]:
# One delayed call per conversation, fanned out over 8 loky workers.
resolution_jobs = (delayed(get_key_resolution)(conversation) for conversation in tqdm(df.text))
key_resolutions = Parallel(n_jobs=8, backend="loky")(resolution_jobs)

  0%|          | 0/2347 [00:00<?, ?it/s]

In [10]:
# Spot-check the first ten generated issue summaries.
key_issues[:10]

['Store unexpectedly closed.',
 'Missing order confirmation email.',
 'Seeking supplement advice for anxiety.',
 'Rude and unhelpful staff.',
 'Vegetarian suitability of hair loss capsules.',
 'Incorrect delivery location.',
 'Positive feedback; no issue.',
 'Narrow store aisles.',
 'Click and collect order unfulfilled due to store closure.',
 'Store closed early, preventing order pickup.']

In [11]:
# Spot-check the first ten generated resolution summaries.
key_resolutions[:10]

['Store closure due to power outage.',
 'Order confirmed; email confirmation missing due to system error.',
 'Supplement recommendations for anxiety: 5-HTP and Magnesium.',
 'Feedback on rude staff relayed to store manager.',
 'Confirmed vegetarian suitability of product.',
 'Re-arranged home delivery for tomorrow.',
 'Positive feedback; no issue resolution needed.',
 'Feedback relayed to Stowmarket management.',
 'Order redelivered at no extra cost due to store closure.',
 'Home delivery arranged for order.']

In [12]:
# Attach both generated summaries to their conversations in one step.
df = df.assign(key_issue=key_issues, key_resolution=key_resolutions)

In [13]:
# Export table: one row per conversation, keyed by conversation_id.
df_distilled = df[["id", "key_issue", "key_resolution"]].rename(columns={"id": "conversation_id"})

In [14]:
# Persist the conversation_id / key_issue / key_resolution table without the index column.
df_distilled.to_csv("data/distilled.csv", index=False)

## Tag Refinement

### Generation
Generation and assignment have to be performed separately because of a bug caused by the `huggingface_hub` package used by BERTopic.

In [1]:
import pandas as pd
import google.generativeai as genai
import os
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
from bertopic import BERTopic
import numpy as np

In [2]:
# Authenticate with the key held in the GEMINI_API_KEY environment variable
# (a missing variable raises KeyError) and create the model handle that the
# prompt helper below calls through the global name `model`.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = genai.GenerativeModel("gemini-1.5-flash")

In [3]:
# Reload the id-tagged conversations, print how many there are and preview them.
df = pd.read_csv("data/data-v1.csv")
print(df.shape[0])
df.head(5)

2347


Unnamed: 0,id,text,tag
0,0,"Agent: Hi, this is Kevin from Holland and Barr...",store closing hours
1,1,"Agent: Good morning, Holland and Barrett custo...",repeat purchases
2,2,"Agent: Hi, thank you for contacting Holland an...",supplement advice
3,3,"Agent: Hello, Holland & Barrett customer suppo...",staff interaction
4,4,Agent: Thank you for calling Holland & Barrett...,hair products


In [4]:
# BERTopic with all default settings.
# NOTE(review): no random seed is passed here, so topic ids may differ between
# runs — confirm whether reproducible topics are required.
topic_model = BERTopic()

In [6]:
%%time
topics, probs = topic_model.fit_transform(df.text)
topics = np.array(topics)

df_topics = topic_model.get_topic_info().iloc[1:]
len(topics), len(df_topics)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


CPU times: user 2min 18s, sys: 23.5 s, total: 2min 42s
Wall time: 45.3 s


(2347, 61)

In [7]:
def get_topic_name(generated_name, sample_docs):
    """Ask the Gemini model for a short, readable name for one topic cluster.

    Args:
        generated_name: Name the topic clustering system produced for the topic.
        sample_docs: Iterable of sample document strings from that topic; they
            are concatenated without a separator before being sent.

    Returns:
        The model's reply with leading and trailing whitespace removed.
    """
    docs_blob = "".join(sample_docs)
    request = (
        "For the given generated topic name from a topic clustering system and sample docs from that topic, suggest one short topic name. "
        f"\nGenerated Name: {generated_name} "
        f"\n Sample Docs: {docs_blob}"
    )
    return model.generate_content(request).text.strip()

In [8]:
# Ask for one readable name per topic row, in the row order of df_topics.
topic_names = [
    get_topic_name(
        df_topics.Name.iloc[row],
        topic_model.get_representative_docs(int(df_topics.Topic.iloc[row])),
    )
    for row in tqdm(range(len(df_topics)))
]

  0%|          | 0/61 [00:00<?, ?it/s]

In [9]:
# One topic name per line; the file is later split on newlines and indexed by topic id.
# A model reply can span several lines, so collapse all internal whitespace
# (newlines included) first to keep the line count equal to the topic count.
single_line_names = [" ".join(name.split()) for name in topic_names]
with open("data/topic-names.txt", 'w') as f:
    f.write("\n".join(single_line_names))

In [11]:
# Store the fitted topic id of every conversation (`topics` has one entry per row of df).
df["topic_id"] = topics

In [12]:
# From here on df_topics is the conversation -> topic mapping, not the topic info table.
df_topics = df[["id", "topic_id"]].rename(columns={"id": "conversation_id"})

In [13]:
# Persist the conversation_id / topic_id table without the index column.
df_topics.to_csv("data/topics.csv", index=False)

### Assignment

In [3]:
import pandas as pd
import google.generativeai as genai
import os
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
import numpy as np


In [4]:
# Authenticate with the key held in the GEMINI_API_KEY environment variable
# (a missing variable raises KeyError) and create the model handle that the
# prompt helper below calls through the global name `model`.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = genai.GenerativeModel("gemini-1.5-flash")

In [42]:
# Reload the id-tagged conversations, print how many there are and preview them.
df = pd.read_csv("data/data-v1.csv")
print(df.shape[0])
df.head(5)

2347


Unnamed: 0,id,text,tag
0,0,"Agent: Hi, this is Kevin from Holland and Barr...",store closing hours
1,1,"Agent: Good morning, Holland and Barrett custo...",repeat purchases
2,2,"Agent: Hi, thank you for contacting Holland an...",supplement advice
3,3,"Agent: Hello, Holland & Barrett customer suppo...",staff interaction
4,4,Agent: Thank you for calling Holland & Barrett...,hair products


In [6]:
# Read the generated topic names back: one per line, position i is the name of topic i.
with open("data/topic-names.txt") as names_file:
    topic_names = names_file.read().split("\n")

len(topic_names)

61

In [7]:
# Load the conversation -> topic id mapping, print its row count and preview it.
df_topic_ids = pd.read_csv("data/topics.csv")
print(df_topic_ids.shape[0])
df_topic_ids.head(5)

2347


Unnamed: 0,conversation_id,topic_id
0,0,20
1,1,-1
2,2,59
3,3,48
4,4,35


In [8]:
# Plain Python list of topic ids, in the row order of topics.csv.
topics = df_topic_ids["topic_id"].tolist()
len(topics)

2347

In [15]:
def get_updated_tag(text, old_tag, topic_names, topic_id):
    """Pick a tag for one conversation: its existing tag or its topic's name.

    Args:
        text: Transcript of the conversation between the customer and the agent.
        old_tag: Tag currently attached to the conversation.
        topic_names: Sequence of topic names indexed by topic id.
        topic_id: Topic id of the conversation; -1 means no model call is made.

    Returns:
        `old_tag` unchanged when `topic_id` is -1; otherwise the model's reply
        with leading and trailing whitespace removed (the reply is not checked
        against the two candidates).
    """
    if topic_id != -1:
        candidate = topic_names[topic_id]
        reply = model.generate_content(
            f"Given the following conversation between a customer and support agent: {text}\n Choose the better fit as a tag for the conversation between {old_tag} and {candidate}. Only response with the selected answer"
        )
        return reply.text.strip()
    # Topic id -1 has no entry in topic_names, so the existing tag is kept.
    return old_tag

In [20]:
# One delayed call per conversation; df rows and `topics` are paired by position.
tag_jobs = (
    delayed(get_updated_tag)(
        df.text.iloc[row],
        df.tag.iloc[row],
        topic_names,
        topics[row],
    )
    for row in tqdm(range(len(df)))
)
final_tags = Parallel(n_jobs=16, backend="loky")(tag_jobs)

  0%|          | 0/2347 [00:00<?, ?it/s]

In [44]:
# Store the refined tag of every conversation (`final_tags` has one entry per row of df).
df["updated_tag"] = final_tags

In [45]:
# Export the refined tag under the column name "tag". Selecting "updated_tag"
# (not the original "tag" column) is what makes the rename below take effect.
df_updated_tag = df[["id", "updated_tag"]]
df_updated_tag = df_updated_tag.rename(columns={"id": "conversation_id", "updated_tag": "tag"})

In [23]:
# Persist the conversation_id / tag table without the index column.
df_updated_tag.to_csv("data/tag-updated.csv", index=False)