In [1]:
import warnings
warnings.filterwarnings("ignore")
import re
import jsonlines
import nltk
import spacy
from nltk.sentiment import SentimentIntensityAnalyzer

# Sentiment Analysis for Negative User Messages

Load the file with conversations and collect every conversation that contains at least one "negative" user message into a separate list.

**Example:**  
If a user says, "This is the worst service ever," it will be flagged as negative and the entire conversation will be collected.

In [2]:
# Fetch the VADER lexicon that SentimentIntensityAnalyzer needs (skipped by NLTK when already up to date).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/oksana/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
# Path to the JSONL input; each record is read below as one conversation (a list of role/content messages).
threads_file = "openai_files/multi_turn_conversation_virgin.jsonl"

In [12]:
# Sentiment analyzer used to score every user message
sia = SentimentIntensityAnalyzer()

# Whole conversations that contain at least one negative user message
conversations_with_negative_messages = []

with jsonlines.open(threads_file) as reader:
    for conversation in reader:
        # any() stops at the first user message whose compound score is
        # below -0.3, so later messages of that conversation are not scored.
        has_negative_user_message = any(
            sia.polarity_scores(message["content"])["compound"] < -0.3
            for message in conversation
            if message["role"] == "user"
        )
        # Keep the complete conversation, not just the offending message
        if has_negative_user_message:
            conversations_with_negative_messages.append(conversation)

# Report how many conversations were collected
print(f"Collected {len(conversations_with_negative_messages)} conversations with negative user messages.")

Collected 2781 conversations with negative user messages.


In [5]:
# Inspect a single collected conversation (index 9) as a sanity check.
conversations_with_negative_messages[9:10]

[[{'role': 'system',
   'content': 'You are a polite customer assistant whose goal is to provide effective help.'},
  {'role': 'user',
   'content': 'On worst Train journey in long time from stockport to euston 943 total overcrowding and no declassification'},
  {'role': 'assistant',
   'content': 'Sorry to hear that Sarah have you spoken with any staff onboard'},
  {'role': 'user',
   'content': 'Poor show Train Manager is hiding in first class and wont come and speak to us'},
  {'role': 'user',
   'content': 'Yes and the Manager response to everyone is to leave the Train and get next We have business meetings as soon as arrive so cant'},
  {'role': 'assistant',
   'content': 'Please accept our apologies Sarah But it would be the train manager who would be best to speak to regarding this'},
  {'role': 'assistant',
   'content': 'I see sorry its so busy for you had you reserved a seat'},
  {'role': 'user', 'content': 'We would if didnt hide declassified'},
  {'role': 'assistant', 'cont

In [6]:
# One flat text document per negative conversation: the contents of the
# user and assistant messages joined with single spaces, in original order.
# Messages with any other role (e.g. the system prompt) are left out.
documents = [
    " ".join(
        message["content"]
        for message in conversation
        if message["role"] in ["user", "assistant"]
    )
    for conversation in conversations_with_negative_messages
]

print(f"Prepared {len(documents)} documents.")

Prepared 2781 documents.


In [44]:
# Preview the first concatenated document.
documents[0]

'so i wait almost 3 hours and then they are rude and arrogant amp unhelpful after which she is raising a technical case If youre unhappy with your experience on this call please contact us on our website'

In [46]:
def extract_ner_from_strings(strings_array, model="en_core_web_sm"):
    """Collect every named entity spaCy finds in a collection of texts.

    Args:
        strings_array: iterable of strings to analyse.
        model: name of the spaCy pipeline to load. Defaults to
            ``"en_core_web_sm"``, the model this function has always used.

    Returns:
        A flat list of ``(entity_text, entity_label)`` tuples, in document
        order and, within a document, in the order spaCy reports them.
        Empty input yields an empty list.
    """
    nlp = spacy.load(model)

    # nlp.pipe processes the texts as a batched stream, which is the
    # recommended way to run many documents through a pipeline; it yields
    # the Doc objects in input order, so the result order is unchanged.
    return [
        (ent.text, ent.label_)
        for doc in nlp.pipe(strings_array)
        for ent in doc.ents
    ]

In [48]:
# Extract (text, label) entity pairs from all concatenated conversation documents.
ner = extract_ner_from_strings(documents)

In [55]:
# Peek at the first two extracted entities.
ner[0:2]

[('almost 3 hours', 'TIME'), ('at least the third', 'DATE')]

# Replace NER Entities with Placeholders

This step processes the conversations and replaces the named entities detected by NER (named entity recognition) in user and assistant messages with placeholders, in preparation for further analysis.

**Example:**  
Original: "I traveled from London to Manchester."  
Replaced: "I traveled from [GPE] to [GPE]."

In [13]:
def replace_ner_with_placeholders(conversations):
    """Return a copy of ``conversations`` with named entities masked by label.

    The content of every user and assistant message is run through the spaCy
    ``en_core_web_trf`` pipeline, and each detected entity span is replaced
    by a placeholder of the form ``[LABEL]`` (for example ``[GPE]``).

    Args:
        conversations: iterable of conversations, each an iterable of message
            dicts with at least ``"role"`` and ``"content"`` keys.

    Returns:
        A new list with one list of messages per input conversation. Masked
        messages are shallow copies of the originals, so the input dicts are
        never modified. Messages with any other role are passed through as
        the same objects.
    """
    nlp = spacy.load('en_core_web_trf')

    def replace_entities(text):
        """Splice ``[LABEL]`` placeholders over the detected entity spans."""
        doc = nlp(text)
        # Work from the character offsets of each entity instead of
        # str.replace(): replace() rewrites *every* occurrence of the entity
        # string, which corrupts unrelated text (an entity "3" inside "1330")
        # and can even match inside placeholders inserted earlier.
        pieces = []
        cursor = 0
        for ent in sorted(doc.ents, key=lambda span: span.start_char):
            if ent.start_char < cursor:
                # Defensive: skip a span overlapping one already replaced.
                continue
            pieces.append(text[cursor:ent.start_char])
            pieces.append(f"[{ent.label_}]")
            cursor = ent.end_char
        pieces.append(text[cursor:])
        return "".join(pieces)

    processed_conversations = []
    for conversation in conversations:
        processed_conversation = []
        for message in conversation:
            if message["role"] in {"user", "assistant"}:
                # Copy before editing so the caller's message stays intact.
                new_message = message.copy()
                new_message["content"] = replace_entities(message["content"])
                processed_conversation.append(new_message)
            else:
                processed_conversation.append(message)
        processed_conversations.append(processed_conversation)

    return processed_conversations

In [14]:
# Run the replacement on the first 20 negative conversations only (presumably a quick trial before the larger run below).
conversations_ner_replaced = replace_ner_with_placeholders(conversations_with_negative_messages[0:20])

In [15]:
# Inspect one processed conversation to check the inserted placeholders.
conversations_ner_replaced[1]

[{'role': 'system',
  'content': 'You are a polite customer assistant whose goal is to provide effective help.'},
 {'role': 'user',
  'content': 'For at least the [ORDINAL] time [DATE] I will miss the meeting I especially took the train for in the [ORDINAL] place So often late No refund because it doesnt qualify [DATE] car'},
 {'role': 'assistant',
  'content': 'Hi [PERSON] really sorry to hear this Which service are you travelling on please'},
 {'role': 'user',
  'content': 'The [PRODUCT] from [GPE] Now Im here waiting on the [PRODUCT] from [GPE] int Also late'},
 {'role': 'assistant',
  'content': 'Sadly the service has been delayed due to a fault on the line'},
 {'role': 'user',
  'content': 'Always some supposedly acceptable reason yes I get that but when one uses the trains [DATE] and spends a lot of money for it to be late regularly is just not acceptable'},
 {'role': 'assistant',
  'content': 'I understand the frustration this causes [PERSON]'},
 {'role': 'user',
  'content': 'O

In [19]:
# Show an unprocessed conversation (index 2) from the negative set.
conversations_with_negative_messages[2:3]

[[{'role': 'system',
   'content': 'You are a polite customer assistant whose goal is to provide effective help.'},
  {'role': 'user',
   'content': 'any chance you can do something about the 2 drunk ladies causing havoc on the 1747 from Liverpool lime street to London Euston Abusive to fellow passengers amp general behaviour leaving a lot to be desired'},
  {'role': 'assistant',
   'content': 'Really sorry to hear this Please do report this to the onboard team and they should be able to assist you further'},
  {'role': 'user', 'content': 'Where are they located on train'},
  {'role': 'assistant',
   'content': 'They should be making their way through during the journey You can also find the Train Manager in Coach C'},
  {'role': 'user',
   'content': 'Couldnt find anyone Very annoyed by their antics'},
  {'role': 'assistant',
   'content': 'Sorry to hear that the Train Manager should be making their way through'},
  {'role': 'user',
   'content': 'Noone came through before we got off 

In [17]:
# Full run: mask entities in the first 1000 negative conversations (overwrites the 20-conversation trial result).
conversations_ner_replaced = replace_ner_with_placeholders(conversations_with_negative_messages[0:1000])

In [18]:
# Persist the entity-masked conversations as JSONL; mode='w' overwrites any existing file.
with jsonlines.open("openai_files/multi_turn_conversation_virgin_ner_replaced.jsonl", mode='w') as writer:
    writer.write_all(conversations_ner_replaced)

In [25]:
def generate_qa_pairs(conversations, threshold=-0.3, analyzer=None):
    """Pair each negative user message with the next assistant reply.

    Within a conversation, the first user message whose compound sentiment
    score is below ``threshold`` becomes a pending question. Further user
    messages are ignored until an assistant message with non-empty content
    arrives; that reply completes the pair, and the search for the next
    negative user message resumes. A pending question that never receives
    a reply before the conversation ends is dropped.

    Args:
        conversations: iterable of conversations, each an iterable of message
            dicts with ``'role'`` and ``'content'`` keys.
        threshold: a user message counts as negative when its compound score
            is strictly below this value. Defaults to ``-0.3``.
        analyzer: object exposing ``polarity_scores(text)`` that returns a
            mapping with a ``"compound"`` entry. When omitted, the
            notebook-level ``sia`` analyzer is used, as before.

    Returns:
        A list of ``{"question": ..., "expected_response": ...}`` dicts in
        the order the pairs were encountered.
    """
    if analyzer is None:
        # Backward-compatible default: fall back to the notebook global.
        analyzer = sia

    qa_pairs = []

    for conversation in conversations:
        # The negative user message currently waiting for an assistant reply.
        pending_question = None

        for message in conversation:
            role = message['role']
            if pending_question is None:
                # Only score user messages while no question is pending.
                if role == 'user':
                    score = analyzer.polarity_scores(message['content'])["compound"]
                    if score < threshold:
                        pending_question = message['content']
            elif role == 'assistant' and message['content']:
                # Empty assistant replies are skipped; keep waiting.
                qa_pairs.append({
                    "question": pending_question,
                    "expected_response": message['content']
                })
                pending_question = None

    return qa_pairs

In [26]:
# Build QA pairs from the negative conversations after the first 1000, i.e. those not passed to the NER-replacement run above.
qa_pairs = generate_qa_pairs(conversations_with_negative_messages[1000:])

In [27]:
# Preview the first ten question / expected-response pairs.
qa_pairs[0:10]

[{'question': 'handling customer service complaints witheven worse customer service',
  'expected_response': 'Hi Joshua this account has nothing to do with us please ignore them'},
 {'question': 'Hello have you got rid of the unreserved coaches FU The McrEuston trains I got at the weekend seemed to suggest so All coaches had reservations',
  'expected_response': 'No on 11 cars itll be on U and C on 9 cars it will be coach C'},
 {'question': 'Hey cancelling the 1310 and then not having a train manager for the 1330 not cool',
  'expected_response': 'Travelling from where'},
 {'question': 'On the 1247 from Lime Street to Euston the smell from the toilets is vile Smells like a turd covered in burnt hair',
  'expected_response': 'Sorry to hear that have you spoken to the staff on board regarding it'},
 {'question': '2017 A mad world',
  'expected_response': 'Which service is that Thomas'},
 {'question': 'booked advance tickets with 2together railcard One unable to travel now as father criti

In [28]:
# Persist the QA pairs as JSONL (one pair per record); mode='w' overwrites any existing file.
with jsonlines.open("openai_files/qa_pairs_evaluation_set.jsonl", mode='w') as writer:
    writer.write_all(qa_pairs)