In [1]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline, BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nltk.download('punkt')

sentiment_pipeline = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chaud\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
salutations = [
    "good morning", "good afternoon", "good evening", "hello", "hi",
    "have a good day", "good night", "goodbye", "bye", "see you", "take care"
]

In [4]:
def get_sentence_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

def categorize_dialogues(dialogues):
    categories = {
        'Agent': {'concern': [], 'positive': [], 'neutral': []},
        'Customer': {'concern': [], 'positive': [], 'neutral': []}
    }
   
    for dialogue in dialogues:
        sentences = sent_tokenize(dialogue['text'])
        person = dialogue['person']
        dialogue_embedding = get_sentence_embedding(dialogue['text']).reshape(1, -1)  # 2D array
       
        for sentence in sentences:
            sentence_lower = sentence.lower()
            if any(salutation in sentence_lower for salutation in salutations):
                categories[person]['neutral'].append(sentence)
                continue
           
            sentiment = sentiment_pipeline(sentence)[0]
            if sentiment['label'] == 'NEGATIVE':
                categories[person]['concern'].append(sentence)
            elif sentiment['label'] == 'POSITIVE':
                categories[person]['positive'].append(sentence)
            else:
                categories[person]['neutral'].append(sentence)
   
    return categories

def extract_topics(categories):
    topics = {}
    for person, sentiments in categories.items():
        topics[person] = {}
        for category, sentences in sentiments.items():
            if not sentences or category == 'neutral': 
                continue
           
            embeddings = np.vstack([get_sentence_embedding(sent) for sent in sentences])
            avg_embedding = np.mean(embeddings, axis=0).reshape(1, -1) 
           
            similarity_scores = cosine_similarity(embeddings, avg_embedding).flatten()
            top_indices = similarity_scores.argsort()[-3:][::-1]  
            topics[person][category] = [sentences[i] for i in top_indices]

    return topics

def parse_dialogues(text):
    lines = text.split('\n')
    dialogues = []
    current_speaker = None
    current_text = []

    for line in lines:
        if line.strip() == '':
            continue
        if ':' in line:
            if current_speaker:
                dialogues.append({'person': current_speaker, 'text': ' '.join(current_text)})
            current_speaker, dialogue = line.split(':', 1)
            current_speaker = current_speaker.strip()
            current_text = [dialogue.strip()]
        else:
            current_text.append(line.
strip())


    if current_speaker:
        dialogues.append({'person': current_speaker, 'text': ' '.join(current_text)})

    return dialogues

def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

In [5]:
file_path = 'life_insurance_conversation.txt'

text_content = read_text_file(file_path)

In [6]:
dialogues = parse_dialogues(text_content)
categorized = categorize_dialogues(
dialogues)

topics = extract_topics(categorized)

In [7]:
print("\nAgent's Positive Topics Represented by Sentences:")
for sentence in topics.get('Agent', {}).get('positive', []):
    print(f"  - {sentence}")

print("\nAgent's Concern Topics Represented by Sentences:")
for sentence in topics.get('Agent', {}).get('concern', []):
    print(f"  - {sentence}")

print("\nCustomer's Positive Topics Represented by Sentences:")
for sentence in topics.get('Customer', {}).get('positive', []):
    print(f"  - {sentence}")

print("\nCustomer's Concern Topics Represented by Sentences:")
for sentence in topics.get('Customer', {}).get('concern', []):
    print(f"  - {sentence}")


Agent's Positive Topics Represented by Sentences:
  - Yes, there are several riders you can add to a life insurance policy for additional benefits.
  - Whole life insurance, on the other hand, provides coverage for your entire life and also includes a savings component that can build cash value over time.
  - Premiums are based on several factors, including your age, health, lifestyle, the type of policy, and the amount of coverage you choose.

Agent's Concern Topics Represented by Sentences:
  - You can borrow against it, use it to pay premiums, or withdraw it, depending on the terms of your policy.
  - If you stop paying premiums, the outcome depends on the type of policy.
  - If you're interested in long-term financial planning and leaving a legacy, whole life insurance could be more suitable.

Customer's Positive Topics Represented by Sentences:
  - That sounds interesting.
  - That’s helpful information, Alex.
  - I appreciate your help.

Customer's Concern Topics Represented by 