### Importing Libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

import gensim.downloader as api
from gensim.models import KeyedVectors

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Layer, Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

### Preparing Dataset

In [12]:
stop_words = set(stopwords.words("english"))

df_train = pd.read_csv("dialog.csv")
df_test = pd.read_csv("dialog_test.csv")

# Define a simple text cleaning function
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation/numbers
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Create a new column with cleaned text
df_train["clean_text"] = df_train["User Dialog"].apply(clean_text)
df_test["clean_text"] = df_test["User Dialog"].apply(clean_text)

In [3]:
df_train.head()

Unnamed: 0,User Dialog,Intent Class,Subcategory,Sentiment,clean_text
0,I submitted my claim transfer request weeks ag...,Contract Modification,Claim Transfer Requests,Negative,submitted claim transfer request weeks ago tak...
1,"Seriously, are you even listening to me? I've ...",Contract Modification,Claim Transfer Requests,Negative,seriously even listening ive asked three times...
2,I'd like to request a transfer of my claim to ...,Contract Modification,Claim Transfer Requests,Neutral,id like request transfer claim different adjus...
3,I'm inquiring about the possibility of transfe...,Contract Modification,Claim Transfer Requests,Neutral,im inquiring possibility transferring claim st...
4,Thank you so much for helping me transfer my c...,Contract Modification,Claim Transfer Requests,Positive,thank much helping transfer claim really appre...


In [4]:
df_test.head()

Unnamed: 0,User Dialog,Intent Class,Subcategory,Sentiment,clean_text
0,I'm looking to transfer my claim to a differen...,Contract Modification,Claim Transfer Requests,Neutral,im looking transfer claim different adjuster w...
1,Could you please update the email address asso...,Contract Modification,Email Update,Neutral,could please update email address associated p...
2,I'm calling to check on the status of my claim...,Contract Modification,Claim Processing,Neutral,im calling check status claim processing
3,I need to file a claim for damage to my RV. Wh...,Contract Modification,RV Claims Assistance,Neutral,need file claim damage rv assist
4,What's the current status of my contract trans...,Contract Transfer,Contract Transfer Status,Neutral,whats current status contract transfer request


### Preparing Word Embeddings Model

In [5]:
try:
    google_news = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
except FileNotFoundError:
    print("Downloading pre-trained Word2Vec model...")
    google_news = api.load("word2vec-google-news-300")
    google_news.save_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
    print("Model downloaded and saved.")

embedding_dim = google_news.vector_size

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train["clean_text"])

# Convert training texts to sequences.
train_sequences = tokenizer.texts_to_sequences(df_train["clean_text"])
# Determine maximum length based on training data.
max_len = max(len(seq) for seq in train_sequences)
# Pad training sequences.
X_train = pad_sequences(train_sequences, maxlen=max_len, padding="post")

# Convert test texts to sequences using the same tokenizer.
test_sequences = tokenizer.texts_to_sequences(df_test["clean_text"])
# Pad test sequences using the same maximum length.
X_test = pad_sequences(test_sequences, maxlen=max_len, padding="post")

In [7]:
# Adding domain specific vocabulary. Need to expand.
domain_words = [
    "insurance", "policy", "claim", "deductible", "premium", "coverage",
    "accident", "collision", "damage", "repair", "replace", "reimbursement",
    "payment", "billing", "quote", "adjuster", "agent", "renewal",
    "cancellation", "vehicle", "car", "auto", "driver", "license",
    "status", "update", "process", "assistance", "modification", "inquiry",
    "report", "totaled", "rental", "roadside", "assistance", "gap", "liability",
    "comprehensive", "collision", "uninsured", "underinsured", "waiver", "discount",
    "approved", "denied", "pending", "estimate", "shop", "mechanic", "parts",
    "labor", "towing", "wreck", "fault", "accidentforgiveness", "goodstudent"
]

In [14]:
# Build the embedding matrix using the tokenizer's vocabulary.
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in google_news:
        embedding_matrix[i] = google_news[word]
    else:
        # If the word is not in GoogleNews, assign a random vector.
        embedding_matrix[i] = np.random.randn(embedding_dim)

In [15]:
# It is assumed that both datasets share the same label space.
intent_classes = sorted(df_train["Intent Class"].unique())
subcategory_classes = sorted(df_train["Subcategory"].unique())
sentiment_classes = sorted(df_train["Sentiment"].unique())

intent_to_index = {intent: idx for idx, intent in enumerate(intent_classes)}
subcategory_to_index = {sub: idx for idx, sub in enumerate(subcategory_classes)}
sentiment_to_index = {sent: idx for idx, sent in enumerate(sentiment_classes)}

# Convert labels for training set.
y_intent_train = np.array([intent_to_index[intent] for intent in df_train["Intent Class"]])
y_sub_train = np.array([subcategory_to_index[sub] for sub in df_train["Subcategory"]])
y_sentiment_train = np.array([sentiment_to_index[sent] for sent in df_train["Sentiment"]])

# Convert labels for test set.
y_intent_test = np.array([intent_to_index[intent] for intent in df_test["Intent Class"]])
y_sub_test = np.array([subcategory_to_index[sub] for sub in df_test["Subcategory"]])
y_sentiment_test = np.array([sentiment_to_index[sent] for sent in df_test["Sentiment"]])

### Model

In [10]:
class Attention(Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
    def build(self, input_shape):
        self.W = self.add_weight(name='att_weight',
                                 shape=(input_shape[-1], 1),
                                 initializer='random_normal',
                                 trainable=True)
        self.b = self.add_weight(name='att_bias',
                                 shape=(input_shape[1], 1),
                                 initializer='zeros',
                                 trainable=True)
        super(Attention, self).build(input_shape)
    
    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)  # (batch_size, time_steps, 1)
        a = K.softmax(e, axis=1)               # (batch_size, time_steps, 1)
        weighted_input = x * a                 # Element-wise multiplication with attention weights
        return K.sum(weighted_input, axis=1)   # Sum over time steps
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

# Model parameters.
lstm_units = 64
dropout_rate = 0.5
intent_feature_dim = 32  # Dimension for intent-based features.

# Input layer for padded sequences.
input_seq = Input(shape=(max_len,), name='input')

# Embedding layer using the pre-trained (augmented) GoogleNews embedding matrix.
embedding_layer = Embedding(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False,
                            name="embedding")(input_seq)

# Bidirectional LSTM that returns sequences (for the attention mechanism).
bi_lstm = Bidirectional(LSTM(lstm_units, return_sequences=True), name="bilstm")(embedding_layer)

# Apply the custom attention layer.
attn = Attention(name="attention")(bi_lstm)

# Add dropout for regularization.
x = Dropout(dropout_rate, name="dropout")(attn)

# First branch: High-level Intent prediction.
intent_output = Dense(len(intent_classes), activation='softmax', name='intent')(x)

# Transform intent predictions into a feature vector.
intent_features = Dense(intent_feature_dim, activation='relu', name="intent_features")(intent_output)

# Concatenate the base representation with the intent-based features for subcategory prediction.
combined = Concatenate(name="concatenate")([x, intent_features])

# Second branch: Subcategory prediction.
subcategory_output = Dense(len(subcategory_classes), activation='softmax', name='subcategory')(combined)

# Third branch: Sentiment prediction.
# Here, we use the same base representation (x) from the attention output.
sentiment_output = Dense(len(sentiment_classes), activation='softmax', name='sentiment')(x)

# Create and compile the model with three outputs.
model = Model(inputs=input_seq, outputs=[intent_output, subcategory_output, sentiment_output])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 20)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 20, 300)      241500      ['input[0][0]']                  
                                                                                                  
 bilstm (Bidirectional)         (None, 20, 128)      186880      ['embedding[0][0]']              
                                                                                                  
 attention (Attention)          (None, 128)          148         ['bilstm[0][0]']                 
                                                                                              

In [16]:
history = model.fit(
    X_train,
    [y_intent_train, y_sub_train, y_sentiment_train],
    validation_data=(X_test, [y_intent_test, y_sub_test, y_sentiment_test]),
    epochs=10,
    batch_size=16
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
