In [1]:
import pandas as pd

In [2]:
## Read the dataset
df = pd.read_csv('emotion-emotion_69k - emotion-emotion_69k.csv')

In [3]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,Situation,emotion,empathetic_dialogues,labels,Unnamed: 5,Unnamed: 6
0,0,I remember going to the fireworks with my best...,sentimental,Customer :I remember going to see the firework...,"Was this a friend you were in love with, or ju...",,
1,1,I remember going to the fireworks with my best...,sentimental,Customer :This was a best friend. I miss her.\...,Where has she gone?,,


In [4]:
df.shape

(64636, 7)

In [5]:
df.isnull().sum()

Unnamed: 0                  0
Situation                   0
emotion                     5
empathetic_dialogues        5
labels                      0
Unnamed: 5              64554
Unnamed: 6              64631
dtype: int64

In [6]:
df = df.drop(columns = ['Unnamed: 0', 'Unnamed: 5', 'Unnamed: 6'], axis =1)

In [7]:
df = df.dropna()

In [8]:
df.isnull().sum()

Situation               0
emotion                 0
empathetic_dialogues    0
labels                  0
dtype: int64

### Removal of single word

In [9]:
from nltk.tokenize import word_tokenize

In [10]:
def single_word_remove_func(text, word_2_remove):
    '''
    Removes a specific word from string, if present
    
    Step 1: Use word_tokenize() to get tokens from string
    Step 2: Removes the defined word from the created tokens
    
    Args:
        text (str): String to which the functions are to be applied, string
        word_2_remove (str): Word to be removed from the text, string
    
    Returns:
        String with removed words
    '''    
    word_to_remove = word_2_remove
    
    words = word_tokenize(text)
    text = ' '.join([word for word in words if word != word_to_remove])
    return text

In [11]:
# Assuming your DataFrame is named df
df['empathetic_dialogues'] = df['empathetic_dialogues'].apply(lambda x: single_word_remove_func(x, "Customer :"))

In [12]:
df.head(2)

Unnamed: 0,Situation,emotion,empathetic_dialogues,labels
0,I remember going to the fireworks with my best...,sentimental,Customer : I remember going to see the firewor...,"Was this a friend you were in love with, or ju..."
1,I remember going to the fireworks with my best...,sentimental,Customer : This was a best friend . I miss her...,Where has she gone?


In [13]:
## Add a start and end token to each answer
df['Question Length'] = df['empathetic_dialogues'].str.split().apply(len)
df['Answer Length'] = df['labels'].str.split().apply(len)
df['Answers'] = df['labels'].apply(lambda x : '<start> '+ x + ' <end>')
df['QA'] = df['empathetic_dialogues'].astype(str) + ' ' + df['Answers'].astype(str)

In [15]:
from sklearn.model_selection import train_test_split

In [18]:
## Split the dataset
enc_in_train, enc_in_test, qa_train, qa_test, dec_in_train, dec_in_test = \
  train_test_split(df['empathetic_dialogues'], df['QA'], df['Answers'], test_size=0.005, random_state=42)

enc_in_train, enc_in_val, qa_train, qa_val, dec_in_train, dec_in_val = \
  train_test_split( enc_in_train, qa_train, dec_in_train, test_size=0.2, random_state=42)

In [21]:
## Import libraries
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [22]:
## Tokenize the text sequences
def create_tokenizer(lines):
    """
    Fit a tokenizer
    """
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [23]:
qa_tokenizer = create_tokenizer(df['QA'])

In [24]:
## Encode and pad the text sequences
def encode_sequences(tokenizer, length, lines):
    """
    Encode and pad sequences
    """
    # Integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # Pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

In [25]:
# Find max length
ques_length = df['Question Length'].max()
ans_length = df['Answer Length'].max()

In [26]:
enc_in_train = encode_sequences(qa_tokenizer, ques_length, enc_in_train)
enc_in_val = encode_sequences(qa_tokenizer, ques_length, enc_in_val)
enc_in_test = encode_sequences(qa_tokenizer, ques_length, enc_in_test)

In [27]:
dec_in_train = encode_sequences(qa_tokenizer, ans_length+2, dec_in_train)
dec_in_val = encode_sequences(qa_tokenizer, ans_length+2, dec_in_val)
dec_in_test = encode_sequences(qa_tokenizer, ans_length+2, dec_in_test)

In [28]:
import gensim.downloader as api
from gensim.models import Word2Vec

In [29]:
## Load pretrained Word2Vec word embeddings
word2vec_model = api.load("word2vec-google-news-300")



In [30]:
## Create lookup maps
qa_vocab = qa_tokenizer.word_index
word2id = dict()
id2word = dict()
for k, v in qa_vocab.items():
    word2id[k] = v
    id2word[v] = k

In [31]:
## Generate word embeddings
# create token-embedding mapping
embedding_matrix = zeros((len(qa_vocab) + 1, 300))  # Add 1 to account for the padding token
for word, i in qa_vocab.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

NameError: name 'zeros' is not defined