# Dataset Preparation

### Importing of libraries

In [2]:
import pandas as pd
import os
import pickle

import collections
from vocab import Vocab, Vectors
from wordebd import WORDEBD

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

### Importing of data

In [3]:
# Name of the sub-folder (under data/ and data/dump/) that this notebook writes its outputs to.
dataset_path = "dataset_drop_noise"

# Read the three CSV splits (train / test / dev).
# NOTE(review): the train split is decoded as shift_jis while test and dev use utf-8 —
# confirm this difference is intentional and not a leftover from debugging a decode error.
TrainData = pd.read_csv('data/dataset_original/train_sent_emo_dya.csv', encoding='shift_jis')
TestData = pd.read_csv('data/dataset_original/test_sent_emo_dya.csv', encoding='utf-8')
DevData = pd.read_csv('data/dataset_original/dev_sent_emo_dya.csv', encoding='utf-8')

# Print the (rows, columns) shape of each split.
print(TrainData.shape)
print(TestData.shape)
print(DevData.shape)


(12840, 12)
(3400, 12)
(1462, 12)


### Dropping of Features

Drop Old_Dialogue_ID, Old_Utterance_ID, Season, Episode, StartTime, and EndTime

In [9]:
# Define features to drop: every column from position 6 onward.
# This is a positional slice, so its result depends on the frame's *current* columns;
# once TrainData has been reduced to six columns it evaluates to [].
drop_features = list(TrainData.columns[6:]) 
drop_features

[]

In [11]:
# Drop the unwanted columns from all three splits.
# The guard skips the call when there is nothing left to drop (e.g. the cell is
# re-run after the columns were already removed), which keeps the cell idempotent.
if drop_features:
    TrainData = TrainData.drop(columns=drop_features)
    TestData = TestData.drop(columns=drop_features)
    DevData = DevData.drop(columns=drop_features)

In [12]:
# Locations of the cached label <-> id mappings.
label_encoder_path = 'data/dump/' + dataset_path + '/label_encoder.pkl'
label_decoder_path = 'data/dump/' + dataset_path + '/label_decoder.pkl'

checkFile1 = os.path.isfile(label_encoder_path)
checkFile2 = os.path.isfile(label_decoder_path)

# Flag read by the label-export cell (`if key:`) to decide whether the label pickles are written.
key = True

if not (checkFile1 and checkFile2):
    # Build the mappings from the emotions present in the train split;
    # sorting makes the id assignment deterministic.
    labels = sorted(set(TrainData.Emotion))
    labelEncoder = {label: i for i, label in enumerate(labels)}
    labelDecoder = {i: label for i, label in enumerate(labels)}

    # Create the dump folder if needed and close every file handle deterministically.
    os.makedirs(os.path.dirname(label_encoder_path), exist_ok=True)
    with open(label_encoder_path, 'wb') as file1:
        pickle.dump(labelEncoder, file1)
    with open(label_decoder_path, 'wb') as file2:
        pickle.dump(labelDecoder, file2)
else:
    # Reuse the cached mappings. pickle can execute arbitrary code on load,
    # so only load files produced by this notebook.
    with open(label_encoder_path, 'rb') as file1:
        labelEncoder = pickle.load(file1)
    with open(label_decoder_path, 'rb') as file2:
        labelDecoder = pickle.load(file2)

In [13]:
# Show the emotion-name -> integer-id mapping that is applied to the "Emotion" column.
labelEncoder

{'anger': 0,
 'disgust': 1,
 'fear': 2,
 'joy': 3,
 'neutral': 4,
 'sadness': 5,
 'surprise': 6}

In [14]:
def encode_labels(encoder, l):
    """Return the value that ``encoder`` stores for label ``l``.

    The lookup uses plain subscription, so a label that is missing from a
    dict-like ``encoder`` raises ``KeyError``.
    """
    encoded = encoder[l]
    return encoded

In [15]:
# Replace each emotion name with its integer id in all three splits.
# Running this cell twice fails with KeyError, because the column then already holds ids.
for split_frame in (TrainData, TestData, DevData):
    split_frame["Emotion"] = split_frame["Emotion"].apply(
        lambda emotion: encode_labels(labelEncoder, emotion)
    )

In [17]:
def find_value_ranges(lst):
    """Locate runs of equal consecutive values.

    Args:
        lst: Any finite iterable of comparable values (list, tuple, pandas Series, ...).

    Returns:
        A list of ``(start, end)`` tuples, one per run, where ``start`` and ``end``
        are inclusive positions into the sequence. An empty input yields ``[]``.
    """
    # Materialise the values so indexing is positional regardless of the
    # container (a pandas Series would otherwise index by label).
    values = list(lst)
    if not values:
        return []

    value_ranges = []
    start_index = 0

    for i in range(1, len(values)):
        if values[i] != values[i - 1]:
            # The run ended at the previous position; a new one starts here.
            value_ranges.append((start_index, i - 1))
            start_index = i

    # Close the final run, which the loop never terminates.
    value_ranges.append((start_index, len(values) - 1))

    return value_ranges

In [18]:
# Inclusive (start, end) row ranges of each dialogue, per split.
rangesTrain = find_value_ranges(TrainData["Dialogue_ID"])
rangesTest = find_value_ranges(TestData["Dialogue_ID"])
rangesDev = find_value_ranges(DevData["Dialogue_ID"])

# Number of dialogues found in train, test and dev (one per line).
print(len(rangesTrain), len(rangesTest), len(rangesDev), sep="\n")

2160
577
270


### Speaker Encoder

Encoding speaker on train set

In [19]:
# Per-dialogue speaker ids for the train split, cached on disk.
# NOTE(review): the cache is reused whenever the file exists, even if TrainData has
# changed since it was written — delete the pickle to force a rebuild.
file_path = 'data/dump/' + dataset_path + '/speaker_encoder_train.pkl'
checkFile = os.path.isfile(file_path)
encodedSpeakersTrain = []

if not checkFile:
    for start_idx, end_idx in rangesTrain:
        # Ranges are inclusive row positions, hence the +1 and the explicit .iloc.
        speaker_per_dialog = TrainData['Speaker'].iloc[start_idx:end_idx + 1]
        # Ids are assigned per dialogue, in sorted order of the speaker values.
        speaker_feature = sorted(set(speaker_per_dialog))
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}

        # Every value is a key of speaker_encoder, so map() encodes all rows.
        encoded_speaker = speaker_per_dialog.map(speaker_encoder)
        encodedSpeakersTrain.append(encoded_speaker)

    # Save encoded speaker list and ranges to a file using pickle
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as file:
        pickle.dump([encodedSpeakersTrain, rangesTrain], file)
else:
    # Load encoded speaker list and ranges from the existing pickle file.
    # pickle can execute arbitrary code on load; only load files written by this notebook.
    with open(file_path, 'rb') as file:
        encodedSpeakersTrain, rangesTrain = pickle.load(file)

Encoding speaker on test set

In [20]:
# Per-dialogue speaker ids for the test split, cached on disk.
# NOTE(review): the cache is reused whenever the file exists, even if TestData has
# changed since it was written — delete the pickle to force a rebuild.
file_path = 'data/dump/' + dataset_path + '/speaker_encoder_test.pkl'
checkFile = os.path.isfile(file_path)
encodedSpeakersTest = []

if not checkFile:
    for start_idx, end_idx in rangesTest:
        # Ranges are inclusive row positions, hence the +1 and the explicit .iloc.
        speaker_per_dialog = TestData['Speaker'].iloc[start_idx:end_idx + 1]
        # Ids are assigned per dialogue, in sorted order of the speaker values.
        speaker_feature = sorted(set(speaker_per_dialog))
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}

        # Every value is a key of speaker_encoder, so map() encodes all rows.
        encoded_speaker = speaker_per_dialog.map(speaker_encoder)
        encodedSpeakersTest.append(encoded_speaker)

    # Save encoded speaker list and ranges to a file using pickle
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as file:
        pickle.dump([encodedSpeakersTest, rangesTest], file)
else:
    # Load encoded speaker list and ranges from the existing pickle file.
    # pickle can execute arbitrary code on load; only load files written by this notebook.
    with open(file_path, 'rb') as file:
        encodedSpeakersTest, rangesTest = pickle.load(file)

In [23]:
# Per-dialogue speaker ids for the dev split, cached on disk.
# NOTE(review): the cache is reused whenever the file exists, even if DevData has
# changed since it was written — delete the pickle to force a rebuild.
file_path = 'data/dump/' + dataset_path + '/speaker_encoder_dev.pkl'
checkFile = os.path.isfile(file_path)
encodedSpeakersDev = []

if not checkFile:
    for start_idx, end_idx in rangesDev:
        # Ranges are inclusive row positions, hence the +1 and the explicit .iloc.
        speaker_per_dialog = DevData['Speaker'].iloc[start_idx:end_idx + 1]
        # Ids are assigned per dialogue, in sorted order of the speaker values.
        speaker_feature = sorted(set(speaker_per_dialog))
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}

        # Every value is a key of speaker_encoder, so map() encodes all rows.
        encoded_speaker = speaker_per_dialog.map(speaker_encoder)
        encodedSpeakersDev.append(encoded_speaker)

    # Save encoded speaker list and ranges to a file using pickle
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as file:
        pickle.dump([encodedSpeakersDev, rangesDev], file)
else:
    # Load encoded speaker list and ranges from the existing pickle file.
    # pickle can execute arbitrary code on load; only load files written by this notebook.
    with open(file_path, 'rb') as file:
        encodedSpeakersDev, rangesDev = pickle.load(file)

Apply the encoded speakers to the Train, Test, and Dev data

In [24]:
# Concatenate the per-dialogue speaker ids into one flat list per split.
encodedSpeakersTrain_flat = []
for dialogue_speakers in encodedSpeakersTrain:
    encodedSpeakersTrain_flat.extend(dialogue_speakers)

encodedSpeakersTest_flat = []
for dialogue_speakers in encodedSpeakersTest:
    encodedSpeakersTest_flat.extend(dialogue_speakers)

encodedSpeakersDev_flat = []
for dialogue_speakers in encodedSpeakersDev:
    encodedSpeakersDev_flat.extend(dialogue_speakers)

# Overwrite the 'Speaker' column of each split with its encoded ids
# (pandas raises if a list's length does not match the frame's row count).
TrainData['Speaker'] = encodedSpeakersTrain_flat
TestData['Speaker'] = encodedSpeakersTest_flat
DevData['Speaker'] = encodedSpeakersDev_flat

In [25]:
# Distinct speaker ids in the train split after encoding (ids are assigned per dialogue).
TrainData['Speaker'].unique()

array([0, 1], dtype=int64)

In [26]:
# Distinct speaker ids in the test split after encoding (ids are assigned per dialogue).
TestData['Speaker'].unique()

array([0, 1], dtype=int64)

In [27]:
# Distinct speaker ids in the dev split after encoding (ids are assigned per dialogue).
DevData['Speaker'].unique()

array([1, 0], dtype=int64)

### Data Pre-Processing

In [28]:
# Download NLTK data (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Work on real copies of the utterance columns so the cleaning steps below
# never alias the columns of the source frames.
train_utterances = TrainData['Utterance'].copy()
test_utterances = TestData['Utterance'].copy()
dev_utterances = DevData['Utterance'].copy()

# Initialize the lemmatizer and the English stop-word set used by clean_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\edayo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\edayo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\edayo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
def clean_text(text):
    """Normalise one utterance into a space-separated string of lemmas.

    Steps: lowercase, tokenize with ``word_tokenize``, keep only purely
    alphabetic tokens that are not in ``stop_words``, lemmatize each remaining
    token with ``lemmatizer``, and join the result with single spaces.

    Args:
        text: The raw utterance. Non-string values (e.g. the NaN pandas uses
            for a missing cell) are treated as an empty utterance.

    Returns:
        The cleaned text; ``''`` when nothing survives the filtering.

    NOTE(review): contractions appear to lose their "n't" part (the sampled
    output contains "actually ca happen") — confirm this is acceptable.
    """
    # A missing cell arrives as a float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''

    # Convert to lowercase and tokenize
    words = word_tokenize(text.lower())

    # Drop punctuation / non-alphabetic tokens and stop words, then lemmatize
    words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word.isalpha() and word not in stop_words
    ]

    # Join words back into a single string
    return ' '.join(words)


# Clean every utterance of each split element-wise; each call returns a new Series.
train_utterances = train_utterances.map(clean_text)
test_utterances = test_utterances.map(clean_text)
dev_utterances = dev_utterances.map(clean_text)

In [30]:
# Preview the first cleaned train utterances; blank rows are utterances that cleaned to ''.
train_utterances.head()

0    also point person company transition system
1                                 must hand full
2                                               
3                       let talk little bit duty
4                                     duty right
Name: Utterance, dtype: object

In [31]:
# Preview the first cleaned test utterances; blank rows are utterances that cleaned to ''.
test_utterances.head()

0                            coffee mug number bottom
1    oh monica keep track way one missing like number
2                                                    
3                                                okay
4                                   ross say elevator
Name: Utterance, dtype: object

In [32]:
# Preview the first cleaned dev utterances; blank rows are utterances that cleaned to ''.
dev_utterances.head()

0                  oh god lost totally lost
1                                          
2    could go bank close account cut source
3                                    genius
4                                    genius
Name: Utterance, dtype: object

Check for empty strings

In [33]:
# Count how many cleaned utterances ended up as the empty string, per split.
empty_string_count_train = train_utterances.eq('').sum()
empty_string_count_test = test_utterances.eq('').sum()
empty_string_count_dev = dev_utterances.eq('').sum()

print(
    f"Empty in Train Utterances: {empty_string_count_train}",
    f"Empty in Test Utterances: {empty_string_count_test}",
    f"Empty in Dev Utterances: {empty_string_count_dev}",
    sep="\n",
)

Empty in Train Utterances: 664
Empty in Test Utterances: 170
Empty in Dev Utterances: 89


Update 'Utterance' column and remove all empty strings

In [34]:
# Write the cleaned utterances back into each split.
TrainData['Utterance'] = train_utterances
TestData['Utterance'] = test_utterances
DevData['Utterance'] = dev_utterances

# Keep only rows whose utterance is not the empty string '' and renumber the rows from 0.
TrainData = TrainData[TrainData['Utterance'] != ''].reset_index(drop=True)
TestData = TestData[TestData['Utterance'] != ''].reset_index(drop=True)
DevData = DevData[DevData['Utterance'] != ''].reset_index(drop=True)

print(
    f"Train data shape: {TrainData.shape}",
    f"Test data shape: {TestData.shape}",
    f"Dev data shape: {DevData.shape}",
    sep="\n",
)

Train data shape: (12176, 6)
Test data shape: (3230, 6)
Dev data shape: (1373, 6)


<b>Take note of the new shapes of the Train, Test, and Dev data after dropping the empty utterances.</b>

Remove rows with *n* or fewer words and create a new **Dropped Words Dataset**

In [35]:
n = 2  # Word-count threshold: rows with n or fewer words are removed (rows with more than n words are kept).

# Retain rows with more than n whitespace-separated words and renumber the rows from 0.
Dropwords_TrainData = TrainData[TrainData['Utterance'].str.split().str.len() > n].reset_index(drop=True)
Dropwords_TestData = TestData[TestData['Utterance'].str.split().str.len() > n].reset_index(drop=True)
Dropwords_DevData = DevData[DevData['Utterance'].str.split().str.len() > n].reset_index(drop=True)

print(f"After removing rows that is {n} words or below:")
print(f"Train shape : {Dropwords_TrainData.shape}")
print(f"Test shape: {Dropwords_TestData.shape}")
# The dev split was previously reported under the label "Test shape".
print(f"Dev shape: {Dropwords_DevData.shape}")

After removing rows that is 2 words or below:
Train shape : (7354, 6)
Test shape: (1973, 6)
Test shape: (858, 6)


Get random samples from the train, test, and dev data

In [36]:
# Randomly select 10 rows of the cleaned train split for a visual sanity check.
# No random_state is passed, so the rows shown differ on every run.
random_sample = TrainData.sample(n=10)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
9283,one beat,0
2197,got really weird message ross said turn,4
11391,come door ask,2
8405,oh know helped pick ring,4
10992,coming cool cool cool,0
9529,well know suds save,4
270,okay nodded,4
339,actually ca happen,4
6915,well yeah one,4
7059,monica snort laugh,0


In [37]:
# Randomly select 10 rows of the cleaned test split for a visual sanity check.
# No random_state is passed, so the rows shown differ on every run.
random_sample = TestData.sample(n=10)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
2123,second date,3
20,kicked think baby kicked,6
2360,erin still,3
2963,monica come let go baby coming,4
2320,well best convince crazy girl dying get going ...,4
2210,chandler guy right,4
1145,borrow phone want call apartment check grandma...,4
1845,channie,5
2445,every moment precious,5
1933,right okay great uh chandler get behind desk,4


In [38]:
# Randomly select 10 rows of the cleaned dev split for a visual sanity check.
# No random_state is passed, so the rows shown differ on every run.
random_sample = DevData.sample(n=10)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
1085,magic story use wan na sex,0
426,got get save,2
855,thanks guy,4
1241,need talk,3
1264,screwed,3
461,anxious way help thing along,4
132,guess,4
1299,oh well listen anyway directing new al pacino ...,3
1351,tell anything,4
698,oh god slept,6


In [39]:
# Randomly select 10 rows of the dropped-words train split for a visual sanity check.
# No random_state is passed, so the rows shown differ on every run.
random_sample = Dropwords_TrainData.sample(n=10)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
6079,look see happening,0
1911,joey please think best forget,2
4942,chandler love come tonight,3
768,matter never gon na get meet anyway,5
5583,rachel green marry,3
743,better go museum underwear,3
4310,another one never party,6
5799,um um rachel talk sec,4
6424,right uh listen say crank notch,3
2802,know one poughkeepsie even though two hour tra...,4


In [40]:
# Randomly select 10 rows of the dropped-words test split for a visual sanity check.
# No random_state is passed, so the rows shown differ on every run.
random_sample = Dropwords_TestData.sample(n=10)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
1970,yeah fade accent people think know adjusting l...,0
613,guy please watch right,1
1670,nope nope ah rather talk,4
1808,ca lokk nice might doctor,4
645,yeah help get past security guard,4
575,rach look oh hi strong ross skywalker come rescue,3
1039,guy going come one game,6
452,okay uh temporarily call clint,4
1230,oh god know father,6
1264,okay finish apple juice,4


In [41]:
# Randomly select 10 rows of the dropped-words dev split for a visual sanity check.
# No random_state is passed, so the rows shown differ on every run.
random_sample = Dropwords_DevData.sample(n=10)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
255,oh let tell story,5
487,actually thought going,4
362,well thinking think best way would,4
68,nothing monica stupid fight,5
589,fact time right,4
493,well actually eat mine still bathroom,3
338,okay wait wait,4
786,hey father house,6
49,wow look like got lot good stuff,3
834,sadly could enticed,3


### Division of X and y data

In [42]:
# Feature frames are the full cleaned splits (same objects, not copies);
# they still carry the "Emotion" column.
X_train = TrainData
X_test = TestData
X_dev = DevData

# Target frames: the label plus the dialogue id of every row, as independent copies.
y_train = TrainData[["Emotion", "Dialogue_ID"]].copy()
y_test = TestData[["Emotion", "Dialogue_ID"]].copy()
y_dev = DevData[["Emotion", "Dialogue_ID"]].copy()

In [43]:
# Report the shapes of the feature/target frames, one line each, splits separated by '--'.
print(
    f'Shape of X_train: {X_train.shape}',
    f'Shape of y_train: {y_train.shape}',
    '--',
    f'Shape of X_test: {X_test.shape}',
    f'Shape of y_test: {y_test.shape}',
    '--',
    f'Shape of X_dev: {X_dev.shape}',
    f'Shape of y_dev: {y_dev.shape}',
    sep='\n',
)

Shape of X_train: (12176, 6)
Shape of y_train: (12176, 2)
--
Shape of X_test: (3230, 6)
Shape of y_test: (3230, 2)
--
Shape of X_dev: (1373, 6)
Shape of y_dev: (1373, 2)


In [44]:
# Feature frames are the full dropped-words splits (same objects, not copies);
# they still carry the "Emotion" column.
X_dropped_train = Dropwords_TrainData
X_dropped_test = Dropwords_TestData
X_dropped_dev = Dropwords_DevData

# Target frames: the label plus the dialogue id of every row, as independent copies.
y_dropped_train = Dropwords_TrainData[["Emotion", "Dialogue_ID"]].copy()
y_dropped_test = Dropwords_TestData[["Emotion", "Dialogue_ID"]].copy()
y_dropped_dev = Dropwords_DevData[["Emotion", "Dialogue_ID"]].copy()

In [45]:
# Report the shapes of the dropped-words feature/target frames, splits separated by '--'.
print(
    f'Shape of X_dropped_train: {X_dropped_train.shape}',
    f'Shape of y_dropped_train: {y_dropped_train.shape}',
    '--',
    f'Shape of X_dropped_test: {X_dropped_test.shape}',
    f'Shape of y_dropped_test: {y_dropped_test.shape}',
    '--',
    f'Shape of X_dropped_dev: {X_dropped_dev.shape}',
    f'Shape of y_dropped_dev: {y_dropped_dev.shape}',
    sep='\n',
)

Shape of X_dropped_train: (7354, 6)
Shape of y_dropped_train: (7354, 2)
--
Shape of X_dropped_test: (1973, 6)
Shape of y_dropped_test: (1973, 2)
--
Shape of X_dropped_dev: (858, 6)
Shape of y_dropped_dev: (858, 2)


### Output data to new csv

##### Label Encoding Data

In [46]:
# Locations of the pickled label columns.
labels_train_path = 'data/dump/' + dataset_path + '/labels_train.pkl'
labels_test_path = 'data/dump/' + dataset_path + '/labels_test.pkl'

# Check if the files already exist (informational only: the write below is gated by `key`).
checkFile1 = os.path.isfile(labels_train_path)
checkFile2 = os.path.isfile(labels_test_path)

if key:
    # Create the dump folder if needed and close each file handle deterministically.
    os.makedirs(os.path.dirname(labels_train_path), exist_ok=True)
    with open(labels_train_path, 'wb') as labels_file:
        pickle.dump(X_train["Emotion"], labels_file)
    with open(labels_test_path, 'wb') as labels_file:
        pickle.dump(X_test["Emotion"], labels_file)
    # NOTE(review): no labels pickle is written for the dev split — confirm that is intended.

##### Training and Testing Data

In [55]:
def exportDataToCSV(df, name):
    """Write ``df`` to ``data/<dataset_path>/<name>.csv``.

    Args:
        df: Object exposing ``to_csv(path)`` (the notebook passes pandas DataFrames).
        name: File name without the ``.csv`` extension.

    The target folder is created when missing. ``to_csv`` is called with its
    defaults, so the frame's index is written as the first column.
    """
    path = "data/" + dataset_path + "/" + name + ".csv"
    # Without this, to_csv fails when the output folder does not exist yet.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path)


In [56]:
# Export the feature and target frames of every split, in train / test / dev order.
for frame, csv_name in (
    (X_train, "train_sent_emo_dya"),
    (y_train, "y_train"),
    (X_test, "test_sent_emo_dya"),
    (y_test, "y_test"),
    (X_dev, "dev_sent_emo_dya"),
    (y_dev, "y_dev"),
):
    exportDataToCSV(frame, csv_name)

In [57]:
# Export the dropped-words feature and target frames, in train / test / dev order.
for frame, csv_name in (
    (X_dropped_train, "dropwords_X_train"),
    (y_dropped_train, "dropwords_y_train"),
    (X_dropped_test, "dropwords_X_test"),
    (y_dropped_test, "dropwords_y_test"),
    (X_dropped_dev, "dropwords_X_dev"),
    (y_dropped_dev, "dropwords_y_dev"),
):
    exportDataToCSV(frame, csv_name)