# Dataset Preparation

### Importing of libraries

In [132]:
import pandas as pd
import os
import pickle

import collections
from vocab import Vocab, Vectors
from wordebd import WORDEBD

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

### Importing of data

In [133]:
dataset_path = "dataset_drop_noise"

# Read the CSV file
TrainData = pd.read_csv('data/dataset_original/train_sent_emo_dya.csv', encoding='shift_jis')
TestData = pd.read_csv('data/dataset_original/test_sent_emo_dya.csv', encoding='utf-8')
DevData = pd.read_csv('data/dataset_original/dev_sent_emo_dya.csv', encoding='utf-8')

# Display the first three rows
print(TrainData.shape)
print(TestData.shape)
print(DevData.shape)


(12840, 12)
(3400, 12)
(1462, 12)


### Dropping of Features

Drop Old_Dialogue_ID, Old_Utterance_ID, Season, Episode, StartTime, and EndTime

In [134]:
# Define features to drop
drop_features = list(TrainData.columns[6:]) 
drop_features

['Old_Dialogue_ID',
 'Old_Utterance_ID',
 'Season',
 'Episode',
 'StartTime',
 'EndTime']

In [135]:
# Drop features from X_train DataFrame
if not drop_features:
    TrainData = TrainData.drop(drop_features, axis=1)
    TestData = TestData.drop(drop_features, axis=1)
    DevData = DevData.drop(drop_features, axis=1)

In [136]:
checkFile1 = os.path.isfile("data/dump/" + dataset_path + "/label_encoder.pkl")
checkFile2 = os.path.isfile("data/dump/" + dataset_path + "/label_decoder.pkl")

key = True

if not (checkFile1 and checkFile2):
    labels = sorted(set(TrainData.Emotion))
    labelEncoder = {label: i for i, label in enumerate(labels)}
    labelDecoder = {i: label for i, label in enumerate(labels)}

    pickle.dump(labelEncoder, open('data/dump/' + dataset_path + '/label_encoder.pkl', 'wb'))
    pickle.dump(labelDecoder, open('data/dump/' + dataset_path + '/label_decoder.pkl', 'wb'))
else:
    file1 = open('data/dump/' + dataset_path + '/label_encoder.pkl', 'rb')
    file2 = open('data/dump/' + dataset_path + '/label_decoder.pkl', 'rb')
    labelEncoder = pickle.load(file1)
    labelDecoder = pickle.load(file2)
    file1.close()
    file2.close()

In [137]:
labelEncoder

{'anger': 0,
 'disgust': 1,
 'fear': 2,
 'joy': 3,
 'neutral': 4,
 'sadness': 5,
 'surprise': 6}

In [138]:
def encode_labels(encoder, l):
    return encoder[l]

In [139]:
# Apply label encoding to the "Emotion" column in y_train
TrainData["Emotion"] = TrainData["Emotion"].apply(lambda x: encode_labels(labelEncoder, x))
TestData["Emotion"] = TestData["Emotion"].apply(lambda x: encode_labels(labelEncoder, x))
DevData["Emotion"] = DevData["Emotion"].apply(lambda x: encode_labels(labelEncoder, x))

In [140]:
def find_value_ranges(lst):
    value_ranges = []
    start_index = 0

    for i in range(1, len(lst)):
        if lst[i] != lst[i - 1]:
            value_ranges.append((start_index, i - 1))
            start_index = i

    # Add the last range
    value_ranges.append((start_index, len(lst) - 1))

    return value_ranges

In [141]:
rangesTrain = find_value_ranges(TrainData["Dialogue_ID"])
print(len(rangesTrain))

rangesTest = find_value_ranges(TestData["Dialogue_ID"])
print(len(rangesTest))

rangesDev = find_value_ranges(DevData["Dialogue_ID"])
print(len(rangesDev))

2160
577
270


### Speaker Encoder

Encoding speaker on train set

In [142]:
# # Check if the file exists
# checkFile = os.path.isfile("data/dump/" + dataset_path + "/speaker_encoder_train.pkl")
# encodedSpeakersTrain = []

# if not checkFile:
#     for range_pair in rangesTrain:
#         start_idx, end_idx = range_pair
#         speaker_per_dialog = TrainData['Speaker'][start_idx:end_idx + 1].copy()
#         speaker_feature = sorted(set(speaker_per_dialog))
#         speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}
#         speaker_decoder = {i: feature for i, feature in enumerate(speaker_feature)}

#         encoded_speaker = speaker_per_dialog.replace(speaker_encoder)
#         encodedSpeakersTrain.append(encoded_speaker)

#     # Save encoded speaker list and ranges to a file using pickle
#     file_path = 'data/dump/' + dataset_path + '/speaker_encoder_train.pkl'
#     with open(file_path, 'wb') as file:
#         pickle.dump([encodedSpeakersTrain, rangesTrain], file)
# else:
#     # Load encoded speaker list and ranges from the existing pickle file
#     file = open('data/dump/' + dataset_path + '/speaker_encoder_train.pkl', "rb")
#     encodedSpeakersTrain, rangesTrain = pickle.load(file)
#     file.close()

Encoding speaker on test set

In [143]:
# # Check if the file exists
# checkFile = os.path.isfile("data/dump/" + dataset_path + "/speaker_encoder_test.pkl")
# encodedSpeakersTest = []

# if not checkFile:
#     for range_pair in rangesTest:
#         start_idx, end_idx = range_pair
#         speaker_per_dialog = TestData['Speaker'][start_idx:end_idx + 1].copy()
#         speaker_feature = sorted(set(speaker_per_dialog))
#         speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}
#         speaker_decoder = {i: feature for i, feature in enumerate(speaker_feature)}

#         encoded_speaker = speaker_per_dialog.replace(speaker_encoder)
#         encodedSpeakersTest.append(encoded_speaker)

#     # Save encoded speaker list and ranges to a file using pickle
#     file_path = 'data/dump/' + dataset_path + '/speaker_encoder_test.pkl'
#     with open(file_path, 'wb') as file:
#         pickle.dump([encodedSpeakersTest, rangesTest], file)
# else:
#     # Load encoded speaker list and ranges from the existing pickle file
#     file = open('data/dump/' + dataset_path + '/speaker_encoder_test.pkl', "rb")
#     encodedSpeakersTest, rangesTest = pickle.load(file)
#     file.close()

In [144]:
# # Check if the file exists
# checkFile = os.path.isfile("data/dump/" + dataset_path + "/speaker_encoder_dev.pkl")
# encodedSpeakersDev = []

# if not checkFile:
#     for range_pair in rangesDev:
#         start_idx, end_idx = range_pair
#         speaker_per_dialog = DevData['Speaker'][start_idx:end_idx + 1].copy()
#         speaker_feature = sorted(set(speaker_per_dialog))
#         speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}
#         speaker_decoder = {i: feature for i, feature in enumerate(speaker_feature)}

#         encoded_speaker = speaker_per_dialog.replace(speaker_encoder)
#         encodedSpeakersDev.append(encoded_speaker)

#     # Save encoded speaker list and ranges to a file using pickle
#     file_path = 'data/dump/' + dataset_path + '/speaker_encoder_dev.pkl'
#     with open(file_path, 'wb') as file:
#         pickle.dump([encodedSpeakersDev, rangesDev], file)
# else:
#     # Load encoded speaker list and ranges from the existing pickle file
#     with open('data/dump/' + dataset_path + '/speaker_encoder_dev.pkl', "rb") as file:
#         encodedSpeakersDev, rangesDev = pickle.load(file)

Apply the encoded speakers in the Train and Test data

In [145]:
# Flatten the list of encoded speakers if it is nested
# encodedSpeakersTrain_flat = [item for sublist in encodedSpeakersTrain for item in sublist]
# encodedSpeakersTest_flat  = [item for sublist in encodedSpeakersTest for item in sublist]
# encodedSpeakersDev_flat  = [item for sublist in encodedSpeakersDev for item in sublist]

# Replace the 'Speaker' column in TrainData with the encoded speaker data
# TrainData['Speaker'] = encodedSpeakersTrain_flat
# TestData['Speaker'] = encodedSpeakersTest_flat
# DevData['Speaker'] = encodedSpeakersDev_flat

In [146]:
TrainData['Speaker'].unique()

array(['Chandler', 'The Interviewer', 'Joey', 'Rachel', 'Monica',
       'Phoebe', 'Ross', 'Sergei', 'Customer', 'Jade', 'Mona', 'Charlie',
       'Paleontologist', 'Professore Clerk', 'Caitlin', 'Nurse',
       'Mr. Treeger', 'Carol', 'The Casting Director', 'Emily',
       'Elizabeth', 'Paul', 'The Dry Cleaner', 'Joey and Chandler',
       'Kate', 'The Director', 'Mr. Tribbiani', 'Guru Saj', 'Wayne',
       'Richard', 'Dina', 'Bobby', 'Danny', 'Krista', 'Jill', 'Stevens',
       'Doug', 'Bob', 'Mr. Franklin', 'Director', 'Janice', 'Tony',
       'Peter', 'Ticket Counter Attendant', 'Dr. Long', 'Charlton Heston',
       'Joshua', 'Nancy', 'Kim', 'Joanna', 'Cassie', 'Dr. Rhodes',
       'Dr. Johnson', 'Kristen', 'Jester', 'Sarah', 'Pete',
       'The Singing Man', 'Commercial', 'Mark', 'A Female Student', 'All',
       'Cliff', 'Tag', 'Eric', 'Dr. Green', 'Mr. Heckles', 'Mr. Geller',
       'Sophie', 'Singer', 'David', 'Hitchhiker', '1st Customer',
       '2nd Customer', '3rd Customer'

In [147]:
TestData['Speaker'].unique()

array(['Mark', 'Rachel', 'Ross', 'Steve', 'Phoebe', 'Monica', 'Joey',
       'Chandler', 'Marjorie', 'Jade', 'Fireman #1', 'Roger', 'Gary',
       'A Waiter', 'The Waiter', 'Janice', 'Duncan', 'Danny', 'Man',
       'Phoebe Sr', 'All', 'Carol', 'Dr. Franzblau', 'Student',
       'Mr. Geller', 'Mrs. Geller', 'Mrs. Bing', 'Mr. Bing', 'Leslie',
       'Phoebe and Leslie', 'Female Student', 'Judge', 'Chip', 'Kathy',
       'Gunther', 'Flight Attendant', 'Larry', 'Molly',
       'Ross and Chandler', 'Woman', 'The Director',
       'The Casting Director', 'Nurse', 'Emily', 'Frank Sr.', 'Host',
       'Mike', 'Wayne', 'Paul', 'Guest #1', 'Cliff', 'Lorraine', 'Guy',
       'Sarah', 'Lydia', 'The Instructor', 'Dana',
       'Ticket Counter Attendant', 'Brenda', 'Director',
       'Rachel and Bonnie', 'Bonnie', 'Mr. Waltham', 'Jill', 'Eric',
       'Frannie', 'Frank', 'Mona', 'Doug', 'Joanna', 'Sophie', 'Chloe',
       'The Teacher', 'The Stripper', 'Elizabeth', 'Tag', 'Female Clerk',
       'Ki

In [148]:
DevData['Speaker'].unique()

array(['Phoebe', 'Monica', 'Ross', 'Chandler', 'Joey', 'All', 'Rachel',
       'Estelle', 'Gary', 'Guy', 'Woman', 'Mrs. Green', 'Dr. Harad',
       'Frank', 'Alice', 'Bob', 'Whitney', 'Kyle', 'Passerby', 'Susan',
       'Cookie', 'Kori', 'Dr. Long', 'Carol', 'TV Announcer',
       'The Hot Girl', 'Jen', 'Monica and Rachel', 'The Dry Cleaner',
       'Carl', 'Waitress', 'Janine', 'Stage Manager', 'Lauren', 'Kate',
       'Mrs. Geller', 'Joanna', 'Tag', 'Max', 'Elizabeth', 'Charlie',
       'Doctor', 'Cliff', 'Jeannine', 'Ross and Rachel', 'Ursula'],
      dtype=object)

### Data Pre-Processing

In [149]:
# Download NLTK data (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Make a copy of X_train_utterances
train_utterances = TrainData['Utterance']
test_utterances = TestData['Utterance']
dev_utterances = DevData['Utterance']

# Initialize the lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\edayo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\edayo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\edayo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [150]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()

    # Tokenize
    words = word_tokenize(text)

    # Remove punctuation and non-alphabetic characters, and stop words
    words = [word for word in words if word.isalpha() and word not in stop_words]

    # Lemmatize
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join words back into a single string
    cleaned_text = ' '.join(words)
    return cleaned_text


# Apply the clean_text function to each utterance
train_utterances = train_utterances.apply(clean_text)
test_utterances = test_utterances.apply(clean_text)
dev_utterances = dev_utterances.apply(clean_text)

In [151]:
train_utterances.head()

0    also point person company transition system
1                                 must hand full
2                                               
3                       let talk little bit duty
4                                     duty right
Name: Utterance, dtype: object

In [152]:
test_utterances.head()

0                            coffee mug number bottom
1    oh monica keep track way one missing like number
2                                                    
3                                                okay
4                                   ross say elevator
Name: Utterance, dtype: object

In [153]:
dev_utterances.head()

0                  oh god lost totally lost
1                                          
2    could go bank close account cut source
3                                    genius
4                                    genius
Name: Utterance, dtype: object

Check for empty strings

In [154]:
# Count the empty string values in X_train_utterances
empty_string_count_train = (train_utterances == '').sum()
empty_string_count_test = (test_utterances == '').sum()
empty_string_count_dev = (dev_utterances == '').sum()

print(f"Empty in Train Utterances: {empty_string_count_train}")
print(f"Empty in Test Utterances: {empty_string_count_test}")
print(f"Empty in Dev Utterances: {empty_string_count_dev}")

Empty in Train Utterances: 664
Empty in Test Utterances: 170
Empty in Dev Utterances: 89


Update 'Utterance' column and remove all empty strings

In [155]:
# Update X_train with the cleaned utterances
TrainData['Utterance'] = train_utterances
TestData['Utterance'] = test_utterances
DevData['Utterance'] = dev_utterances

# Drop rows where Utterance is an empty string. Empty string value is ''.
TrainData = TrainData[TrainData['Utterance'] != '']
TestData = TestData[TestData['Utterance'] != '']
DevData = DevData[DevData['Utterance'] != '']

# Reset the index
TrainData.reset_index(drop=True, inplace=True)
TestData.reset_index(drop=True, inplace=True)
DevData.reset_index(drop=True, inplace=True)

print(f"Train data shape: {TrainData.shape}")
print(f"Test data shape: {TestData.shape}")
print(f"Dev data shape: {DevData.shape}")

Train data shape: (12176, 12)
Test data shape: (3230, 12)
Dev data shape: (1373, 12)


<b>Take note of new shape of Train and Test Data after dropping the values

Remove rows with *n* or below number of words and create a new **Dropped Words Dataset**

In [156]:
n = 2 # This number removes rows with n or less words. (Retains rows with more than n words)

# Retain rows with more than n words
Dropwords_TrainData = TrainData[TrainData['Utterance'].apply(lambda x: len(x.split()) > n)]
Dropwords_TestData = TestData[TestData['Utterance'].apply(lambda x: len(x.split()) > n)]
Dropwords_DevData = DevData[DevData['Utterance'].apply(lambda x: len(x.split()) > n)]

Dropwords_TrainData.reset_index(drop=True, inplace=True)
Dropwords_TestData.reset_index(drop=True, inplace=True)
Dropwords_DevData.reset_index(drop=True, inplace=True)

print(f"After removing rows that is {n} words or below:")
print(f"Train shape : {Dropwords_TrainData.shape}")
print(f"Test shape: {Dropwords_TestData.shape}")
print(f"Test shape: {Dropwords_DevData.shape}")

After removing rows that is 2 words or below:
Train shape : (7354, 12)
Test shape: (1973, 12)
Test shape: (858, 12)


Get random sample from train and test data

In [157]:
# Randomly select 10 rows
random_sample = TrainData.sample(n=10)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
766,sound great,6
7825,guy got anything eat went johnos chicken closed,6
4314,take hand,4
9026,something wrong,4
2007,every year,4
5576,definitely,3
5739,really think room,4
10360,true,4
4190,double,3
10498,yeah feel kinda responsible,0


In [158]:
# Randomly select 10 rows
random_sample = TestData.sample(n=10)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
1890,joke well think hef would disagree sent check ...,4
2675,yeah,4
2463,go chandler,4
1278,love let tell friend,3
1776,know face ross,0
1772,know every year say gon na send holiday card n...,4
2453,yes saying actual word,4
1749,oh great joey want check picture mona ice skating,3
3095,oh god saw oh,6
89,oh bad really bad thing burned as,5


In [159]:
# Randomly select 10 rows
random_sample = DevData.sample(n=10)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
630,yeah reason get big,4
1185,come taste bad,4
1198,know knew probably give real name either,4
43,exactly looking hmm,4
703,hey mon,3
474,ok talking,4
173,musta read sonogram wrong,6
405,oh know know one joey got monica turkey stuck ...,3
349,idiot,1
300,leave,5


In [160]:
# Randomly select 10 rows
random_sample = Dropwords_TrainData.sample(n=10)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
714,right definitely taste nutmeg,4
1427,well least hearing first time fifth grade hall...,1
4503,oh god really hope,3
4562,oh please could,4
5896,yeah okay scott,4
1294,okay sir let see got right,4
2409,musta sweeping found broom hand,4
5482,little toast ding ding,3
3792,taping people day,4
6399,like bug bunny cartoon bug playing position ri...,3


In [161]:
# Randomly select 10 rows
random_sample = Dropwords_TestData.sample(n=10)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
1528,hi remember u,4
928,people people people,4
340,one hot chicky,3
784,know creep went date go find new one,1
522,think next patient far along,4
1921,think hear sound,3
1360,well old dog year think snoopy still allowed f...,5
150,right look bottom line ross fixable act fast okay,4
1363,oh sure need train somebody new,3
1937,well anything copy going,4


In [162]:
# Randomly select 10 rows
random_sample = Dropwords_DevData.sample(n=10)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
282,heaven door sure pressing ear listening intently,4
710,good bone bruise right puncture wound ring,4
339,hey call get okay,4
628,oh hi max hey know everybody,4
343,jen know may sound uh would maybe wan na grab ...,4
66,go barn undress hold,4
812,joey talking terrific actor,4
372,know oh god genius,3
97,already baby leave alone,0
427,tell look adrienne baby gon na want meet,4


### Division of X and y data

In [163]:
# Create DataFrame for target labels
y_train = pd.DataFrame()
y_test = pd.DataFrame()
y_dev = pd.DataFrame()

X_train = TrainData
X_test = TestData
X_dev = DevData
    
y_train["Emotion"] = TrainData["Emotion"].copy()
y_test["Emotion"] = TestData["Emotion"].copy()
y_dev["Emotion"] = DevData["Emotion"].copy()

y_train["Dialogue_ID"] = TrainData["Dialogue_ID"].copy()
y_test["Dialogue_ID"] = TestData["Dialogue_ID"].copy()
y_dev["Dialogue_ID"] = DevData["Dialogue_ID"].copy()

In [164]:
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of y_train: {y_train.shape}')
print('--')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_test: {y_test.shape}')
print('--')
print(f'Shape of X_dev: {X_dev.shape}')
print(f'Shape of y_dev: {y_dev.shape}')

Shape of X_train: (12176, 12)
Shape of y_train: (12176, 2)
--
Shape of X_test: (3230, 12)
Shape of y_test: (3230, 2)
--
Shape of X_dev: (1373, 12)
Shape of y_dev: (1373, 2)


In [165]:
# Create DataFrame for target labels
y_dropped_train = pd.DataFrame()
y_dropped_test = pd.DataFrame()
y_dropped_dev = pd.DataFrame()

X_dropped_train = Dropwords_TrainData
X_dropped_test = Dropwords_TestData
X_dropped_dev = Dropwords_DevData

y_dropped_train["Emotion"] = Dropwords_TrainData["Emotion"].copy()
y_dropped_test["Emotion"] = Dropwords_TestData["Emotion"].copy()
y_dropped_dev["Emotion"] = Dropwords_DevData["Emotion"].copy()

y_dropped_train["Dialogue_ID"] = Dropwords_TrainData["Dialogue_ID"].copy()
y_dropped_test["Dialogue_ID"] = Dropwords_TestData["Dialogue_ID"].copy()
y_dropped_dev["Dialogue_ID"] = Dropwords_DevData["Dialogue_ID"].copy()

In [166]:
print(f'Shape of X_dropped_train: {X_dropped_train.shape}')
print(f'Shape of y_dropped_train: {y_dropped_train.shape}')
print('--')
print(f'Shape of X_dropped_test: {X_dropped_test.shape}')
print(f'Shape of y_dropped_test: {y_dropped_test.shape}')
print('--')
print(f'Shape of X_dropped_dev: {X_dropped_dev.shape}')
print(f'Shape of y_dropped_dev: {y_dropped_dev.shape}')

Shape of X_dropped_train: (7354, 12)
Shape of y_dropped_train: (7354, 2)
--
Shape of X_dropped_test: (1973, 12)
Shape of y_dropped_test: (1973, 2)
--
Shape of X_dropped_dev: (858, 12)
Shape of y_dropped_dev: (858, 2)


### Output data to new csv

##### Label Encoding Data

In [167]:
# Check if the file already exists
checkFile1 = os.path.isfile("data/dump/" + dataset_path + "/labels_train.pkl")
checkFile2 = os.path.isfile("data/dump/" + dataset_path + "/labels_test.pkl")

if key:
    pickle.dump(X_train["Emotion"], open('data/dump/' + dataset_path + '/labels_train.pkl', 'wb'))
    pickle.dump(X_test["Emotion"], open('data/dump/' + dataset_path + '/labels_test.pkl', 'wb'))

##### Training and Testing Data

In [168]:
def exportDataToCSV(df, name):
    path = "data/" + dataset_path + "/" + name + ".csv"
    df.to_csv(path)


In [171]:
exportDataToCSV(X_train, "train_sent_emo_dya")
exportDataToCSV(y_train, "y_train")

exportDataToCSV(X_test, "test_sent_emo_dya")
exportDataToCSV(y_test, "y_test")

exportDataToCSV(X_dev, "dev_sent_emo_dya")
exportDataToCSV(y_dev, "y_dev")

In [170]:
# exportDataToCSV(X_dropped_train, "dropwords_X_train")
# exportDataToCSV(y_dropped_train, "dropwords_y_train")

# exportDataToCSV(X_dropped_test, "dropwords_X_test")
# exportDataToCSV(y_dropped_test, "dropwords_y_test")

# exportDataToCSV(X_dropped_dev, "dropwords_X_dev")
# exportDataToCSV(y_dropped_dev, "dropwords_y_dev")