# Dataset Preparation

### Importing of libraries

In [1]:
import pandas as pd
import os
import pickle

import collections
from vocab import Vocab, Vectors
from wordebd import WORDEBD

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

### Importing of data

In [3]:
# Load the raw train and test splits (note: the two files are read with different encodings)
TrainData = pd.read_csv(
    'data/dataset_original/train_sent_emo_dya.csv',
    encoding='shift_jis',
)
TestData = pd.read_csv(
    'data/dataset_original/test_sent_emo_dya.csv',
    encoding='utf-8',
)

# Report the (rows, columns) shape of each split
print(TrainData.shape, TestData.shape, sep='\n')

(12840, 12)
(3400, 12)


In [4]:
# Preview the first rows of the raw training data
TrainData.head()

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Old_Dialogue_ID,Old_Utterance_ID,Season,Episode,StartTime,EndTime
0,also I was the point person on my company’s tr...,Chandler,neutral,neutral,0,0,0,0,8,21,"00:16:16,059","00:16:21,731"
1,You must’ve had your hands full.,The Interviewer,neutral,neutral,0,1,0,1,8,21,"00:16:21,940","00:16:23,442"
2,That I did. That I did.,Chandler,neutral,neutral,0,2,0,2,8,21,"00:16:23,442","00:16:26,389"
3,So let’s talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,0,3,8,21,"00:16:26,820","00:16:29,572"
4,My duties? All right.,Chandler,surprise,positive,0,4,0,4,8,21,"00:16:34,452","00:16:40,917"


### Dropping of Features

Drop Old_Dialogue_ID, Old_Utterance_ID, Season, Episode, StartTime, and EndTime

In [5]:
# Columns that are not needed downstream. They are listed by name rather than
# by position so the selection does not silently change if the CSV column
# order ever differs.
drop_features = [
    'Old_Dialogue_ID',
    'Old_Utterance_ID',
    'Season',
    'Episode',
    'StartTime',
    'EndTime',
]

In [6]:
# Remove the unused columns from both the train and test frames
TrainData = TrainData.drop(columns=drop_features)
TestData = TestData.drop(columns=drop_features)

In [7]:
# Whether the pickles already exist (informational only; the branch taken
# below is controlled by `key`, not by these flags)
checkFile1 = os.path.isfile("data/dump/label_encoder.pkl")
checkFile2 = os.path.isfile("data/dump/label_decoder.pkl")

# True  -> rebuild the mappings from the training labels and overwrite the pickles
# False -> load the previously saved mappings
key = True

if key:
    # Sorted so that the label -> index assignment is deterministic
    labels = sorted(set(TrainData.Emotion))
    labelEncoder = {label: i for i, label in enumerate(labels)}
    labelDecoder = {i: label for i, label in enumerate(labels)}

    # Context managers make sure the handles are flushed and closed
    with open('data/dump/label_encoder.pkl', 'wb') as file:
        pickle.dump(labelEncoder, file)
    with open('data/dump/label_decoder.pkl', 'wb') as file:
        pickle.dump(labelDecoder, file)
else:
    # NOTE: pickle.load executes arbitrary code; only load files produced by this notebook
    with open('data/dump/label_encoder.pkl', 'rb') as file:
        labelEncoder = pickle.load(file)
    with open('data/dump/label_decoder.pkl', 'rb') as file:
        labelDecoder = pickle.load(file)

In [8]:
# Show the emotion-label -> integer mapping
labelEncoder

{'anger': 0,
 'disgust': 1,
 'fear': 2,
 'joy': 3,
 'neutral': 4,
 'sadness': 5,
 'surprise': 6}

In [9]:
def encode_labels(encoder, l):
    """Look up the encoded value of a single label.

    Args:
        encoder: Mapping from label to its encoded value.
        l: The label to encode.

    Returns:
        The value stored in ``encoder`` under ``l``.

    Raises:
        KeyError: If ``l`` is not a key of ``encoder``.
    """
    encoded_value = encoder[l]
    return encoded_value

In [10]:
# Replace every emotion name in the "Emotion" column with its integer code
# (both splits use the encoder built from the training labels)
TrainData["Emotion"] = TrainData["Emotion"].map(lambda emotion: encode_labels(labelEncoder, emotion))
TestData["Emotion"] = TestData["Emotion"].map(lambda emotion: encode_labels(labelEncoder, emotion))

In [11]:
def find_value_ranges(lst):
    """Split a sequence into runs of consecutive equal values.

    Args:
        lst: Any iterable of comparable values (list, pandas Series, ...).

    Returns:
        A list of ``(start, end)`` tuples, one per run, where ``start`` and
        ``end`` are inclusive positional indices. An empty input yields an
        empty list.
    """
    # Materialise once so indexing is positional even for a pandas Series
    # whose index is not the default 0..n-1 range.
    values = list(lst)
    if not values:
        # Without this guard the "last range" below would be the bogus (0, -1)
        return []

    value_ranges = []
    start_index = 0

    for i in range(1, len(values)):
        if values[i] != values[i - 1]:
            value_ranges.append((start_index, i - 1))
            start_index = i

    # Add the last range
    value_ranges.append((start_index, len(values) - 1))

    return value_ranges

In [12]:
# Inclusive (start, end) row ranges of each run of identical Dialogue_ID values
rangesTrain = find_value_ranges(TrainData["Dialogue_ID"])
rangesTest = find_value_ranges(TestData["Dialogue_ID"])

# Number of ranges found in train, then in test
print(len(rangesTrain), len(rangesTest), sep='\n')

2160
577


### Speaker Encoder

Encoding speaker on train set

In [13]:
# Whether the pickle already exists (informational only; `key` decides the branch)
checkFile = os.path.isfile("data/dump/speaker_encoder_train.pkl")
encodedSpeakersTrain = []

if key:
    for start_idx, end_idx in rangesTrain:
        # Speakers of one dialogue; the range bounds are inclusive positions
        speaker_per_dialog = TrainData['Speaker'].iloc[start_idx:end_idx + 1]

        # Speaker ids are local to each dialogue: 0, 1, ... in sorted name order
        speaker_feature = sorted(set(speaker_per_dialog))
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}

        # map() yields an integer Series directly; replace() with a dict relies
        # on implicit object -> int downcasting that newer pandas deprecates
        encoded_speaker = speaker_per_dialog.map(speaker_encoder)
        encodedSpeakersTrain.append(encoded_speaker)

    # Save encoded speaker list and ranges to a file using pickle
    file_path = 'data/dump/speaker_encoder_train.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump([encodedSpeakersTrain, rangesTrain], file)
else:
    # Load encoded speaker list and ranges from the existing pickle file
    # NOTE: pickle.load executes arbitrary code; only load files produced by this notebook
    with open('data/dump/speaker_encoder_train.pkl', "rb") as file:
        encodedSpeakersTrain, rangesTrain = pickle.load(file)

Encoding speaker on test set

In [14]:
# Whether the pickle already exists (informational only; `key` decides the branch)
checkFile = os.path.isfile("data/dump/speaker_encoder_test.pkl")
encodedSpeakersTest = []

if key:
    for start_idx, end_idx in rangesTest:
        # Speakers of one dialogue; the range bounds are inclusive positions
        speaker_per_dialog = TestData['Speaker'].iloc[start_idx:end_idx + 1]

        # Speaker ids are local to each dialogue: 0, 1, ... in sorted name order
        speaker_feature = sorted(set(speaker_per_dialog))
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}

        # map() yields an integer Series directly; replace() with a dict relies
        # on implicit object -> int downcasting that newer pandas deprecates
        encoded_speaker = speaker_per_dialog.map(speaker_encoder)
        encodedSpeakersTest.append(encoded_speaker)

    # Save encoded speaker list and ranges to a file using pickle
    file_path = 'data/dump/speaker_encoder_test.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump([encodedSpeakersTest, rangesTest], file)
else:
    # Load encoded speaker list and ranges from the existing pickle file
    # NOTE: pickle.load executes arbitrary code; only load files produced by this notebook
    with open('data/dump/speaker_encoder_test.pkl', "rb") as file:
        encodedSpeakersTest, rangesTest = pickle.load(file)

Apply the encoded speakers in the Train and Test data

In [15]:
# Concatenate the per-dialogue speaker codes into one flat list per split
encodedSpeakersTrain_flat = []
for dialog_codes in encodedSpeakersTrain:
    encodedSpeakersTrain_flat.extend(dialog_codes)

encodedSpeakersTest_flat = []
for dialog_codes in encodedSpeakersTest:
    encodedSpeakersTest_flat.extend(dialog_codes)

# Overwrite the 'Speaker' column of each frame with the per-dialogue codes
TrainData['Speaker'] = encodedSpeakersTrain_flat
TestData['Speaker'] = encodedSpeakersTest_flat

In [16]:
# Preview the training data after the column drop and the label/speaker encoding
TrainData.head()

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID
0,also I was the point person on my company’s tr...,0,4,neutral,0,0
1,You must’ve had your hands full.,1,4,neutral,0,1
2,That I did. That I did.,0,4,neutral,0,2
3,So let’s talk a little bit about your duties.,1,4,neutral,0,3
4,My duties? All right.,0,6,positive,0,4


In [17]:
# Preview the test data after the column drop and the label/speaker encoding
TestData.head()

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID
0,Why do all you’re coffee mugs have numbers on ...,0,6,positive,0,0
1,Oh. That’s so Monica can keep track. That way ...,1,0,negative,0,1
2,Y'know what?,1,4,neutral,0,2
3,Okay.,1,4,neutral,1,0
4,"Ross, didn't you say that there was an elevato...",0,4,neutral,1,1


### Data Pre-Processing

In [18]:
# Fetch the NLTK resources used by the cleaning step
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Utterance columns to be cleaned (these reference the frames' columns; no copy is made)
train_utterances = TrainData['Utterance']
test_utterances = TestData['Utterance']

# English stop-word set and the lemmatizer shared by clean_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
def clean_text(text):
    """Normalise one utterance into a space-separated string of lemmas.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens that
    are not purely alphabetic or that appear in ``stop_words`` are discarded,
    and the remaining tokens are lemmatised with ``lemmatizer``.

    Args:
        text: The raw utterance string.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when no
        token survives the filtering).
    """
    tokens = word_tokenize(text.lower())

    # Keep alphabetic, non-stop-word tokens only
    kept_tokens = (tok for tok in tokens if tok.isalpha() and tok not in stop_words)

    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept_tokens)


# Clean every utterance in both splits
train_utterances = train_utterances.map(clean_text)
test_utterances = test_utterances.map(clean_text)

In [20]:
# Preview the cleaned training utterances
train_utterances.head()

0    also point person company transition system
1                                 must hand full
2                                               
3                       let talk little bit duty
4                                     duty right
Name: Utterance, dtype: object

In [21]:
# Preview the cleaned test utterances
test_utterances.head()

0                            coffee mug number bottom
1    oh monica keep track way one missing like number
2                                                    
3                                                okay
4                                   ross say elevator
Name: Utterance, dtype: object

Check for empty strings

In [22]:
# Count the utterances that became an empty string after cleaning
empty_string_count_train = train_utterances.eq('').sum()
empty_string_count_test = test_utterances.eq('').sum()

print(f"Empty in Train Utterances: {empty_string_count_train}")
print(f"Empty in Test Utterances: {empty_string_count_test}")

Empty in Train Utterances: 664
Empty in Test Utterances: 170


Update 'Utterance' column and remove all empty strings

In [23]:
# Store the cleaned utterances back into the frames.
# NOTE: this assignment aligns on the index and the index is reset below, so
# re-running this cell on its own (without re-running the cells above) would
# pair rows with the wrong cleaned text.
TrainData['Utterance'] = train_utterances
TestData['Utterance'] = test_utterances

# Keep only rows whose cleaned utterance is non-empty, then renumber the rows
TrainData = TrainData.loc[TrainData['Utterance'].ne('')].reset_index(drop=True)
TestData = TestData.loc[TestData['Utterance'].ne('')].reset_index(drop=True)

print(f"Train data shape: {TrainData.shape}")
print(f"Test data shape: {TestData.shape}")

Train data shape: (12176, 6)
Test data shape: (3230, 6)


**Take note of the new shapes of the Train and Test data after dropping the rows with empty utterances.**

Remove rows whose utterance has *n* or fewer words and store the result as a new **Dropped Words Dataset**

In [24]:
n = 2  # Rows with n or fewer words are removed (rows with more than n words are kept)

# Number of whitespace-separated words in each cleaned utterance
train_word_counts = TrainData['Utterance'].str.split().str.len()
test_word_counts = TestData['Utterance'].str.split().str.len()

# Keep the rows with more than n words and renumber them
Dropwords_TrainData = TrainData[train_word_counts.gt(n)].reset_index(drop=True)
Dropwords_TestData = TestData[test_word_counts.gt(n)].reset_index(drop=True)

print(f"After removing rows that is {n} words or below:")
print(f"Train shape : {Dropwords_TrainData.shape}")
print(f"Test shape: {Dropwords_TestData.shape}")

After removing rows that is 2 words or below:
Train shape : (7354, 6)
Test shape: (1973, 6)


Get random sample from train and test data

In [25]:
# Preview 10 random training rows; the fixed seed makes the preview
# reproducible under Restart & Run All
random_sample = TrainData.sample(n=10, random_state=42)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
9788,hope,4
10600,ask guy question ever think alan maybe sometimes,4
2188,know ended really good time,3
5120,bra,4
633,mean important,4
7485,yes oh,3
5423,like sweater,4
10368,whoa whoa whoa treeger,6
7237,laugh play,3
5256,give puck,0


In [26]:
# Preview 10 random test rows; the fixed seed makes the preview
# reproducible under Restart & Run All
random_sample = TestData.sample(n=10, random_state=42)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
2923,ahh got big relationship looking thing serious,5
337,could press please,4
1308,uh rach move,4
372,know hug kinda roll away oh god,1
2375,oh monica still going thing,5
1858,oh found,3
304,yeah,3
1118,hey joey aww remembered even though big star,3
2000,oh god know father,6
1539,could help something,4


In [27]:
# Preview 10 random rows of the dropped-words training set; the fixed seed
# makes the preview reproducible under Restart & Run All
random_sample = Dropwords_TrainData.sample(n=10, random_state=42)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
3436,umm know hometown,4
3443,realize classroom full student,0
7126,anybody understand gon na baby soon,6
6821,oh oh god,6
2789,hardest thing ever done life,4
1000,oh well take place took stereo cause thing week,1
981,okay stay married,3
1805,avoiding want tell father,5
1911,joey please think best forget,2
7298,still touch anyone high school,4


In [28]:
# Preview 10 random rows of the dropped-words test set; the fixed seed
# makes the preview reproducible under Restart & Run All
random_sample = Dropwords_TestData.sample(n=10, random_state=42)[["Utterance", "Emotion"]]

random_sample

Unnamed: 0,Utterance,Emotion
1326,right well finish coffee let go,4
741,oh yeah smokey joe got half way highway collapsed,0
1485,bob bob bob hell,6
63,fine sorry loss,0
1719,photographer seemed really dull,4
399,based fact geller intravenous drug user,4
281,right fine fine fine fine fine got something l...,0
198,oh see suit making point,4
772,hey take back phrase last used,1
265,mean know waiting perfect guy,4


### Division of X and y data

In [29]:
# Features: the full frames (X_* are aliases of the frames, not copies)
X_train = TrainData
X_test = TestData

# Targets: an independent copy of the label plus the dialogue it belongs to
y_train = TrainData[["Emotion", "Dialogue_ID"]].copy()
y_test = TestData[["Emotion", "Dialogue_ID"]].copy()

In [30]:
# Report the shapes of the feature and target frames for both splits
print(
    f'Shape of X_train: {X_train.shape}',
    f'Shape of y_train: {y_train.shape}',
    '--',
    f'Shape of X_test: {X_test.shape}',
    f'Shape of y_test: {y_test.shape}',
    sep='\n',
)

Shape of X_train: (12176, 6)
Shape of y_train: (12176, 2)
--
Shape of X_test: (3230, 6)
Shape of y_test: (3230, 2)


In [31]:
# Features: the dropped-words frames (X_* are aliases of the frames, not copies)
X_dropped_train = Dropwords_TrainData
X_dropped_test = Dropwords_TestData

# Targets: an independent copy of the label plus the dialogue it belongs to
y_dropped_train = Dropwords_TrainData[["Emotion", "Dialogue_ID"]].copy()
y_dropped_test = Dropwords_TestData[["Emotion", "Dialogue_ID"]].copy()

In [32]:
# Report the shapes of the dropped-words feature and target frames
print(
    f'Shape of X_dropped_train: {X_dropped_train.shape}',
    f'Shape of y_dropped_train: {y_dropped_train.shape}',
    '--',
    f'Shape of X_dropped_test: {X_dropped_test.shape}',
    f'Shape of y_dropped_test: {y_dropped_test.shape}',
    sep='\n',
)

Shape of X_dropped_train: (7354, 6)
Shape of y_dropped_train: (7354, 2)
--
Shape of X_dropped_test: (1973, 6)
Shape of y_dropped_test: (1973, 2)


### Output data to new csv

##### Label Encoding Data

In [33]:
# Whether the pickles already exist (informational only; `key` decides whether to write)
checkFile1 = os.path.isfile("data/dump/labels_train.pkl")
checkFile2 = os.path.isfile("data/dump/labels_test.pkl")

if key:
    # Context managers make sure the handles are flushed and closed
    with open('data/dump/labels_train.pkl', 'wb') as file:
        pickle.dump(X_train["Emotion"], file)
    with open('data/dump/labels_test.pkl', 'wb') as file:
        pickle.dump(X_test["Emotion"], file)

##### Training and Testing Data

In [34]:
def exportDataToCSV(df, name):
    """Write a DataFrame to ``data/DatasetPreparation/<name>.csv``.

    The output directory is created when it does not exist yet, so the export
    also works on a fresh checkout. The frame's index is written as the first
    column (the ``to_csv`` default).

    Args:
        df: The DataFrame to export.
        name: File name without the ``.csv`` extension.
    """
    out_dir = 'data/DatasetPreparation'
    os.makedirs(out_dir, exist_ok=True)
    path = f'{out_dir}/{name}.csv'
    df.to_csv(path)


In [35]:
# Export the full train/test features and targets, one CSV per frame
for frame, csv_name in (
    (X_train, "X_train"),
    (y_train, "y_train"),
    (X_test, "X_test"),
    (y_test, "y_test"),
):
    exportDataToCSV(frame, csv_name)

In [36]:
# Export the dropped-words train/test features and targets, one CSV per frame
for frame, csv_name in (
    (X_dropped_train, "dropwords_X_train"),
    (y_dropped_train, "dropwords_y_train"),
    (X_dropped_test, "dropwords_X_test"),
    (y_dropped_test, "dropwords_y_test"),
):
    exportDataToCSV(frame, csv_name)