In [2]:
import pandas as pd
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [3]:
# Restore the previously prepared sentiment DataFrame from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
pickle_path = './data/sentiment_data.pkl'
sentiment_data_df = pd.read_pickle(pickle_path)

In [4]:
# Build four alternative views of the token lists. Each view is computed directly
# from 'tokenized_text' (the steps are independent, not chained), so every new
# column shows the effect of exactly one preprocessing technique.

# Remove punctuation
# A token is treated as punctuation when *every* character in it is a punctuation
# mark. The earlier test (`word not in string.punctuation`) was a substring check
# against one long string, so it only caught tokens that happen to be a contiguous
# slice of that constant; multi-character tokens such as '``', "''", '--' or '...'
# were kept. Everything the old check removed is still removed here.
punctuation_chars = frozenset(string.punctuation)


def _is_punctuation_token(token):
    """Return True when `token` is made up solely of punctuation characters.

    An empty token also returns True, so empty strings are dropped (the
    substring check dropped them as well).
    """
    return all(char in punctuation_chars for char in token)


sentiment_data_df['no_punctuation_text'] = sentiment_data_df['tokenized_text'].apply(
    lambda tokens: [word for word in tokens if not _is_punctuation_token(word)]
)

# Remove stopwords
# 'but', 'not', 'no' and 'nor' are taken out of the stopword set so they survive
# filtering (presumably because they carry sentiment polarity).
custom_stopwords = set(stopwords.words('english')) - {'but', 'not', 'no', 'nor'}
sentiment_data_df['no_stopwords_text'] = sentiment_data_df['tokenized_text'].apply(
    lambda tokens: [word for word in tokens if word not in custom_stopwords]
)

# Perform stemming
stemmer = PorterStemmer()
sentiment_data_df['stemmed_text'] = sentiment_data_df['tokenized_text'].apply(
    lambda tokens: [stemmer.stem(word) for word in tokens]
)

# Perform lemmatization
# NOTE(review): lemmatize() is called without a part-of-speech argument, so every
# token is lemmatized as a noun; the sample output shows 'as' turning into 'a'.
# Pass POS tags if that matters downstream.
lemmatizer = WordNetLemmatizer()
sentiment_data_df['lemmatized_text'] = sentiment_data_df['tokenized_text'].apply(
    lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
)

In [5]:
# Preview the first five rows, including the newly derived token columns.
sentiment_data_df.head(n=5)

Unnamed: 0,sentence_index,sentence,phrase,phrase ids,sentiment values,splitset_label,tokenized_text,no_punctuation_text,no_stopwords_text,stemmed_text,lemmatized_text
0,1,The Rock is destined to be the 21st Century 's...,The Rock is destined to be the 21st Century 's...,226166.0,0.69444,1,"[the, rock, is, destined, to, be, the, 21st, c...","[the, rock, is, destined, to, be, the, 21st, c...","[rock, destined, 21st, century, 's, new, ``, c...","[the, rock, is, destin, to, be, the, 21st, cen...","[the, rock, is, destined, to, be, the, 21st, c..."
1,2,The gorgeously elaborate continuation of `` Th...,The gorgeously elaborate continuation of `` Th...,226300.0,0.83333,1,"[the, gorgeously, elaborate, continuation, of,...","[the, gorgeously, elaborate, continuation, of,...","[gorgeously, elaborate, continuation, ``, lord...","[the, gorgeous, elabor, continu, of, ``, the, ...","[the, gorgeously, elaborate, continuation, of,..."
2,3,Effective but too-tepid biopic,Effective but too-tepid biopic,13995.0,0.51389,2,"[effective, but, too-tepid, biopic]","[effective, but, too-tepid, biopic]","[effective, but, too-tepid, biopic]","[effect, but, too-tepid, biopic]","[effective, but, too-tepid, biopic]"
3,4,If you sometimes like to go to the movies to h...,If you sometimes like to go to the movies to h...,14123.0,0.73611,2,"[if, you, sometimes, like, to, go, to, the, mo...","[if, you, sometimes, like, to, go, to, the, mo...","[sometimes, like, go, movies, fun, ,, wasabi, ...","[if, you, sometim, like, to, go, to, the, movi...","[if, you, sometimes, like, to, go, to, the, mo..."
4,5,"Emerges as something rare , an issue movie tha...","Emerges as something rare , an issue movie tha...",13999.0,0.86111,2,"[emerges, as, something, rare, ,, an, issue, m...","[emerges, as, something, rare, an, issue, movi...","[emerges, something, rare, ,, issue, movie, 's...","[emerg, as, someth, rare, ,, an, issu, movi, t...","[emerges, a, something, rare, ,, an, issue, mo..."


In [6]:
# Print a structural summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
sentiment_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11286 entries, 0 to 11854
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   sentence_index       11286 non-null  int64  
 1   sentence             11286 non-null  object 
 2   phrase               11286 non-null  object 
 3   phrase ids           11286 non-null  float64
 4   sentiment values     11286 non-null  float64
 5   splitset_label       11286 non-null  int64  
 6   tokenized_text       11286 non-null  object 
 7   no_punctuation_text  11286 non-null  object 
 8   no_stopwords_text    11286 non-null  object 
 9   stemmed_text         11286 non-null  object 
 10  lemmatized_text      11286 non-null  object 
dtypes: float64(2), int64(2), object(7)
memory usage: 1.0+ MB


In [7]:
max_features = 10000        # vocabulary cap passed to Tokenizer(num_words=...)
max_sequence_length = 100   # every sequence is padded/truncated to this length

# The Keras Tokenizer is fed whitespace-joined strings. Build them once and reuse
# them for fitting and conversion instead of re-joining every row twice.
joined_texts = sentiment_data_df['tokenized_text'].apply(' '.join)

# Tokenize the text
# Fit the vocabulary on the training split only (splitset_label == 1) so that word
# statistics from the test/dev rows cannot leak into the word index. Because no
# oov_token is configured, words that never occur in training are dropped from the
# test/dev sequences.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(joined_texts[sentiment_data_df['splitset_label'] == 1])
sequences = tokenizer.texts_to_sequences(joined_texts)

# Turn the variable-length index lists into one 2-D array with a row per
# DataFrame row (same order), padded/truncated to max_sequence_length.
sentiment_data = pad_sequences(sequences, maxlen=max_sequence_length)

In [9]:
# Partition the padded sequences by the 'splitset_label' column
# (1 -> train, 2 -> test, 3 -> dev). The label column is read once as a NumPy
# array so the three boolean masks are applied positionally to `sentiment_data`.
split_labels = sentiment_data_df['splitset_label'].to_numpy()

train_data = sentiment_data[split_labels == 1]
test_data = sentiment_data[split_labels == 2]
dev_data = sentiment_data[split_labels == 3]

# Report how many rows ended up in each split.
print('Train data shape: ', train_data.shape)
print('Test data shape: ', test_data.shape)
print('Dev data shape: ', dev_data.shape)

Train data shape:  (8117, 100)
Test data shape:  (2125, 100)
Dev data shape:  (1044, 100)
