# Przetwarzanie języka naturalnego, PWR 2024
## Analiza sentymentu recenzji filmów
Autorzy:
- Dominik Ćwikowski 248914
- Konrad Maciejczyk 

Repozytorium:  
- https://github.com/F3mte/NLP-projekt

Zbiór danych:  
- https://www.kaggle.com/competitions/sentiment-analysis-on-movie-reviews

In [19]:
# import basic labriaries
import numpy as np 
import pandas as pd 
import os
import pickle
# disable warning
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

# list files from input directory
print(os.listdir("./input"))

['sampleSubmission.csv', 'test.tsv', 'train.tsv']


In [2]:
# NLTK (The Natural Language Toolkit) is a suite of libraries and programs for symbolic and 
# statistical natural language processing for English written in the Python programming language.
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from bs4 import BeautifulSoup
import re
# TQDM is a progress bar library with good support for nested loops and Jupyter/IPython notebooks.
from tqdm import tqdm

In [3]:
# We all, of course, know what Keras is
from keras.utils import to_categorical, pad_sequences
import random
from tensorflow.random import set_seed
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Dropout, Embedding, LSTM, GRU, SimpleRNN 
from keras.callbacks import EarlyStopping
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam, RMSprop, SGD, Adadelta, Adagrad
from keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.config import list_physical_devices
# set random seed for the session and also for tensorflow that runs in background for keras
set_seed(42)
random.seed(42)
# check available GPUs
print(f"Num GPUs Available: {len(list_physical_devices('GPU'))}")

Num GPUs Available: 0


In [4]:
# read and check training data
train= pd.read_csv("./input/train.tsv", sep="\t")
print(train.shape)
train.head()

(156060, 4)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
# read and check test data
test = pd.read_csv("./input/test.tsv", sep="\t")
print(test.shape)
test.head()

(66292, 3)


Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [6]:
# function for cleaning sentenes from data set
def clean_sentences(df):
    reviews = []
    df.dropna(axis=0, inplace=True)                            # remove empty values
    for sent in tqdm(df['Phrase']):
        review_text = BeautifulSoup(sent).get_text()           # remove html content
        review_text = re.sub("[^a-zA-Z]"," ", review_text)     # remove non-alphabetic characters
        words = word_tokenize(review_text.lower())             # tokenize the sentences
        lemma_words = [lemmatizer.lemmatize(i) for i in words] # lemmatize each word
        reviews.append(lemma_words)                            # add to list that will be returned
    return(reviews)

In [7]:
# install necessary resources
nltk.download('punkt')
nltk.download('wordnet')
#nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# cleaned reviews for both train and test set retrieved
train_sentences = clean_sentences(train)
test_sentences = clean_sentences(test)
print(f"Length of training sequence: {len(train_sentences)}")
print(f"Length of testing sequence:  {len(test_sentences)}")

  review_text = BeautifulSoup(sent).get_text()           # remove html content
100%|███████████████████████████████████████████████████████████████████████| 156060/156060 [00:13<00:00, 11741.40it/s]
100%|█████████████████████████████████████████████████████████████████████████| 66291/66291 [00:05<00:00, 12510.16it/s]

Length of training sequence: 156060
Length of testing sequence:  66291





In [9]:
# collect the dependent values and convert to one-hot encoded output using to_categorical
y_target = to_categorical(train.Sentiment.values)
num_classes = y_target.shape[1]
y_target

array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [10]:
# split into train and validation subsets
X_train, X_val, y_train, y_val = train_test_split(train_sentences, y_target, test_size=0.1, stratify=y_target)

In [11]:
# create set of unique words and find the longest sentence length to determine the padding needed for other sentences to match it
unique_words = set()
len_max = 0

for sent in tqdm(X_train):
    unique_words.update(sent)
    if(len_max<len(sent)):
        len_max = len(sent)

print(f"Number of unique words:         {len(list(unique_words))}")
print(f"Length of the longest sentence: {len_max}")

100%|█████████████████████████████████████████████████████████████████████| 140454/140454 [00:00<00:00, 1134990.83it/s]

Number of unique words:         13745
Length of the longest sentence: 48





In [12]:
# create tokenizer
#X_train = map(str, X_train)
#X_val = map(str, X_val)
tokenizer = Tokenizer(num_words=len(list(unique_words)))
tokenizer.fit_on_texts(list(X_train))

In [13]:
# pass subsets through tokenizer
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(test_sentences)

In [14]:
# padding done to equalize the lengths of all input reviews. LSTM networks needs all inputs to be same length.
# Therefore reviews lesser than max length will be made equal using extra zeros at end. This is padding.
X_train = pad_sequences(X_train, maxlen=len_max)
X_val = pad_sequences(X_val, maxlen=len_max)
X_test = pad_sequences(X_test, maxlen=len_max)

print(X_train.shape, X_val.shape, X_test.shape)

(140454, 48) (15606, 48) (66291, 48)


In [15]:
# function that create model
def create_model(architecture, optimizer, lr):
    '''
    architecture - choose among 4 possible model architectures:
                   'LSTM-GRU', 'LSTM', 'GRU', 'RNN'
    optimizer - choose among 5 possible optimizers:
                'Adam', 'RMSprop', 'SGD', 'Momentum', 'Adadelta'
    lr - choose learning rate for optimizer
    '''
    model = Sequential()
    model.add(Embedding(len(list(unique_words)), 50, input_length=len_max)),
    # choose model qrchitecture
    if architecture == 'LSTM-GRU':
        model.add(LSTM(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)),
        model.add(GRU(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=False)),
    elif architecture == 'LSTM':
        model.add(LSTM(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)),
        model.add(LSTM(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=False)),
    elif architecture == 'GRU':
        model.add(GRU(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)),
        model.add(GRU(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=False)),
    elif architecture == 'RNN':
        model.add(SimpleRNN(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)),
        model.add(SimpleRNN(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=False)),
    # add last dense layers
    model.add(Dense(len(list(unique_words)), activation='relu')),
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    # compile model and print summary
    if optimizer == 'Adam':
        model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=lr), metrics=['accuracy'])
    elif optimizer == 'RMSprop':
        model.compile(loss='categorical_crossentropy', optimizer=RMSprop(learning_rate=lr), metrics=['accuracy'])
    elif optimizer == 'SGD':
        model.compile(loss='categorical_crossentropy', optimizer=SGD(learning_rate=lr), metrics=['accuracy'])
    elif optimizer == 'Momentum':
        model.compile(loss='categorical_crossentropy', optimizer=SGD(learning_rate=lr, momentum=0.9), metrics=['accuracy'])
    elif optimizer == 'Adadelta':
        model.compile(loss='categorical_crossentropy', optimizer=Adadelta(learning_rate=lr), metrics=['accuracy'])
    elif optimizer == 'Adagrad':
        model.compile(loss='categorical_crossentropy', optimizer=Adagrad(learning_rate=lr), metrics=['accuracy'])
    # print summary and return model
    model.summary()
    return model

In [16]:
# function for plotting accuraccy
def plot_accuracies(history):
    acc=history.history['accuracy']
    val_acc=history.history['val_accuracy']
    loss=history.history['loss']
    val_loss=history.history['val_loss']

    epochs=range(len(acc)) # Get number of epochs

    plt.plot(epochs, acc, 'r')
    plt.plot(epochs, val_acc, 'b')
    plt.title('Training and testing accuracy')
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend(['Training', 'testing'])
    plt.show()
    print("")

    plt.plot(epochs, loss, 'r')
    plt.plot(epochs, val_loss, 'b')
    plt.title('Training and testing loss')
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend(['Training', 'testing'])
    plt.show()

In [18]:
# create inital model to perform learning rate tuning
# Adagrad is used for now, later will be discarded for other optimizer
model_name = 'learning_rate_0dot01'
model = create_model(architecture='LSTM-GRU',
                                    optimizer='Adagrad',
                                    lr=0.01)
# checkpoint callback to save best model during hyper-parameters tuning step
checkpoint = ModelCheckpoint(f"hyperpara_tuning\\{model_name}.keras", monitor='val_accuracy', save_best_only=True, mode='max')
# fit model
history = model.fit(X_train, 
                    y_train, 
                    validation_data=(X_val, y_val), 
                    epochs=10, 
                    batch_size=16,
                    callbacks=[checkpoint],
                    verbose=1)
# save history
with open(f"hyperpara_tuning\\{model_name}_history.pkl", 'wb') as file_pi:
    pickle.dump(history.history, file_pi)
# display results
plot_accuracies(history)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 48, 50)            687250    
                                                                 
 lstm_1 (LSTM)               (None, 48, 64)            29440     
                                                                 
 gru_1 (GRU)                 (None, 64)                24960     
                                                                 
 dense_2 (Dense)             (None, 13745)             893425    
                                                                 
 dropout_1 (Dropout)         (None, 13745)             0         
                                                                 
 dense_3 (Dense)             (None, 5)                 68730     
                                                                 
Total params: 1,703,805
Trainable params: 1,703,805
No

NameError: name 'pickle' is not defined

In [None]:
model_name = 'learning_rate_0dot005'
model = create_model(architecture='LSTM-GRU',
                                    optimizer='Adagrad',
                                    lr=0.005)
# checkpoint callback to save best model during hyper-parameters tuning step
checkpoint = ModelCheckpoint(f"hyperpara_tuning\\{model_name}.keras", monitor='val_accuracy', save_best_only=True, mode='max')
# fit model
history = model.fit(X_train, 
                    y_train, 
                    validation_data=(X_val, y_val), 
                    epochs=10, 
                    batch_size=16,
                    callbacks=[checkpoint],
                    verbose=1)
# save history
with open(f"hyperpara_tuning\\{model_name}_history.pkl", 'wb') as file_pi:
    pickle.dump(history.history, file_pi)
# display results
plot_accuracies(history)

In [None]:
model_name = 'learning_rate_0dot0025'
model = create_model(architecture='LSTM-GRU',
                                    optimizer='Adagrad',
                                    lr=0.0025)
# checkpoint callback to save best model during hyper-parameters tuning step
checkpoint = ModelCheckpoint(f"hyperpara_tuning\\{model_name}.keras", monitor='val_accuracy', save_best_only=True, mode='max')
# fit model
history = model.fit(X_train, 
                    y_train, 
                    validation_data=(X_val, y_val), 
                    epochs=10, 
                    batch_size=16,
                    callbacks=[checkpoint],
                    verbose=1)
# save history
with open(f"hyperpara_tuning\\{model_name}_history.pkl", 'wb') as file_pi:
    pickle.dump(history.history, file_pi)
# display results
plot_accuracies(history)

In [None]:
model_name = 'learning_rate_0dot001'
model = create_model(architecture='LSTM-GRU',
                                    optimizer='Adagrad',
                                    lr=0.001)
# checkpoint callback to save best model during hyper-parameters tuning step
checkpoint = ModelCheckpoint(f"hyperpara_tuning\\{model_name}.keras", monitor='val_accuracy', save_best_only=True, mode='max')
# fit model
history = model.fit(X_train, 
                    y_train, 
                    validation_data=(X_val, y_val), 
                    epochs=10, 
                    batch_size=16,
                    callbacks=[checkpoint],
                    verbose=1)
# save history
with open(f"hyperpara_tuning\\{model_name}_history.pkl", 'wb') as file_pi:
    pickle.dump(history.history, file_pi)
# display results
plot_accuracies(history)