In [46]:
import time

import pandas as pd
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

import nltk
from nltk.corpus import wordnet 
import string
import re
from nltk.stem import WordNetLemmatizer 
import tensorflow as tf

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    Embedding,
    Input,
    Flatten,
    Conv1D,
    MaxPooling1D,
    concatenate,
    Conv2D,
    MaxPooling2D
)


from tensorflow.keras.utils import plot_model
from tensorflow.keras import utils

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence


from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.regularizers import l2



In [3]:
train = pd.read_csv('/Users/dianaterraza/Desktop/NLP/Data/train_processed.csv', on_bad_lines='skip')
test = pd.read_csv('/Users/dianaterraza/Desktop/NLP/Data/test_processed.csv', on_bad_lines= 'skip')

In [5]:
train.head(3)

Unnamed: 0,review,sentiment,word_counts,review_processed,sentiment_encoded
0,If you are looking for a movie that doesn't ta...,positive,271,look movie doesnt take seriously haggard must ...,1
1,"THE BEAVER TRILOGY is, without a doubt, one of...",positive,217,beaver trilogy without doubt one brilliant fil...,1
2,"If Andrei Tarkovsky had been a hack, he would ...",negative,118,andrei tarkovsky hack would direct mother son ...,0


In [19]:
def create_tokenizer(corpus, max_len):
    tokenizer = Tokenizer(char_level=False, # if i put true the tokens are characters and we want words
                          oov_token = True) 
    tokenizer.fit_on_texts(corpus)
    
    return tokenizer 

## Encode the text into the sequence 

def encode_text(tokenizer, corpus, max_len):
    encoded = tokenizer.texts_to_sequences(corpus) # this will encode the text into the sequence    
    
    padded = sequence.pad_sequences(encoded, maxlen=max_len, padding='post')  # we add 0 to the words 
    # the sequence to the same length 
    
    return padded 
    

In [20]:
train['length'] = train['review_processed'].apply(len)
test['length'] = test['review_processed'].apply(len)

In [21]:
train.head(3)

Unnamed: 0,review,sentiment,word_counts,review_processed,sentiment_encoded,length
0,If you are looking for a movie that doesn't ta...,positive,271,look movie doesnt take seriously haggard must ...,1,782
1,"THE BEAVER TRILOGY is, without a doubt, one of...",positive,217,beaver trilogy without doubt one brilliant fil...,1,735
2,"If Andrei Tarkovsky had been a hack, he would ...",negative,118,andrei tarkovsky hack would direct mother son ...,0,426


In [22]:
max_len = train['length'].max()

In [23]:
tokenizer = create_tokenizer(corpus=train['review_processed'].tolist(),
                             max_len=max_len)

In [24]:
seq = train['review_processed'].iloc[0].split()
print(seq)

['look', 'movie', 'doesnt', 'take', 'seriously', 'haggard', 'must', 'say', 'write', 'anything', 'see', 'cky', 'camp', 'kill', 'video', 'movie', 'likely', 'wont', 'funny', 'advice', 'watch', 'clip', 'video', 'bam', 'friend', 'make', 'haggard', 'take', 'seriously', 'never', 'purpose', 'throughout', 'movie', 'random', 'moment', 'nothing', 'plot', 'may', 'get', 'annoy', 'nothing', 'control', 'even', 'plot', 'stay', 'focus', 'story', 'ryan', 'dunns', 'character', 'unfold', 'quite', 'nicely', 'plot', 'told', 'base', 'true', 'story', 'partof', 'ryan', 'dunns', 'exgirlfriend', 'brandon', 'dicamillo', 'far', 'best', 'character', 'movie', 'lot', 'talent', 'know', 'make', 'people', 'laugh', 'stole', 'movie', 'ask', 'overall', 'love', 'movie', 'simplicity', 'straight', 'weirdness', 'bam', 'movie', 'people', 'go', 'normal', 'haggard', 'fill', 'hilarious', 'quote', 'friend', 'constantly', 'use', 'since', 'first', 'time', 'saw', 'ive', 'see', 'movie', 'time', 'still', 'find', 'new', 'thing', 'every',

And now we encoded: I take the tokenizer that I train. They are the indices of the words 

In [25]:
seq_encoded = tokenizer.texts_to_sequences([seq])
print(seq_encoded)

[[24, 3, 70, 32, 494, 7401, 129, 26, 91, 152, 8, 10370, 917, 106, 283, 3, 1081, 379, 79, 1645, 14, 1809, 283, 4414, 117, 6, 7401, 32, 494, 50, 925, 370, 3, 1263, 161, 80, 45, 110, 9, 475, 80, 758, 17, 45, 462, 533, 18, 1543, 22346, 13, 3117, 100, 1528, 45, 456, 317, 204, 18, 57216, 1543, 22346, 7839, 7402, 23857, 147, 54, 13, 3, 62, 414, 35, 6, 27, 144, 2989, 3, 422, 339, 37, 3, 3983, 653, 8194, 4414, 3, 27, 15, 1057, 7401, 590, 495, 1650, 117, 1116, 71, 149, 33, 11, 125, 111, 8, 3, 11, 58, 59, 87, 40, 86, 11, 599, 12, 184, 42722, 106, 87, 434, 8978, 34, 15, 3, 193, 1094, 153, 3117, 1307]]


In [26]:
tokenizer.texts_to_sequences([['book']])

[[159]]

In [27]:
X_train = encode_text(tokenizer,
                      corpus = train['review_processed'].tolist(),
                      max_len = max_len)

In [34]:
X_test = encode_text(tokenizer,
                      corpus = test['review_processed'].tolist(),
                      max_len = max_len)

In [35]:
X_train

array([[1188,   20,   35, ...,    0,    0,    0],
       [ 591,   56,   39, ...,    0,    0,    0],
       [7622, 1684, 1693, ...,    0,    0,    0],
       ...,
       [4989,  867,   85, ...,    0,    0,    0],
       [9409,   75,  120, ...,    0,    0,    0],
       [ 110, 1338,  617, ...,    0,    0,    0]], dtype=int32)

The ceros is because we are using padding

In [36]:
tokenizer.word_counts

OrderedDict([('look', 14373),
             ('movie', 74074),
             ('doesnt', 6663),
             ('take', 12859),
             ('seriously', 1479),
             ('haggard', 52),
             ('must', 4635),
             ('say', 14184),
             ('write', 5631),
             ('anything', 4229),
             ('see', 30557),
             ('cky', 31),
             ('camp', 779),
             ('kill', 5202),
             ('video', 2552),
             ('likely', 649),
             ('wont', 1948),
             ('funny', 6254),
             ('advice', 393),
             ('watch', 20400),
             ('clip', 350),
             ('bam', 111),
             ('friend', 4790),
             ('make', 32876),
             ('never', 9618),
             ('purpose', 772),
             ('throughout', 1971),
             ('random', 541),
             ('moment', 3959),
             ('nothing', 6201),
             ('plot', 9842),
             ('may', 5015),
             ('get', 25992),
          

In [37]:
vocab_size = len(tokenizer.word_index) + 1 + 1 # we add 1 for the padding and 1 for the out of vocabulary token


In [38]:
X_train.shape

(12500, 8307)

In [39]:
X_test.shape

(12500, 8307)

In [None]:
'''
def CNN(max_length, vocab_size):
    kernel = 'normal'
    embedding_size = 100 # the size of the vector space in which words will be embedded
    drop_out_value = 0.2 # the fraction of the units to drop for the linear transformation of the inputs
    kernel_size = 6 # the length of the 1D convolution window
    
    model = Sequential(name = 'sequential')
    
    model.add (Embedding(input_dim = vocab_size, 
                         output_dim = embedding_size, 
                         input_length = max_length,
                         name = 'embedding'))
    
    model.add(Conv1D(filters = 64, kernel_size = kernel_size, activation = 'relu', name = 'conv_1'))
    model.add(Dropout(drop_out_value, name = 'dropout_1'))
    model.add(MaxPooling1D(pool_size = 2))
    
    model.add(Conv1D(filters = 256, kernel_size = kernel_size, activation = 'relu', name = 'conv_2'))
    model.add(Dropout(drop_out_value, name = 'dropout_2'))
    model.add(MaxPooling1D(pool_size = 2))
    
    model.add(Flatten())
    
    model.add(Dense(1, activation = 'sigmoid', kernel_initializer = kernel, name = 'dense_2'))
    #sigmoid because is used for a binary classification
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    
    return model
'''

In [None]:
# Define a function to create a Convolutional Neural Network (CNN) for text classification.
# - `max_length`: The length of the input sequences (padded/truncated).
# - `vocab_size`: The size of the vocabulary, including padding and OOV tokens.
def CNN(max_length, vocab_size):
    # Initialize kernel type, embedding size, dropout value, and kernel size.
    kernel = 'normal'               # Kernel initializer for dense layers.
    embedding_size = 100            # Dimension of the word embedding vectors.
    drop_out_value = 0.5            # Dropout rate to reduce overfitting.
    kernel_size = 6                 # Size of the kernel for the convolutional layers.

    # Create a sequential model.
    model = Sequential(name='sequential')

    # Add an embedding layer:
    # - `input_dim=vocab_size`: Vocabulary size for the embedding layer.
    # - `output_dim=embedding_size`: Size of each word embedding vector.
    # - `input_length=max_length`: Length of each input sequence (padded/truncated).
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embedding_size,
                        input_length=max_length,
                        name='embedding'
                        )
    )

    # Add the first 1D convolutional layer:
    # - `filters=64`: Number of output filters (feature maps).
    # - `kernel_size=kernel_size`: Size of the convolutional window.
    # - `activation='relu'`: Activation function to apply non-linearity.
    model.add(Conv1D(filters=64, kernel_size=kernel_size, activation='relu', name='conv_1'))

    # Add the first dropout layer:
    # - `rate=drop_out_value`: Randomly set 50% of input units to 0 during training.
    model.add(Dropout(drop_out_value, name='dropout_1'))

    # Add the first max-pooling layer:
    # - `pool_size=2`: Reduces the dimensionality of the feature maps by taking the maximum value in each pool.
    model.add(MaxPooling1D(pool_size=2))

    # Add the second 1D convolutional layer:
    # - `filters=256`: Number of output filters.
    # - `kernel_size=kernel_size`: Size of the convolutional window.
    # - `activation='relu'`: Activation function.
    model.add(Conv1D(filters=256, kernel_size=kernel_size, activation='relu', name='conv_2'))

    # Add the second dropout layer.
    model.add(Dropout(drop_out_value, name='dropout_2'))

    # Add the second max-pooling layer.
    model.add(MaxPooling1D(pool_size=2))

    # Flatten the output:
    # - Converts the multi-dimensional feature maps into a single vector for the dense layer.
    model.add(Flatten())

    # Add the final output layer:
    # - `units=1`: Single output for binary classification (positive/negative sentiment).
    # - `activation='sigmoid'`: Sigmoid activation to produce probabilities between 0 and 1.
    # - `kernel_initializer=kernel`: Initializer for weights.
    model.add(Dense(1, activation='sigmoid', kernel_initializer=kernel, name='dense_2'))
    
    #sigmoid because is used for a binary classification
    # multiclass classification would use softmax
    # regression would use linear activation

    # Compile the model:
    # - `loss='binary_crossentropy'`: Loss function for binary classification.
    # - `optimizer='adam'`: Adaptive learning rate optimization algorithm.
    # - `metrics=['accuracy']`: Track accuracy during training and evaluation.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Return the compiled model.
    return model


Define the target Variables: 

In [56]:
y_train = train['sentiment_encoded'].values
y_test = test['sentiment_encoded'].values

In [57]:
cnn_model = CNN(max_len, vocab_size)



In [58]:
early_stopping = EarlyStopping(monitor = 'val_loss', 
                               patience = 3,
                               mode = 'max',
                               restore_best_weights = True) # we want to maximize the validation accuracy when we stop earlier

In [59]:
tf.keras.backend.clear_session()

In [60]:
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [61]:
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
y_train = y_train.astype('float32')
y_test = y_test.astype('float32')

In [62]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("Sample in X_train:", X_train[0]) 

X_train shape: (12500, 8307)
y_train shape: (37500,)
Sample in X_train: [1188.   20.   35. ...    0.    0.    0.]


In [63]:
cnn_history = cnn_model.fit(X_train, y_train,
                            epochs = 8,
                            batch_size = 64,
                            validation_split = 0.5,
                            callbacks = [early_stopping])

Epoch 1/8
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 2s/step - accuracy: 0.5009 - loss: 0.9747 - val_accuracy: 0.4891 - val_loss: 0.6942
Epoch 2/8
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 2s/step - accuracy: 0.5981 - loss: 0.6749 - val_accuracy: 0.4986 - val_loss: 0.6961
Epoch 3/8
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 2s/step - accuracy: 0.7535 - loss: 0.5480 - val_accuracy: 0.5059 - val_loss: 0.7365
Epoch 4/8
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 2s/step - accuracy: 0.9654 - loss: 0.1403 - val_accuracy: 0.4974 - val_loss: 0.8691
Epoch 5/8
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m212s[0m 2s/step - accuracy: 0.9947 - loss: 0.0282 - val_accuracy: 0.5030 - val_loss: 0.9883
Epoch 6/8
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 2s/step - accuracy: 0.9988 - loss: 0.0104 - val_accuracy: 0.4930 - val_loss: 1.0947
Epoch 7/8
[1m98/98[0m [32m━━━━━━━━━━━