In [2]:
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

[nltk_data] Downloading package punkt to /Users/Cyrille/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Cyrille/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Cyrille/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2023-12-19 21:58:50.627773: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Read data

In [3]:
train_df = pd.read_csv('../../data/imdb_train_data_small.csv')
test_df = pd.read_csv('../../data/imdb_test_data_small.csv')

In [4]:
test_df

Unnamed: 0,text,label
0,This movie has bad writing and bad editing. It...,0
1,I'm still laughing- Not! I'm still asking my m...,0
2,While I'm normally a big fan of John Turturro'...,0
3,<br /><br />The author tried to make a Kevin S...,0
4,Oh boy ! It was just a dream ! What a great id...,0
...,...,...
295,My wife and I struggle to find movies like thi...,1
296,"While watching this film recently, I constantl...",1
297,Trust the excellent and accurate Junagadh75 re...,1
298,Valley Girl is an exceptionally well made film...,1


In [14]:
train_df

Unnamed: 0,text,label
0,I rated this a 3. The dubbing was as bad as I ...,0
1,"<br /><br />Cheap-looking and ugly, this film ...",0
2,This film concerns purportedly non-establishme...,0
3,Ho-hum. An inventor's(Horst Buchholz)deadly bi...,0
4,"Definitely not worth the rental, but if you ca...",0
...,...,...
695,This has to be the funniest stand up comedy I ...,1
696,. . . is just as good as the original. Very ne...,1
697,"A quite good film version of the novel, though...",1
698,Maybe the greatest film ever about jazz.<br />...,1


In [15]:
train_df["label"].unique()

array([0, 1])

# Tokenization

Create your own tokenization algorithm. Remember to handle upper/lower case, comma, punctioation and so on.
Each word should hava an integer connected to it. Word as key and integer as value in a dict is one way to do it.

Tensorflow have tokenization models, but try to bild it yourself.

In [5]:
import re
def tokenize(dataset: pd.DataFrame):

    text = dataset["text"]
    text = text.str.lower()
    text = text.str.replace(r'[^a-z0-9\s]', '', regex=True)
    
    words_series = text.str.split()
    
    token_map = {"<UNK>": 0}
    reverse_token_map = {0: "<UNK>"}
    next_token = 1
    
    for word_list in words_series:
        for word in word_list:
            if word not in token_map:
                token_map[word] = next_token
                reverse_token_map[next_token] = word
                next_token += 1
    
    return token_map, reverse_token_map

In [6]:
token_map, reverse_token_map = tokenize(train_df)

In [7]:
token_map

{'<UNK>': 0,
 'i': 1,
 'rated': 2,
 'this': 3,
 'a': 4,
 '3': 5,
 'the': 6,
 'dubbing': 7,
 'was': 8,
 'as': 9,
 'bad': 10,
 'have': 11,
 'seen': 12,
 'plot': 13,
 'yuck': 14,
 'im': 15,
 'not': 16,
 'sure': 17,
 'which': 18,
 'ruined': 19,
 'movie': 20,
 'more': 21,
 'jet': 22,
 'li': 23,
 'is': 24,
 'definitely': 25,
 'great': 26,
 'martial': 27,
 'artist': 28,
 'but': 29,
 'ill': 30,
 'stick': 31,
 'to': 32,
 'jackie': 33,
 'chan': 34,
 'movies': 35,
 'until': 36,
 'somebody': 37,
 'tells': 38,
 'me': 39,
 'jets': 40,
 'english': 41,
 'up': 42,
 'par': 43,
 'br': 44,
 'cheaplooking': 45,
 'and': 46,
 'ugly': 47,
 'film': 48,
 'didnt': 49,
 'even': 50,
 'seem': 51,
 'entertain': 52,
 'kids': 53,
 'in': 54,
 'audience': 55,
 'except': 56,
 'for': 57,
 'one': 58,
 'fairly': 59,
 'amusing': 60,
 'toilet': 61,
 'joke': 62,
 'christopher': 63,
 'lloyd': 64,
 'way': 65,
 'past': 66,
 'his': 67,
 'prime': 68,
 'actually': 69,
 'quite': 70,
 'tiresome': 71,
 'role': 72,
 'although': 73,
 'so

# Remove stopwords

In [8]:
stop_words = set(stopwords.words('english'))

In [9]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [10]:
def remove_stopwords(text):
    words = text.split()
    fltered_words = [word for word in words if word not in stop_words]
    filtered_text = ' '.join(fltered_words)
    return filtered_text

In [11]:
example_text = "This is a sample sentence, showing off the stop words filtration."
remove_stopwords(example_text)

'This sample sentence, showing stop words filtration.'

# Lemmatization

In [12]:
lemmatizer = WordNetLemmatizer()

In [13]:
lemmatizer.lemmatize("house"), lemmatizer.lemmatize("houses"), lemmatizer.lemmatize("housing"), lemmatizer.lemmatize("housed")

('house', 'house', 'housing', 'housed')

In [None]:
def lemmatize(word):
    # lemmatize word without using lemmatizer 
    
    return word

# Word embedding and sentiment analysis model
We want to create a model that can say if a movie review is bad or good.

- Preprocess the text
- Convert text to seqiuence of integers
- Create architecture that includes embeddings
- Build and train your models
- Evaluate preformance

Building models from scratch is not something you usually do, but those who would like to dig deeper into the math behind Simple RNN, LSTM and GRU can do it by creating the cells from scratch.

In [None]:
def pad_data(embedded_text):
    # All sentences should be of the same lenght, but if a sentence is shorter than the longest, pad it.
    return padded_text

## RNN with tensorflow modules
[Simple RNN cell](https://www.tensorflow.org/api_docs/python/tf/keras/layers/SimpleRNN)

[Embedding](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding)

In [None]:
def build_rnn_model():
    return model

## RNN from scratch

In [None]:
class RNNCell(tf.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.Wxh =
        self.Whh =
        self.bh =

    def __call__(self, x, h):
        h_next = 
        return h_next

In [None]:
# RNN Model Class
class MyRNNModel(tf.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim=1, sequence_length=100):
        super().__init__()
        self.embedding =
        self.rnn_cell = RNNCell(embedding_dim, hidden_dim)
        self.Why = 
        self.by = 

    def __call__(self, x):
        x = 
        h = 

        # Process the input sequence
        for t in range(sequence_length):
            x_t = x[:, t, :]
            h = self.rnn_cell(x_t, h)

        y = 
        return tf.sigmoid(y)

In [None]:
def train_step(model, inputs, targets):
    clip_norm = 1.0
    with tf.GradientTape() as tape:
        predictions = model(inputs)
        loss = loss_function(targets, predictions)
        
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(model.trainable_variables)
    return loss

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((padded_train_data, y)).batch(batch_size)
for epoch in range(NUM_EPOCHS):
    epoch_loss = 0
    epoch_accuracy = 0
    total_batches = 0

    for batch_inputs, batch_targets in train_dataset:
        loss = train_step(model, batch_inputs, batch_targets)
        epoch_loss += loss.numpy()

        # Calculate accuracy
        predictions = model(batch_inputs)
        accuracy = calculate_accuracy(batch_targets, predictions)
        epoch_accuracy += accuracy.numpy()

        total_batches += 1

    avg_loss = epoch_loss / total_batches
    avg_accuracy = epoch_accuracy / total_batches
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.4f}')

## LSTM

[LSTM Cell](https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTMCell)

In [None]:
def build_lstm_model():
    return model

## LSTM from scrtch

In [None]:
# LSTM Cell Class
class LSTMCell(tf.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        # Gates: input, forget, cell, output
        self.Wi =
        self.Wf =
        self.Wc =
        self.Wo =
        self.bi =
        self.bf =
        self.bc =
        self.bo =

    def __call__(self, x, h, c):
        combined = tf.concat([x, h], 1)

        i = 
        f = 
        o = 
        c_ = 

        c_new = 
        h_new =

        return h_new, c_new

In [None]:
# LSTM Model Class
class MyLSTMModel(tf.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding =
        self.lstm_cell = LSTMCell(embedding_dim, hidden_dim)
        self.Why =
        self.by =

    def __call__(self, x):
        x =
        h =
        c =

        for t in range(sequence_length):
            x_t = x[:, t, :]
            h, c = self.lstm_cell(x_t, h, c)

        y =
        return tf.sigmoid(y)

In [None]:
def train_step(model, inputs, targets):
    clip_norm = 1.0
    with tf.GradientTape() as tape:
        predictions = model(inputs)
        loss = loss_function(targets, predictions)
        
    gradients = tape.gradient(loss, model.trainable_variables)
    clipped_gradients = [tf.clip_by_norm(g, clip_norm) for g in gradients]
    optimizer.apply_gradients(model.trainable_variables)
    return loss

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((padded_train_data, y)).batch(batch_size)
for epoch in range(NUM_EPOCHS):
    epoch_loss = 0
    epoch_accuracy = 0
    total_batches = 0

    for batch_inputs, batch_targets in train_dataset:
        loss = train_step(model, batch_inputs, batch_targets)
        epoch_loss += loss.numpy()

        # Calculate accuracy
        predictions = model(batch_inputs)
        accuracy = calculate_accuracy(batch_targets, predictions)
        epoch_accuracy += accuracy.numpy()

        total_batches += 1

    avg_loss = epoch_loss / total_batches
    avg_accuracy = epoch_accuracy / total_batches
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.4f}')

## GRU
[GRU Cell](https://www.tensorflow.org/api_docs/python/tf/keras/layers/GRUCell)

In [None]:
def build_gru_model():
    return model

## GRU from scratch

In [None]:
# GRU Cell Class
class GRUCell(tf.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        # Update gate parameters
        self.Wz =
        self.bz =

        # Reset gate parameters
        self.Wr =
        self.br =

        # Candidate hidden state parameters
        self.Wh =
        self.bh =
        
    def __call__(self, x, h):
        combined = tf.concat([x, h], 1)

        # Update gate
        z =

        # Reset gate
        r =

        # Candidate hidden state
        combined_reset =
        h_candidate =

        # New hidden state
        h_new =

        return h_new

In [None]:
# GRU Model Class
class MyGRUModel(tf.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding =
        self.gru_cell =
        self.Why =
        self.by =

    def __call__(self, x):
        x =
        h =

        for t in range(sequence_length):
            x_t = x[:, t, :]
            h = self.gru_cell(x_t, h)

        y =
        return tf.sigmoid(y)

In [None]:
def train_step(model, inputs, targets):
    clip_norm = 1.0
    with tf.GradientTape() as tape:
        predictions = model(inputs)
        loss = loss_function(targets, predictions)
        
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(model.trainable_variables)
    return loss

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((padded_train_data, y)).batch(batch_size)
for epoch in range(NUM_EPOCHS):
    epoch_loss = 0
    epoch_accuracy = 0
    total_batches = 0

    for batch_inputs, batch_targets in train_dataset:
        loss = train_step(model, batch_inputs, batch_targets)
        epoch_loss += loss.numpy()

        # Calculate accuracy
        predictions = model(batch_inputs)
        accuracy = calculate_accuracy(batch_targets, predictions)
        epoch_accuracy += accuracy.numpy()

        total_batches += 1

    avg_loss = epoch_loss / total_batches
    avg_accuracy = epoch_accuracy / total_batches
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.4f}')