In [2]:
import math
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2Model
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [3]:
train_df.columns

Index(['review', 'sentiment'], dtype='object')

In [4]:
X = train_df['review']
y = train_df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [5]:
X_train

4896     When will people learn that some movies are ma...
12778    This story starts at the end So the film's ope...
23856    There are some movies that are loved by almost...
19550    This movie felt so real. I actually felt all o...
19500    This review is based on the Producer's Cut 'Ha...
                               ...                        
15480    J niksen vuosi is one of Jarva's most politica...
15952    First of all - I hardly ever watch Swedish mov...
13781    If Monte Hellman's legendary early 's road mov...
3441     One of my all-time favorite so-laughably-lousy...
17513    This film had so much promise. I was very exci...
Name: review, Length: 22500, dtype: object

In [6]:
MAX_LENGTH = math.ceil((X_train.apply(lambda x: len(str(x).split())).mean()))+2
MAX_LENGTH

231

In [7]:
PAD_TOKEN = "<|pad|>"
EOS_TOKEN = "<|endoftext|>"

# this will download and initialize the pre trained tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2",
    pad_token=PAD_TOKEN,
    eos_token=EOS_TOKEN,
    max_length=MAX_LENGTH,
    is_split_into_words=True)

In [8]:
X_train = [str(ex) + EOS_TOKEN for ex in X_train]
X_test = [str(ex) + EOS_TOKEN for ex in X_test]

In [9]:
X_train_ = [tokenizer(str(x), return_tensors='tf', max_length=MAX_LENGTH, truncation=True, pad_to_max_length=True, add_special_tokens=True)['input_ids'] for x in X_train]
X_test_ = [tokenizer(str(x), return_tensors='tf', max_length=MAX_LENGTH, truncation=True, pad_to_max_length=True, add_special_tokens=True)['input_ids'] for x in X_test]

X_train_in = tf.squeeze(tf.convert_to_tensor(X_train_), axis=1)
X_test_in = tf.squeeze(tf.convert_to_tensor(X_test_), axis=1)

2024-02-25 12:39:07.143537: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-25 12:39:07.145563: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-25 12:39:07.146690: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-25 12:39:07.148231: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [10]:
X_train_in[0]

<tf.Tensor: shape=(231,), dtype=int32, numpy=
array([ 2215,   481,   661,  2193,   326,   617,  6918,   389,   925,
         329,  1257,   290,   389,   407,  6646,   503,   284,  1487,
         262,   995,  1002,   345, 18996,   428,   788,  1607,   284,
         423,   339,  1686,   286,  1257,   981,  4964,   366, 17798,
         290, 11396,   338, 33418,  7002,   526,   770,   318,   257,
        3807,   326,   318,   339,  1686,   286,  1257,   284,  2342,
          11,  3873, 42357,   290,  4422,   787,   257,  1049,   319,
        3159,  1074,   302, 14619,   511,  3435,   422,   366, 17798,
         290, 11396,   338,  6275,  8855,     1,   351,   772,   517,
         705,  7635,     6,   788,   484,   550,   287,   336,  3807,
          13,   632,   338,   407, 10701,  3783,   475,   340,   338,
        1049,   329,   257,  6487,    11,   262,  3435,   852,  4457,
         588,    12,   540,   290,   262,  1621,    12,  1370,   852,
         523,  7702,   345,   423,   284,  6

In [11]:
X_train_mask_ = [tokenizer(str(x), return_tensors='tf', max_length=MAX_LENGTH, truncation=True, pad_to_max_length=True, add_special_tokens=True)["attention_mask"] for x in X_train]
X_test_mask_ = [tokenizer(str(x), return_tensors='tf', max_length=MAX_LENGTH, truncation=True, pad_to_max_length=True, add_special_tokens=True)["attention_mask"] for x in X_test]

X_train_mask = tf.squeeze(tf.convert_to_tensor(X_train_mask_), axis=1)
X_test_mask = tf.squeeze(tf.convert_to_tensor(X_test_mask_), axis=1)

In [12]:
model = TFGPT2Model.from_pretrained("gpt2", use_cache=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id)
model.training = True

2024-02-25 12:40:13.361492: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
All PyTorch model weights were used when initializing TFGPT2Model.

All the weights of TFGPT2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


In [13]:
model.resize_token_embeddings(len(tokenizer))

<keras.layers.embeddings.Embedding at 0x7efdc0568550>

In [14]:
for layer in model.layers:
    layer.trainable = False

In [15]:
model.summary()

Model: "tfgpt2_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 124440576 
 r)                                                              
                                                                 
Total params: 124,440,576
Trainable params: 0
Non-trainable params: 124,440,576
_________________________________________________________________


In [16]:
input = tf.keras.layers.Input(shape=(None,), dtype='int32')
mask = tf.keras.layers.Input(shape=(None,), dtype='int32')
x = model(input, attention_mask=mask)
#x = x.last_hidden_state[:, -1]
x = tf.reduce_mean(x.last_hidden_state, axis=1)
x = tf.keras.layers.Dense(16, activation='relu')(x)
x = tf.keras.layers.Dropout(0.3)(x)
output = tf.keras.layers.Dense(2, activation='softmax')(x)

In [17]:
clf = tf.keras.Model([input, mask], output)
clf.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 tfgpt2_model (TFGPT2Model)     TFBaseModelOutputWi  124440576   ['input_1[0][0]',                
                                thPastAndCrossAtten               'input_2[0][0]']                
                                tions(last_hidden_s                                               
                                tate=(None, None, 7                                           

In [18]:
base_learning_rate = 0.0005
optimizer=tf.keras.optimizers.Adam(learning_rate=base_learning_rate)
loss=tf.keras.losses.BinaryCrossentropy()
#loss=tf.keras.losses.SparseCategoricalCrossentropy()

clf.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [19]:
callbacks = tf.keras.callbacks.EarlyStopping(
        monitor="accuracy", verbose=1, patience=3, restore_best_weights=True)

In [20]:
def map_sentiment(value):
  if value == 'negative':
    return 0
  if value == 'positive':
    return 1

In [21]:
y_train_ = y_train.map(map_sentiment)
y_test_ = y_test.map(map_sentiment)

In [22]:
y_train_

4896     1
12778    0
23856    1
19550    1
19500    1
        ..
15480    1
15952    1
13781    1
3441     1
17513    0
Name: sentiment, Length: 22500, dtype: int64

In [23]:
y_train_in = tf.constant(y_train_, dtype=tf.int32)
y_test_in = tf.constant(y_test_, dtype=tf.int32)

In [24]:
y_train_in

<tf.Tensor: shape=(22500,), dtype=int32, numpy=array([1, 0, 1, ..., 1, 1, 0], dtype=int32)>

In [25]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)

# Fit and transform the data
y_train_in_one_hot = encoder.fit_transform(y_train_in)



ValueError: Expected 2D array, got 1D array instead:
array=[1 0 1 ... 1 1 0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
from keras import backend as K
K.clear_session()

In [None]:
tf.config.experimental_run_functions_eagerly(True)


In [None]:
history = clf.fit([X_train_in, X_train_mask], y_train_in_one_hot, epochs=5, batch_size=8, validation_split=0.2, callbacks=callbacks)