In [15]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import tensorflow as tf
import sentencepiece as spm

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix

from tokenization_and_embedding import TokenAndPositionEmbedding
from transformer_block import TransformerBlock

# Dataset https://www.kaggle.com/code/aadyasingh55/model-training-of-tweet-emotion-classification

In [16]:
df = pd.read_parquet('data.parquet')

print(df.head())
print(df['label'])

# Map the labels to emotion names for better readability (Optional)
emotion_map = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
df['emotion'] = df['label'].map(emotion_map)

# Preview the updated dataframe
print(df[['text', 'emotion']].head())

                                                text  label
0  i feel awful about it too because it s my job ...      0
1                              im alone i feel awful      0
2  ive probably mentioned this before but i reall...      1
3           i was feeling a little low few days back      0
4  i beleive that i am much more sensitive to oth...      2
0         0
1         0
2         1
3         0
4         2
         ..
416804    1
416805    4
416806    0
416807    1
416808    0
Name: label, Length: 416809, dtype: int64
                                                text  emotion
0  i feel awful about it too because it s my job ...  sadness
1                              im alone i feel awful  sadness
2  ive probably mentioned this before but i reall...      joy
3           i was feeling a little low few days back  sadness
4  i beleive that i am much more sensitive to oth...     love


In [17]:
### Constants
EIGHT_THOUSAND = 8000
SIXTEEN_THOUSAND = 16000
TRAIN_TEXT = 'train_text.txt'
LABEL_FILE = 'train_labels.txt'
SAMPLE_TEXT = "This is a sample sentence used for BPE bits. It can be up to 128 characters long."
TOKEN_LENGTH = 128

In [18]:
#BASIC PRE PROCESSING
print(df.shape)
print(df.columns)
print(f"Data Types: {df.dtypes}")
print(f"Empty values {df.isna().sum()}")
print(f"Duplicates: {df.duplicated().sum()}")
print(df["label"].value_counts())
print(df["label"].value_counts(normalize=True).round(3))

(416809, 3)
Index(['text', 'label', 'emotion'], dtype='object')
Data Types: text       object
label       int64
emotion    object
dtype: object
Empty values text       0
label      0
emotion    0
dtype: int64
Duplicates: 686
label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64
label
1    0.338
0    0.291
3    0.138
4    0.114
2    0.083
5    0.036
Name: proportion, dtype: float64


In [19]:
from sklearn.model_selection import train_test_split

X = df["text"].astype(str)
y = df["label"].astype(int)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

# Second split to obtain validation as well as test set
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Feature Distribution (After Stratification)")
print(f"Train(%): {y_train.value_counts(normalize=True).round(5).sort_index() * 100}\n")
print(f"Val(%): {y_val.value_counts(normalize=True).round(5).sort_index() * 100}\n")
print(f"Test(%): {y_test.value_counts(normalize=True).round(5).sort_index() * 100}\n")

Feature Distribution (After Stratification)
Train(%): label
0    29.075
1    33.844
2     8.290
3    13.752
4    11.447
5     3.592
Name: proportion, dtype: float64

Val(%): label
0    29.076
1    33.845
2     8.289
3    13.752
4    11.446
5     3.592
Name: proportion, dtype: float64

Test(%): label
0    29.076
1    33.845
2     8.292
3    13.750
4    11.446
5     3.592
Name: proportion, dtype: float64



In [20]:
text_lengths = X_train.str.split().apply(len)
print(text_lengths.describe())

KeyboardInterrupt: 

In [None]:
X_train_clean = X_train.str.strip()
X_val_clean = X_val.str.strip()
X_test_clean = X_test.str.strip()

X_train_clean.to_csv('train_text.txt', index=False, header=False)
print(f"Wrote {len(X_train_clean)} to file")

Wrote 333447 to file


In [None]:
### Byte Pair Encoding (BPE)
spm.SentencePieceTrainer.train(
    input=TRAIN_TEXT,
    model_prefix='m_bpe',
    vocab_size=EIGHT_THOUSAND,
    model_type='bpe'
)

sp_bpe = spm.SentencePieceProcessor()
sp_bpe.load('m_bpe.model')

print(f"Vocab size: {sp_bpe.get_piece_size()}")
print(f"BPE Pieces: {sp_bpe.encode(SAMPLE_TEXT, out_type=str)[:20]}")

def encode_texts(sp, texts):
  """
  Encode text, create token IDs and attention masks
  """
  input_ids = []
  attention_masks = []

  for text in texts:
    ids = sp.encode(text, out_type=int)

    # reduce tokens to 128 (we have some at 178, but minimal)
    ids = ids[:TOKEN_LENGTH]
    attention_mask = [1] * len(ids)
    
    # pad to max_len
    pad_id = 3
    while len(ids) < TOKEN_LENGTH:
      ids.append(pad_id)
      ## add padding to the attention mask to ensure each token is 128 bits
      attention_mask.append(0)

    input_ids.append(ids)
    attention_masks.append(attention_mask)

  return np.array(input_ids, dtype=np.int32), np.array(attention_masks, dtype=np.int32)


Vocab size: 8000
BPE Pieces: ['‚ñÅ', 'T', 'h', 'is', '‚ñÅis', '‚ñÅa', '‚ñÅsam', 'ple', '‚ñÅsentence', '‚ñÅused', '‚ñÅfor', '‚ñÅ', 'BPE', '‚ñÅbits', '.', '‚ñÅ', 'I', 't', '‚ñÅcan', '‚ñÅbe']


In [None]:
### BPE Generate data

Xb_train, att_b_mask_train = encode_texts(sp_bpe, X_train_clean.tolist())
Xb_val, att_b_mask_val = encode_texts(sp_bpe, X_val_clean.tolist())
Xb_test, att_b_mask_test = encode_texts(sp_bpe, X_test_clean.tolist())

print(f"BPE train shape: {Xb_train.shape, att_b_mask_train.shape}")
print(f"BPE val shape: {Xb_val.shape, att_b_mask_val.shape}")
print(f"BPE test shape: {Xb_test.shape, att_b_mask_test.shape}")


BPE train shape: ((333447, 128), (333447, 128))
BPE val shape: ((41681, 128), (41681, 128))
BPE test shape: ((41681, 128), (41681, 128))


In [None]:
### BPE TensorFlow input 
batch_size = 64

def make_dataset(X_ids, X_mask, y):
  data = tf.data.Dataset.from_tensor_slices(((X_ids, X_mask), y))
  data = data.shuffle(10000, reshuffle_each_iteration=True)
  data = data.batch(batch_size).prefetch(tf.data.AUTOTUNE)
  return data

bpe_train_data = make_dataset(Xb_train, att_b_mask_train, y_train.values)

# tf.data.AUTOTUNE allows TensorFlow to automatically determine the optimal number of parallel calls for data loading and preprocessing
bpe_val_data = tf.data.Dataset.from_tensor_slices(((Xb_val, att_b_mask_val), y_val.values)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
bpe_test_data = tf.data.Dataset.from_tensor_slices(((Xb_test, att_b_mask_test), y_test.values)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [None]:
### BPE - Balance classes
classes = np.sort(y_train.unique())
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

class_weights = {int(k): float(v) for k, v in zip(classes, weights)}
print(f"Class weights: {class_weights}")


Class weights: {0: 0.5732343809631868, 1: 0.4924503557725537, 2: 2.010436638570343, 3: 1.2119880490251669, 4: 1.455973277443018, 5: 4.639714476540324}


In [None]:
### BPE Model Definition
def build_transformer_classifier(vocab_size, num_classes, max_len=128, num_heads=4, feed_forward_dim=256, rate=0.1):
    input_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
    
    embedding_layer = TokenAndPositionEmbedding(max_len, vocab_size, embed_dim=128)
    X = embedding_layer(input_ids)
    X = TransformerBlock(embed_dim=128, num_heads=num_heads, feed_forward_dim=feed_forward_dim, rate=rate)(X, training=True, mask=input_mask)

    X = tf.keras.layers.GlobalAveragePooling1D()(X)
    X = tf.keras.layers.Dropout(rate)(X)
    
    output = tf.keras.layers.Dense(num_classes, activation='softmax')(X)
    
    model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=output)
    return model


In [None]:
### BPE Tokenizer and Vocab size

num_classes = y.nunique()

bpe_vocab_size = sp_bpe.get_piece_size()

print(f"BPE Vocab size: {bpe_vocab_size}")
print(f"Number of classes: {num_classes}")

BPE Vocab size: 8000
Number of classes: 6


In [21]:
### BPE Train ü•≥ü•≥ü•≥

bpe_model = build_transformer_classifier(vocab_size=bpe_vocab_size, num_classes=num_classes, max_len=TOKEN_LENGTH)

bpe_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)]

history = bpe_model.fit(bpe_train_data, validation_data=bpe_val_data, epochs=5, class_weight=class_weights, callbacks=callbacks)


Epoch 1/5




[1m5211/5211[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m946s[0m 181ms/step - accuracy: 0.8729 - loss: 0.2798 - val_accuracy: 0.9050 - val_loss: 0.1954
Epoch 2/5
[1m5211/5211[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m877s[0m 168ms/step - accuracy: 0.9051 - loss: 0.1762 - val_accuracy: 0.9089 - val_loss: 0.1857
Epoch 3/5
[1m5211/5211[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m867s[0m 166ms/step - accuracy: 0.9079 - loss: 0.1681 - val_accuracy: 0.9053 - val_loss: 0.2020
Epoch 4/5
[1m5211/5211[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m849s[0m 163ms/step - accuracy: 0.9111 - loss: 0.1606 - val_accuracy: 0.9056 - val_loss: 0.1942
Epoch 5/5
[1m5211/5211[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m852s[0m 164ms/step - accuracy: 0.9143 - loss: 0.1547 - val_accuracy: 0

In [28]:
# Predict
y_pred_probs = bpe_model.predict(bpe_test_data)
y_pred = np.argmax(y_pred_probs, axis=1)

print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))


[1m652/652[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m35s[0m 53ms/step
              precision    recall  f1-score   support

           0     0.9713    0.9228    0.9465     12119
           1     0.9839    0.8724    0.9248     14107
           2     0.7080    0.9835    0.8233      3456
           3     0.9017    0.9204    0.9110      5731
           4     0.8605    0.8795    0.8699      4771
           5     0.6731    0.9586    0.7909      1497

    accuracy                         0.9068     41681
   macro avg     0.8498    0.9229    0.8777     41681
weighted avg     0.9208    0.9068    0.9097     41681

[[11184    88    85   413   308    41]
 [  144 12307  1250    97    95   214]
 [   17    14  3399     8     4    14]
 [   92    52    39  5275   264     9]
 [   60    28    20    48  4196   419]
 [   17    19     8     9     9  1435]]
