In [3]:
import pandas as pd
import tensorflow as tf

from notebooks.prepare.mnist_exercises import early_stopping

In [4]:
# Training CSV, addressed relative to the notebook's working directory.
train_csv_path = 'kaggle_sentiment/tweet_sentiment_train.csv'

# NOTE(review): encoding_errors='replace' substitutes U+FFFD for every byte that
# is not valid UTF-8. The column headers printed by df.info() contain such
# replacement characters, so the file is presumably not UTF-8 encoded —
# confirm the real encoding before relying on non-ASCII content.
df = pd.read_csv(
    train_csv_path,
    encoding='utf-8',
    encoding_errors='replace',
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27481 non-null  object 
 1   text              27480 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         27481 non-null  object 
 4   Time of Tweet     27481 non-null  object 
 5   Age of User       27481 non-null  object 
 6   Country           27481 non-null  object 
 7   Population -2020  27481 non-null  int64  
 8   Land Area (Km�)   27481 non-null  float64
 9   Density (P/Km�)   27481 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.1+ MB


In [5]:
# Display the first three rows as the cell output.
df.head(3)

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km�),Density (P/Km�)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18


In [5]:
import string
import re


def custom_standardization(input_data):
    """Lower-case text, replace '<br />' with a space and delete punctuation.

    Args:
        input_data: String input accepted by ``tf.strings.lower``.

    Returns:
        The input after the three ``tf.strings`` operations below have been
        applied in sequence.
    """
    # Character class matching any single character of string.punctuation.
    punctuation_pattern = f'[{re.escape(string.punctuation)}]'

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    text = tf.strings.regex_replace(text, punctuation_pattern, '')
    return text

In [6]:
# df.info() reports 27480 non-null `text` values for 27481 rows. A bare
# astype(str) would turn the missing value into the literal tweet 'nan', so
# replace it with an empty string first. The row count is left unchanged.
df['text'] = df['text'].fillna('').astype(str)

In [7]:
# Integer class ids used as training labels.
sentiment_mapping = {'negative': 2, 'neutral': 0, 'positive': 1}

# Series.replace() only yields integers here through pandas' deprecated silent
# downcasting (it emits a FutureWarning). map() + an explicit astype gives the
# same int64 column without relying on that, and astype raises if a label is
# missing from the mapping instead of passing it through unnoticed.
# The guard keeps the cell re-runnable: map() on already-encoded labels would
# produce NaN.
if not pd.api.types.is_integer_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map(sentiment_mapping).astype('int64')
df[['text', 'sentiment']].head(10)

  df['sentiment'] = df['sentiment'].replace(sentiment_mapping)


Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",0
1,Sooo SAD I will miss you here in San Diego!!!,2
2,my boss is bullying me...,2
3,what interview! leave me alone,2
4,"Sons of ****, why couldn`t they put them on t...",2
5,http://www.dothebouncy.com/smf - some shameles...,0
6,2am feedings for the baby are fun when he is a...,1
7,Soooo high,0
8,Both of you,0
9,Journey!? Wow... u just became cooler. hehe....,1


In [8]:
# Pair each tweet text with its sentiment value: one dataset element per row.
texts, labels = df['text'].values, df['sentiment'].values
dataset = tf.data.Dataset.from_tensor_slices((texts, labels))

In [9]:
def _standardize_example(text, label):
    """Apply custom_standardization to the text; pass the label through unchanged."""
    return custom_standardization(text), label


dataset = dataset.map(_standardize_example)

# Print the first two standardized examples as a quick sanity check.
for text, label in dataset.take(2):
    print(text.numpy())
    print(label.numpy())

b' id have responded if i were going'
0
b' sooo sad i will miss you here in san diego'
2


2025-05-09 22:07:24.919524: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [10]:
# Example counts for the training and validation splits; whatever is left of
# the DataFrame rows is counted as the test split.
train_size, val_size = 22_000, 2_000
test_size = len(df) - (train_size + val_size)

In [21]:
# Shuffle ONCE and freeze the resulting order.
# By default tf.data reshuffles every time the dataset is iterated, so the
# take()/skip() partitions below would contain different examples on each pass
# and training examples would leak into the validation and test splits.
# reshuffle_each_iteration=False makes the three splits disjoint and stable.
# The buffer spans the whole dataset (one element per DataFrame row) so the
# shuffle is uniform, and the seed makes the split reproducible across runs.
dataset = dataset.shuffle(
    buffer_size=len(df),
    seed=42,
    reshuffle_each_iteration=False,
)
train_ds = dataset.take(train_size)
val_ds = dataset.skip(train_size).take(val_size)
test_ds = dataset.skip(train_size + val_size)

In [22]:
def _batch_cache_prefetch(ds, batch_size=128):
    """Batch the dataset, cache the batches, then prefetch with AUTOTUNE."""
    batched = ds.batch(batch_size)
    cached = batched.cache()
    return cached.prefetch(tf.data.AUTOTUNE)


train_ds = _batch_cache_prefetch(train_ds)
val_ds = _batch_cache_prefetch(val_ds)

In [13]:
# Longest standardized text in the dataset. text.numpy() is a bytes object, so
# this is a length in bytes, not a token count. Falls back to 0 when the
# dataset is empty.
max_sequence_length = max(
    (len(text.numpy()) for text, label in dataset),
    default=0,
)
print(max_sequence_length)

143


2025-05-09 22:07:33.138100: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [15]:
import keras_hub

bert_name = "bert_tiny_en_uncased"
sequence_length = 256  # used for both the preset keyword and the build shape

# NOTE(review): the keyword is spelled `sequence_lengths` (plural) — confirm
# against the keras_hub documentation that this is the intended argument name.
classifier = keras_hub.models.TextClassifier.from_preset(
    bert_name,
    sequence_lengths=sequence_length,
    num_classes=3,
)
classifier.build(input_shape=(None, sequence_length))
classifier.summary()

In [23]:
# Stop training once val_loss has not improved by at least 0.02 for two
# consecutive epochs, and restore the weights of the best epoch.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=2,
        min_delta=0.02,
        restore_best_weights=True
        )
# A callback only takes effect when it is handed to fit(); previously it was
# created but never passed, so early stopping was silently inactive here.
classifier.fit(
        train_ds,
        epochs=5,
        validation_data=val_ds,
        callbacks=[early_stopping_cb],
        verbose=2
        )

Epoch 1/5
172/172 - 39s - 224ms/step - loss: 0.7795 - sparse_categorical_accuracy: 0.6810 - val_loss: 0.7195 - val_sparse_categorical_accuracy: 0.7025
Epoch 2/5
172/172 - 38s - 219ms/step - loss: 0.6945 - sparse_categorical_accuracy: 0.7141 - val_loss: 0.6503 - val_sparse_categorical_accuracy: 0.7310
Epoch 3/5
172/172 - 38s - 220ms/step - loss: 0.6415 - sparse_categorical_accuracy: 0.7391 - val_loss: 0.6052 - val_sparse_categorical_accuracy: 0.7495
Epoch 4/5
172/172 - 38s - 221ms/step - loss: 0.6062 - sparse_categorical_accuracy: 0.7531 - val_loss: 0.5774 - val_sparse_categorical_accuracy: 0.7575
Epoch 5/5
172/172 - 38s - 219ms/step - loss: 0.5775 - sparse_categorical_accuracy: 0.7648 - val_loss: 0.5490 - val_sparse_categorical_accuracy: 0.7840


<keras.src.callbacks.history.History at 0x3078117f0>

In [24]:
# Evaluate on the test split in batches of 16; the value returned by
# evaluate() is shown as the cell output.
test_batches = test_ds.batch(16)
classifier.evaluate(test_batches)

[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - loss: 0.5393 - sparse_categorical_accuracy: 0.7972


[0.5389913320541382, 0.7905774116516113]

In [34]:
import keras
import keras_hub

bert_name = "bert_tiny_en_uncased"

# Preprocessor and backbone come from the same preset, both with trainable=False.
preprocess_layer = keras_hub.models.BertPreprocessor.from_preset(bert_name, trainable=False)
backbone = keras_hub.models.Backbone.from_preset(bert_name, trainable=False)

# Raw strings -> preprocessor -> backbone.
text_input = keras.Input(shape=(), dtype=tf.string, name='text')
outputs = backbone(preprocess_layer(text_input))

# Classification head applied to the backbone's 'pooled_output':
# flatten -> dropout -> 512-unit ReLU layer -> 3-way softmax.
head_layers = [
    keras.layers.Flatten(),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dense(3, activation='softmax'),
]
net = outputs['pooled_output'][:, :]
for layer in head_layers:
    net = layer(net)

model = keras.Model(text_input, net, name='bert_sentiment_model_1')

# The final layer already applies softmax, hence from_logits=False.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
model.compile(optimizer="adam", loss=loss_fn, metrics=['accuracy'])
model.summary()

# Author's note (kept verbatim below because the string is displayed as the
# cell output). English translation: "The results are not particularly good.
# If trainable=True is set on the backbone, the results are clearly better
# (also because more parameters can be trained)."
"""
Die Ergebnisse sind nicht besonders gut. Wenn im backbone trainable=True gesetzt wird, sind die Ergebnisse deutlich besser (auch weil mehr Parameter trainiert werden können).
"""


'\nDie Ergebnisse sind nicht besonders gut. Wenn im backbone trainable=True gesetzt wird, sind die Ergebnisse deutlich besser (auch weil mehr Parameter trainiert werden können).\n'

In [35]:
# Train the frozen-backbone model with early stopping; the History object
# returned by fit() is shown as the cell output.
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
    callbacks=[early_stopping_cb],
    verbose=2,
)

Epoch 1/5
172/172 - 16s - 93ms/step - accuracy: 0.4386 - loss: 1.0672 - val_accuracy: 0.4465 - val_loss: 1.0245
Epoch 2/5
172/172 - 14s - 79ms/step - accuracy: 0.4621 - loss: 1.0365 - val_accuracy: 0.4480 - val_loss: 1.0466
Epoch 3/5
172/172 - 14s - 84ms/step - accuracy: 0.4668 - loss: 1.0353 - val_accuracy: 0.4790 - val_loss: 1.0089


<keras.src.callbacks.history.History at 0x5def78170>