## Eindopdracht 3: Moderne Netwerkarchitecturen
Onno de Jong, 1909878

In [2]:
import numpy as np
import tensorflow as tf

In [None]:
# Load the full corpus and lowercase it so upper- and lowercase letters share one token.
with open("data/iliad.txt", encoding="utf-8") as corpus_file:
    iliad = corpus_file.read().lower()

In [8]:
# Show the first 200 characters as a quick sanity check that the file loaded.
iliad[:200]

'the project gutenberg ebook of the iliad\n    \nthis ebook is for the use of anyone anywhere in the united states and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever.'

In [20]:
class Tokenizer:
    """Character-level tokenizer that maps each distinct symbol to an integer id.

    Ids are assigned by sorted symbol order, so the mapping is deterministic
    for a given training text.
    """

    tokens: dict[str, int]          # symbol -> id
    reverse_tokens: dict[int, str]  # id -> symbol
    n: int                          # vocabulary size (number of distinct symbols)

    def __init__(self):
        self.tokens = {}
        self.reverse_tokens = {}
        self.n = 0

    def train(self, data):
        """Build the vocabulary from the distinct items in ``data``.

        Any previously learned vocabulary is replaced, so the two mappings
        and ``n`` always describe the same set of symbols. Empty ``data``
        yields an empty vocabulary with ``n == 0``.

        Args:
            data: Iterable of hashable, sortable symbols (typically a string).
        """
        # A set makes the uniqueness check O(1) per item instead of scanning a list.
        vocabulary = sorted(set(data))
        self.tokens = {symbol: index for index, symbol in enumerate(vocabulary)}
        self.reverse_tokens = dict(enumerate(vocabulary))
        self.n = len(vocabulary)

    def tokenize(self, data):
        """Convert symbols to token ids.

        Args:
            data: A single-character string, a longer (or empty) string, or
                an iterable of any of these.

        Returns:
            An ``int`` for a single-character string, otherwise a (possibly
            nested or empty) list of ints mirroring the structure of ``data``.

        Raises:
            KeyError: If a symbol was not seen during training.
            TypeError: If ``data`` is neither a string nor iterable.
        """
        if isinstance(data, str):
            if len(data) == 1:
                return self.tokens[data]
            return [self.tokens[symbol] for symbol in data]

        # Any other iterable, including one-element and empty ones, maps to a list.
        return [self.tokenize(item) for item in data]

    def decode(self, data):
        """Convert token ids back to symbols.

        Args:
            data: A single token id, or an iterable (possibly nested) of ids.

        Returns:
            The symbol for a single id, otherwise a list of symbols mirroring
            the structure of ``data``.

        Raises:
            KeyError: If an id is not part of the vocabulary.
        """
        if hasattr(data, '__iter__'):
            return [self.decode(item) for item in data]

        return self.reverse_tokens[data]

In [21]:
# Learn the character vocabulary from the whole corpus.
tokenizer = Tokenizer()
tokenizer.train(iliad)

# Show the symbol -> id mapping followed by the vocabulary size.
print(tokenizer.tokens, tokenizer.n, sep="\n")

{'\n': 0, ' ': 1, '!': 2, '#': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9, '*': 10, ',': 11, '-': 12, '.': 13, '/': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, ';': 26, '?': 27, '[': 28, ']': 29, '_': 30, 'a': 31, 'b': 32, 'c': 33, 'd': 34, 'e': 35, 'f': 36, 'g': 37, 'h': 38, 'i': 39, 'j': 40, 'k': 41, 'l': 42, 'm': 43, 'n': 44, 'o': 45, 'p': 46, 'q': 47, 'r': 48, 's': 49, 't': 50, 'u': 51, 'v': 52, 'w': 53, 'x': 54, 'y': 55, 'z': 56, '§': 57, 'à': 58, 'ä': 59, 'æ': 60, 'è': 61, 'é': 62, 'ê': 63, 'ë': 64, 'ï': 65, 'ò': 66, 'ô': 67, 'ö': 68, 'ù': 69, 'ü': 70, 'œ': 71, 'α': 72, 'β': 73, 'γ': 74, 'δ': 75, 'ε': 76, 'η': 77, 'θ': 78, 'ι': 79, 'κ': 80, 'λ': 81, 'μ': 82, 'ν': 83, 'ξ': 84, 'ο': 85, 'π': 86, 'ρ': 87, 'ς': 88, 'σ': 89, 'τ': 90, 'υ': 91, 'φ': 92, 'χ': 93, 'ω': 94, 'ἀ': 95, 'ἂ': 96, 'ἄ': 97, 'ἆ': 98, 'ἐ': 99, 'ἑ': 100, 'ἕ': 101, 'ἡ': 102, 'ἤ': 103, 'ἦ': 104, 'ἰ': 105, 'ἱ': 106, 'ἴ': 107, 'ἵ': 108, 'ἷ': 109, 'ὀ': 110

Correcte tokenizer

In [11]:
def sliding_window(data, kernel_size, tokenizer):
    """Turn a sequence into (window, next item) training pairs of token ids.

    Window ``i`` holds the ids of ``data[i:i + kernel_size]`` and its target
    is the id of ``data[i + kernel_size]``.

    Args:
        data: Sequence of symbols (typically a string).
        kernel_size: Number of items per input window; must be at least 1.
        tokenizer: Object whose ``tokenize(item)`` returns an integer id for
            a single item of ``data``.

    Returns:
        A tuple ``(X, Y)``. ``X`` is an integer array of shape
        ``(len(data) - kernel_size, kernel_size, 1)`` and ``Y`` is a list of
        the matching target ids. When ``data`` is not longer than
        ``kernel_size`` there are no pairs, so ``X`` has shape
        ``(0, kernel_size, 1)`` and ``Y`` is empty.

    Raises:
        ValueError: If ``kernel_size`` is smaller than 1.
    """
    if kernel_size < 1:
        raise ValueError("kernel_size must be at least 1")

    # Tokenize every item exactly once; the windows overlap, so tokenizing
    # each window separately would repeat the work kernel_size times.
    encoded = np.array([tokenizer.tokenize(item) for item in data], dtype=int)

    n_windows = len(encoded) - kernel_size
    if n_windows <= 0:
        return np.empty((0, kernel_size, 1), dtype=int), []

    # The view has one more window than needed: the last one has no target.
    windows = np.lib.stride_tricks.sliding_window_view(encoded, kernel_size)
    # np.array copies the read-only strided view into a normal writable array.
    X = np.array(windows[:n_windows]).reshape(n_windows, kernel_size, 1)
    Y = encoded[kernel_size:].tolist()
    return X, Y

Dataset opbouwen met de sliding window functie en normaliseren volgens aantal tokens in tokenizer

In [12]:
# Cut the corpus into 100-character windows, each paired with the character that follows it.
x, y = sliding_window(data=iliad, kernel_size=100, tokenizer=tokenizer)

# Scale the integer ids into [0, 1) by dividing by the vocabulary size.
samples = np.divide(x, tokenizer.n)

print(samples.shape)
print("sample", samples[:1])
print("target", y[:1])

(1116689, 100, 1)
sample [[[0.34246575]
  [0.26027397]
  [0.23972603]
  [0.00684932]
  [0.31506849]
  [0.32876712]
  [0.30821918]
  [0.2739726 ]
  [0.23972603]
  [0.2260274 ]
  [0.34246575]
  [0.00684932]
  [0.25342466]
  [0.34931507]
  [0.34246575]
  [0.23972603]
  [0.30136986]
  [0.21917808]
  [0.23972603]
  [0.32876712]
  [0.25342466]
  [0.00684932]
  [0.23972603]
  [0.21917808]
  [0.30821918]
  [0.30821918]
  [0.28082192]
  [0.00684932]
  [0.30821918]
  [0.24657534]
  [0.00684932]
  [0.34246575]
  [0.26027397]
  [0.23972603]
  [0.00684932]
  [0.26712329]
  [0.28767123]
  [0.26712329]
  [0.21232877]
  [0.23287671]
  [0.        ]
  [0.00684932]
  [0.00684932]
  [0.00684932]
  [0.00684932]
  [0.        ]
  [0.34246575]
  [0.26027397]
  [0.26712329]
  [0.33561644]
  [0.00684932]
  [0.23972603]
  [0.21917808]
  [0.30821918]
  [0.30821918]
  [0.28082192]
  [0.00684932]
  [0.26712329]
  [0.33561644]
  [0.00684932]
  [0.24657534]
  [0.30821918]
  [0.32876712]
  [0.00684932]
  [0.34246575]


Targets omzetten naar een one-hot-gecodeerde representatie

In [13]:
# Pin the one-hot width to the vocabulary size. Without num_classes the width is
# inferred from the largest id in y, which would not match the model's
# Dense(tokenizer.n) output if the highest token id never occurs as a target.
targets = tf.keras.utils.to_categorical(y, num_classes=tokenizer.n)

Model opbouwen en trainen

In [14]:
# Two stacked LSTM layers with dropout, ending in a softmax over the vocabulary.
model = tf.keras.Sequential()
model.add(tf.keras.Input(shape=(100, 1)))  # 100 time steps, one scaled token id each
model.add(tf.keras.layers.LSTM(256, return_sequences=True))  # full sequence feeds the next LSTM
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.LSTM(256))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(tokenizer.n, activation='softmax'))

In [15]:
# Categorical cross-entropy pairs with the one-hot targets and the softmax output layer.
model.compile(optimizer="adam", loss="categorical_crossentropy")

In [16]:
# Save the model to disk only when the training loss improves on the best epoch so far.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "./models/model.keras",
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

model.fit(
    samples,
    targets,
    batch_size=128,
    epochs=20,
    callbacks=[checkpoint],
)

Epoch 1/20
[1m8723/8725[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 30ms/step - loss: 3.0949
Epoch 1: loss improved from inf to 3.07672, saving model to ./models/model.keras
[1m8725/8725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 30ms/step - loss: 3.0949
Epoch 2/20
[1m8724/8725[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 31ms/step - loss: 2.9818
Epoch 2: loss improved from 3.07672 to 2.89763, saving model to ./models/model.keras
[1m8725/8725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 31ms/step - loss: 2.9818
Epoch 3/20
[1m8724/8725[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 31ms/step - loss: 2.6853
Epoch 3: loss improved from 2.89763 to 2.59758, saving model to ./models/model.keras
[1m8725/8725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 31ms/step - loss: 2.6853
Epoch 4/20
[1m8724/8725[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 31ms/step - loss: 2.4073
Epoch 4: loss improved from 2.59758 to 2.36177, 

<keras.src.callbacks.history.History at 0x7eff86cd5190>

Trainen duurde erg lang. Na een tijdje klooien met WSL en CUDA heb ik het uiteindelijk aan de praat gekregen op Google Colab: ongeveer 4,5 tot 5,5 minuten per epoch.

In [17]:
# Reload the checkpoint file that the ModelCheckpoint callback wrote during training.
best_model = tf.keras.models.load_model("./models/model.keras", compile=True)

In [32]:
def predict(seq, n, window=100):
    """Greedily extend a seed of token ids by ``n`` characters and print the text.

    At every step the last ``window`` ids are scaled by the vocabulary size,
    fed to the module-level ``best_model``, and the most probable next id is
    appended. The seed and the full generated sequence are decoded with the
    module-level ``tokenizer`` and printed. Nothing is returned.

    Args:
        seq: Seed token ids: a sequence of ints or of length-1 arrays, such
            as ``list(x[i])``. The caller's object is not modified.
        n: Number of characters to generate.
        window: Number of trailing ids fed to the model per step; must match
            the sequence length the model was built with.

    Raises:
        ValueError: If ``window`` is smaller than 1 or the seed holds fewer
            than ``window`` ids.
    """
    # Work on a flat private copy so the caller's list is left untouched.
    tokens = [int(token) for token in np.ravel(seq)]
    seed_len = len(tokens)
    if window < 1 or seed_len < window:
        raise ValueError(
            f"seed must contain at least window={window} token ids, got {seed_len}"
        )

    for _ in range(n):
        context = np.array(tokens[-window:]) / tokenizer.n  # same scaling as the training samples
        probabilities = best_model.predict(context.reshape(1, window, 1), verbose=0)
        tokens.append(int(np.argmax(probabilities)))  # greedy: always take the most likely id

    # Decode id by id so this works for sequences of any length.
    seed_text = "".join(tokenizer.decode(token) for token in tokens[:seed_len])
    full_text = "".join(tokenizer.decode(token) for token in tokens)

    print("Original sample:\n", seed_text)
    print("")
    print("Volledig voorspelde sample:\n", full_text)

In [33]:
# Seed with the window that starts at character 2000 and generate 1000 more characters.
predict(list(x[2000]), 1000)

Original sample:
  casting his spear at mars
 juno
 hector chiding paris
 the meeting of hector and andromache
 bows a

Volledig voorspelde sample:
  casting his spear at mars
 juno
 hector chiding paris
 the meeting of hector and andromache
 bows and the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sea

In [None]:
# Seed with the window that starts at character 6000 and generate 1000 more characters.
predict(list(x[6000]), 1000)

Original sample:
 ere thy sway the curse of meaner powers,
and thou the shame of any host but ours!
a host, by jove en

Volledig voorspelde sample:
 ere thy sway the curse of meaner powers,
and thou the shame of any host but ours!
a host, by jove ene the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the t

In [34]:
# Seed with the window that starts at character 9000 and generate 1000 more characters.
predict(list(x[9000]), 1000)

Original sample:
 f ithagenes.
although poor, he married, and the result of the union was a girl named
critheïs. the g

Volledig voorspelde sample:
 f ithagenes.
although poor, he married, and the result of the union was a girl named
critheïs. the gods and sound the sanks of his conpuest of the sanks of his the sage of high;
the sanks of his the sanks of his the sage of homes and saised the sanks of his the sanks of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sears of the sear

Het is nog niet bepaald een woordentovenaar... Hij herhaalt de hele tijd dezelfde woorden. Hij heeft in ieder geval een paar woorden geleerd.
In de laatste sample zie je wel dat hij een echt stukje tekst heeft gemaakt: "the gods and sound the sanks of his conpuest of the sanks of his the sage of high;". Niet dat dit stukje tekst ergens op slaat, maar er zijn bepaalde patronen, zoals de spelling van de woorden en de zinsstructuur, die duidelijk laten zien dat er iets getraind is.

Een paar verbeteringen zouden zijn:
- (Veel) langere trainingstijden. Grote LLM's worden getraind verspreid over honderden GPU's, gedurende duizenden uren. Maar in dit geval, met deze architectuur, weet ik niet hoeveel er nog uit te halen is met nog langer trainen.
- Betere tokenizer. Mijn ervaring met het trainen/finetunen van grotere modellen is dat de tokenizer daar grotere tokens gebruikt: dus ook kleine, veelgebruikte woorden en woorddelen.
- Een grotere, gevarieerde trainingset zou ook helpen met het verbeteren van het model
- Andere architectuur. Het model dat we nu hebben heeft niet heel erg veel weights; een groter model zou er meer nuances uit kunnen halen, denk ik. Ook de architectuur van een LSTM kan worden verbeterd door bijvoorbeeld naar een transformermodel te gaan zoals ChatGPT, of een mixture of experts (MoE) zoals Mixtral.