In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install torch

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import random

In [None]:
import json
with open("/content/drive/MyDrive/Disertatie/Lyrics_ExtractTopics_12.06.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
texts = []
themes = []
for song in data:
    theme = song.get("theme", "unknown")
    lyrics = song["lyrics"].replace("\n", " ").strip()
    if lyrics:
        texts.append(f"{theme.lower()} {lyrics.lower()}")
        themes.append(theme.lower())

In [None]:
vocab_size = 10000
max_len = 30
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

input_sequences = []
for line in texts:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [None]:
max_seq_len = max([len(x) for x in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
x, y = input_sequences[:,:-1], input_sequences[:,-1]
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

In [None]:
print(max_seq_len)

2204


In [None]:
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_seq_len - 1))
model.add(LSTM(128, implementation=2))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [None]:
from tqdm.auto import tqdm
from tensorflow.keras.callbacks import Callback
import time

class CustomProgressBar(Callback):
    def on_train_begin(self, logs=None):
        self.epochs = self.params.get('epochs', 0)
        self.steps = self.params.get('steps', 0)
        self.epoch_bar = tqdm(total=self.epochs, desc="Training", position=0)

    def on_epoch_begin(self, epoch, logs=None):
        self.step_bar = tqdm(total=self.steps, desc=f"Epoch {epoch+1}", position=1, leave=False)
        self.epoch_start = time.time()

    def on_train_batch_end(self, batch, logs=None):
        self.step_bar.update(1)

    def on_epoch_end(self, epoch, logs=None):
        self.step_bar.close()
        duration = time.time() - self.epoch_start
        self.epoch_bar.set_postfix_str(f"Epoch time: {duration:.2f}s")
        self.epoch_bar.update(1)

    def on_train_end(self, logs=None):
        self.epoch_bar.close()

In [None]:
print(x.shape)
print(x.dtype)

(256904, 2203)
int32


In [None]:
x = x.astype('float32')

In [None]:
model.fit(x, y, epochs=30, batch_size=64, callbacks=[CustomProgressBar()])

Training:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 1/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 94ms/step - accuracy: 0.0521 - loss: 6.6392


Epoch 2:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 2/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.1125 - loss: 5.7004


Epoch 3:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 3/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 97ms/step - accuracy: 0.1510 - loss: 5.2184


Epoch 4:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 4/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 97ms/step - accuracy: 0.1876 - loss: 4.8055


Epoch 5:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 5/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 97ms/step - accuracy: 0.2225 - loss: 4.4367


Epoch 6:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 6/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 97ms/step - accuracy: 0.2548 - loss: 4.1358


Epoch 7:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 7/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 97ms/step - accuracy: 0.2852 - loss: 3.8761


Epoch 8:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 8/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 97ms/step - accuracy: 0.3164 - loss: 3.6379


Epoch 9:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 9/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 97ms/step - accuracy: 0.3451 - loss: 3.4204


Epoch 10:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 10/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 97ms/step - accuracy: 0.3717 - loss: 3.2361


Epoch 11:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 11/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.3994 - loss: 3.0612


Epoch 12:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 12/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.4218 - loss: 2.9214


Epoch 13:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 13/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.4458 - loss: 2.7780


Epoch 14:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 14/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 97ms/step - accuracy: 0.4675 - loss: 2.6621


Epoch 15:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 15/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.4860 - loss: 2.5455


Epoch 16:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 16/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.5057 - loss: 2.4404


Epoch 17:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 17/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m387s[0m 96ms/step - accuracy: 0.5234 - loss: 2.3476


Epoch 18:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 18/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 97ms/step - accuracy: 0.5374 - loss: 2.2614


Epoch 19:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 19/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.5517 - loss: 2.1898


Epoch 20:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 20/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.5631 - loss: 2.1252


Epoch 21:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 21/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.5755 - loss: 2.0543


Epoch 22:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 22/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.5874 - loss: 1.9875


Epoch 23:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 23/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 97ms/step - accuracy: 0.5997 - loss: 1.9322


Epoch 24:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 24/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 97ms/step - accuracy: 0.6069 - loss: 1.8827


Epoch 25:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 25/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.6156 - loss: 1.8329


Epoch 26:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 26/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 97ms/step - accuracy: 0.6245 - loss: 1.7874


Epoch 27:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 27/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.6353 - loss: 1.7370


Epoch 28:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 28/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.6420 - loss: 1.7038


Epoch 29:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 29/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.6476 - loss: 1.6674


Epoch 30:   0%|          | 0/4015 [00:00<?, ?it/s]

Epoch 30/30
[1m4015/4015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 97ms/step - accuracy: 0.6541 - loss: 1.6354


<keras.src.callbacks.history.History at 0x7d9a7a637a50>

In [None]:
model.save("/content/drive/MyDrive/lstm_kendrick_model.h5")
with open("/content/drive/MyDrive/tokenizer_kendrick.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

