In [1]:
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding, InputLayer
import string

In [2]:
# df = pd.read_csv("train.csv")
# len(df)

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/CostiCTI/CourseML/refs/heads/main/Part2-Models/train.csv")

In [4]:
data = list(df['text'])

In [5]:
data[0]

'acest document mi-a deschis cu adevarat ochii la ceea ce oamenii din afara statelor unite s-au gandit la atacurile din 11 septembrie. acest film a fost construit in mod expert si prezinta acest dezastru ca fiind mai mult decat un atac asupra pamantului american. urmarile acestui dezastru sunt previzionate din multe tari si perspective diferite. cred ca acest film ar trebui sa fie mai bine distribuit pentru acest punct. de asemenea, el ajuta in procesul de vindecare sa vada in cele din urma altceva decat stirile despre atacurile teroriste. si unele dintre piese sunt de fapt amuzante, dar nu abuziv asa. acest film a fost extrem de recomandat pentru mine, si am trecut pe acelasi sentiment.'

In [6]:
def generate_subsentences(sentences: list[str]) -> list[str]:
    result = []
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    for sentence in sentences:
        try:
            clean_sentence = sentence.translate(translator)
            words = clean_sentence.split()
            for length in range(2, 5):
                for start in range(len(words) - length + 1):
                    subsentence = ' '.join(words[start:start + length])
                    result.append(subsentence)
            result.append("<start>" + " " + sentence.split(" ")[0])
        except:
            pass
    return result

#sentences = ["the cat sat on the table", "i like it"]
#print(generate_subsentences(sentences))

In [7]:
props = generate_subsentences(data)
print (len(props))

4090603


In [8]:
import random

random.shuffle(props)

In [9]:
props[0]

'de productie'

In [10]:
props = props[:500000]
print (len(props))

500000


In [11]:
NO_WORDS = 2000

In [12]:
tokenizer = Tokenizer(num_words=NO_WORDS, oov_token='unktoken')
tokenizer.fit_on_texts(props)

In [13]:
len(tokenizer.index_word)

43834

In [14]:
tokenizer.index_word[1]

'unktoken'

In [16]:
#tokenizer.index_word

In [17]:
oftenit = []
for k, v in tokenizer.index_word.items():
    if k <= NO_WORDS:
        oftenit.append(v)
print (len(oftenit))

2000


In [18]:
len(oftenit)

2000

In [19]:
#tokenizer.index_word

In [20]:
props[10]

'ca un individ'

In [21]:
sequences = tokenizer.texts_to_sequences(props)

In [22]:
sequences[10]

[9, 11, 1]

In [23]:
sequences[124]

[4, 21, 9, 10]

In [30]:
xsequences = []
for seq in sequences:
    if len(seq) > 4 or seq[-1] == 1:
        pass
    else:
        xsequences.append(seq)
print (len(xsequences))

410561


In [31]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

padded = pad_sequences(xsequences, padding='pre')

In [32]:
padded[124]

array([  9, 351, 163,   7], dtype=int32)

In [33]:
print (padded[32])
print (padded[100])
print (padded[124])

[  1 116  29 715]
[ 0 31  1  6]
[  9 351 163   7]


In [34]:
len(padded)

410561

In [35]:
X, y = padded[:,:-1], padded[:,-1]
y = to_categorical(y, num_classes=NO_WORDS + 1)

In [36]:
from sklearn.model_selection import train_test_split
X_train = X[:350000]
X_test = X[350000:]
y_train = y[:350000]
y_test = y[350000:]
print (X_train.shape)
print (y_train.shape)
print (X_test.shape)
print (y_test.shape)

(350000, 3)
(350000, 2001)
(60561, 3)
(60561, 2001)


In [37]:
def top_3_accuracy(y_true, y_pred):
    y_true = tf.cast(tf.argmax(y_true, axis=-1), tf.int32)  # Convert one-hot to integer labels
    top_3 = tf.math.top_k(y_pred, k=3).indices
    matches = tf.reduce_any(tf.equal(tf.expand_dims(y_true, -1), top_3), axis=-1)
    return tf.reduce_mean(tf.cast(matches, tf.float32))

In [39]:
model = Sequential()
model.add(InputLayer(input_shape=(3, ), dtype=np.int32))
model.add(Embedding(NO_WORDS + 1, 8, input_length=4))
model.add(LSTM(8))
model.add(Dense(NO_WORDS + 1, activation='softmax'))
print(model.summary())

None


In [40]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', top_3_accuracy])

In [41]:
model.fit(X_train, y_train, batch_size=16, epochs=4, validation_data=(X_test, y_test))

Epoch 1/4
[1m21875/21875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 4ms/step - accuracy: 0.0551 - loss: 5.8795 - top_3_accuracy: 0.1230 - val_accuracy: 0.0911 - val_loss: 5.4227 - val_top_3_accuracy: 0.1934
Epoch 2/4
[1m21875/21875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 3ms/step - accuracy: 0.1040 - loss: 5.3442 - top_3_accuracy: 0.2059 - val_accuracy: 0.1393 - val_loss: 5.1494 - val_top_3_accuracy: 0.2380
Epoch 3/4
[1m21875/21875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 3ms/step - accuracy: 0.1425 - loss: 5.0832 - top_3_accuracy: 0.2433 - val_accuracy: 0.1530 - val_loss: 5.0072 - val_top_3_accuracy: 0.2518
Epoch 4/4
[1m21875/21875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 4ms/step - accuracy: 0.1552 - loss: 4.9483 - top_3_accuracy: 0.2573 - val_accuracy: 0.1575 - val_loss: 4.9321 - val_top_3_accuracy: 0.2633


<keras.src.callbacks.history.History at 0x7bb5c0c0f2d0>

In [42]:
preds = model.predict(X_test)

[1m1893/1893[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step


In [45]:
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
results = []
wordsr = []
for pred in preds:
    ar = pred.argsort()[-3:][::-1]
    results.append([ar[0], ar[1], ar[2]])
    wordsr.append([reverse_word_map[ar[0]], reverse_word_map[ar[1]], reverse_word_map[ar[2]]])

In [46]:
testy = [np.argmax(x) for x in y_test]

In [47]:
len(testy)

60561

In [48]:
acc1 = 0
acc2 = 0
acc3 = 0
for i in range(len(results)):
    if results[i][0] == testy[i]:
        acc1 += 1
    if testy[i] in results[i][:2]:
        acc2 += 1
    if testy[i] in results[i][:3]:
        acc3 += 1
print ('R1:', acc1 / len(testy))
print ('R2:', acc2 / len(testy))
print ('R3:', acc3 / len(testy))

R1: 0.15746107230726045
R2: 0.2236257657568402
R3: 0.2633873284787239


In [49]:
aux = "salut ce faci"
example = "imi place"

In [50]:
example_seq = tokenizer.texts_to_sequences([aux, example])
print (example_seq)

[[1, 25, 783], [99, 140]]


In [51]:
example_padded = pad_sequences(example_seq, padding='pre')
print (example_padded)

[[  1  25 783]
 [  0  99 140]]


In [52]:
pred = model.predict(example_padded)[1]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


In [53]:
ar = pred.argsort()[-3:][::-1]
res = [ar[0], ar[1], ar[2]]
words_pred = [reverse_word_map[ar[0]], reverse_word_map[ar[1]], reverse_word_map[ar[2]]]

In [54]:
res

[7, 20, 5]

In [55]:
words_pred

['sa', 'acest', 'in']