In [99]:
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding, InputLayer
import string

In [100]:
# Load the training corpus from train.csv (path relative to the working directory);
# the cell's displayed value is the number of rows.
df = pd.read_csv("train.csv")
len(df)

17941

In [101]:
# Pull the 'text' column out as a plain Python list of sentences.
# NOTE(review): missing values would presumably arrive here as float NaN rather than str — confirm.
data = list(df['text'])

In [102]:
def generate_subsentences(
    sentences: list[str],
    min_length: int = 2,
    max_length: int = 5,
) -> list[str]:
    """Expand sentences into short word windows plus one sentence-start sample each.

    For every sentence, each ASCII punctuation character is replaced by a space
    and the text is split on whitespace. Every contiguous window of
    ``min_length`` to ``max_length`` words (inclusive) is emitted, shorter
    windows first and left-to-right within one length. After the windows, one
    extra entry ``"<start> " + first_token`` is appended, where ``first_token``
    is whatever precedes the first space of the *raw* sentence (punctuation is
    not stripped there, and it may be empty).

    Args:
        sentences: Input sentences. Entries that are not ``str`` (for example
            ``None`` or a float NaN) are skipped and contribute nothing.
        min_length: Smallest window size in words; must be at least 1.
        max_length: Largest window size in words.

    Returns:
        A flat list of sub-sentences, grouped by input sentence in input order.

    Raises:
        ValueError: If ``min_length`` is smaller than 1.
    """
    if min_length < 1:
        raise ValueError("min_length must be at least 1")

    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    result = []
    for sentence in sentences:
        # Skip non-text rows explicitly instead of hiding every possible
        # error behind a bare ``except``.
        if not isinstance(sentence, str):
            continue
        words = sentence.translate(translator).split()
        for length in range(min_length, max_length + 1):
            # The range is empty when the sentence has fewer than ``length`` words.
            for start in range(len(words) - length + 1):
                result.append(' '.join(words[start:start + length]))
        # Sentence-start sample: marker followed by the raw first token.
        result.append("<start>" + " " + sentence.split(" ")[0])
    return result

#sentences = ["the cat sat on the table", "i like it"]
#print(generate_subsentences(sentences))

In [103]:
# Expand every sentence into its sub-sentence samples and report how many were produced.
props = generate_subsentences(data)
print (len(props))

5413495


In [104]:
import random

# Shuffle in place with a dedicated fixed-seed generator so that the order of
# `props` (and therefore any prefix taken from it) is identical on every
# Restart & Run All, without touching the global `random` state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(props)

In [105]:
# Peek at the first sub-sentence after shuffling.
props[0]

'rapid ar fi diferit'

In [106]:
# Keep only the first 600000 entries of the (already shuffled) list and confirm the new size.
props = props[:600000]
print (len(props))

600000


In [107]:
# Fit a word-level tokenizer on the sampled sub-sentences, with 'unktoken' as the
# out-of-vocabulary placeholder.
# NOTE(review): the Keras docs state that only the `num_words - 1` most frequent words
# are kept, so index 4000 itself may never be emitted — confirm against the class count.
tokenizer = Tokenizer(num_words=4000, oov_token='unktoken')
tokenizer.fit_on_texts(props)

In [108]:
# Number of entries in the tokenizer's index -> word table.
len(tokenizer.index_word)

47074

In [109]:
# Show which word sits at index 1 of the tokenizer's table.
tokenizer.index_word[1]

'unktoken'

In [110]:
# Vocabulary cap: used below to select frequent words and to size the model
# (NO_WORDS + 1 classes). Same value as the tokenizer's num_words.
NO_WORDS = 4000

In [111]:
# Words whose tokenizer index is at most NO_WORDS, in the table's iteration order.
oftenit = [
    word
    for index, word in tokenizer.index_word.items()
    if index <= NO_WORDS
]
print (len(oftenit))

4000


In [112]:
# Size of the frequent-word list.
len(oftenit)

4000

In [113]:
#tokenizer.index_word

In [114]:
# Sample sub-sentence, for comparison with its encoded form.
props[10]

'cei mici care striga'

In [115]:
# Encode every sub-sentence as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(props)

In [116]:
# Encoded form of props[10].
sequences[10]

[131, 604, 13, 3029]

In [117]:
# Another encoded sample; this one ends in index 1 (the 'unktoken' placeholder).
sequences[124]

[1873, 295, 117, 3950, 1]

In [119]:
# Keep only sequences that can serve as (context, target) training samples:
#   * non-empty (an empty sequence has no last token to use as the target),
#   * at most MAX_SEQ_LEN tokens,
#   * not ending in the out-of-vocabulary index, which is useless as a prediction target.
OOV_INDEX = tokenizer.word_index[tokenizer.oov_token]
MAX_SEQ_LEN = 5
xsequences = [
    seq
    for seq in sequences
    if seq and len(seq) <= MAX_SEQ_LEN and seq[-1] != OOV_INDEX
]
print (len(xsequences))

525657


In [120]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Left-pad every sequence with zeros to a fixed width of 5 (4 context tokens + 1 target).
# `maxlen` is given explicitly so the width does not silently depend on the longest
# sequence that happened to survive filtering; the model's input width is hard-coded to 4.
padded = pad_sequences(xsequences, maxlen=5, padding='pre')

In [121]:
# One padded row: zeros on the left, tokens right-aligned.
padded[124]

array([   0,    0, 1680,   17,  212])

In [122]:
# Print three padded rows (the output shows rows carrying 0, 1 and 2 leading zeros).
print (padded[32])
print (padded[100])
print (padded[124])

[ 17  39 121  43  11]
[  0   9 600   2 819]
[   0    0 1680   17  212]


In [123]:
# Number of padded training rows.
len(padded)

525657

In [124]:
# Split each row into context (all columns but the last) and target (last column),
# then one-hot encode the target over NO_WORDS + 1 classes.
# NOTE(review): this materialises a dense (len(padded), NO_WORDS + 1) array; integer
# targets with a sparse cross-entropy loss would presumably avoid that — consider.
X, y = padded[:,:-1], padded[:,-1]
y = to_categorical(y, num_classes=NO_WORDS + 1)

In [125]:
from sklearn.model_selection import train_test_split

# Positional hold-out split: the first 450000 rows train the model, the rest are held out.
# (train_test_split is imported but the split is done by slicing.)
split_at = 450000
X_train, X_test = X[:split_at], X[split_at:]
y_train, y_test = y[:split_at], y[split_at:]
for part in (X_train, y_train, X_test, y_test):
    print (part.shape)

(450000, 4)
(450000, 4001)
(75657, 4)
(75657, 4001)


In [134]:
def top_3_accuracy(y_true, y_pred):
    """Fraction of samples whose true class is among the 3 highest-scoring predictions.

    ``y_true`` is reduced with argmax over its last axis to obtain the class id
    (so it is treated as one-hot style labels); ``y_pred`` holds per-class scores.
    Returns a scalar float32 tensor.
    """
    # Integer class id per sample, cast to the dtype compared against top_k's indices.
    labels = tf.argmax(y_true, axis=-1)
    labels = tf.cast(labels, tf.int32)
    # Ids of the three largest scores per sample.
    candidates = tf.math.top_k(y_pred, k=3).indices
    # Compare each label against its three candidates; a sample hits if any matches.
    label_column = tf.expand_dims(labels, -1)
    hit = tf.reduce_any(tf.equal(label_column, candidates), axis=-1)
    return tf.reduce_mean(tf.cast(hit, tf.float32))

In [135]:
# Next-word model: 4 context token ids -> 24-d embeddings -> LSTM(64) -> softmax over
# NO_WORDS + 1 classes (the extra class accounts for index 0, used for padding).
model = Sequential()
model.add(InputLayer(input_shape=(4, ), dtype=np.int32))
model.add(Embedding(NO_WORDS + 1, 24, input_length=4))
model.add(LSTM(64))
model.add(Dense(NO_WORDS + 1, activation='softmax'))
# summary() prints the table itself and returns None, so it must not be wrapped in
# print() — doing so emits a stray "None" after the table.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 4, 24)             96024     
                                                                 
 lstm_3 (LSTM)               (None, 64)                22784     
                                                                 
 dense_3 (Dense)             (None, 4001)              260065    
                                                                 
Total params: 378,873
Trainable params: 378,873
Non-trainable params: 0
_________________________________________________________________
None


In [136]:
# One-hot targets -> categorical cross-entropy; report top-1 accuracy and the
# custom top-3 hit rate during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', top_3_accuracy],
)

In [137]:
# Train for 4 epochs in batches of 128; the held-out slice is also used as validation data.
model.fit(
    X_train,
    y_train,
    epochs=4,
    batch_size=128,
    validation_data=(X_test, y_test),
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x2388ea2a850>

In [138]:
# Per-class scores for every held-out context (one row per sample).
preds = model.predict(X_test)



In [139]:
# Index -> word lookup, built by inverting the tokenizer's word -> index table.
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

TOP_K = 5
# argsort is ascending, so take the last TOP_K columns of each row and flip them:
# column 0 is then the most probable class for that sample.
top_indices = np.argsort(preds, axis=1)[:, -TOP_K:][:, ::-1]

# results: top-5 class ids per sample, best first. wordsr: the same, as words.
results = top_indices.tolist()
# Class 0 is the padding id and has no vocabulary entry; use a placeholder rather
# than raising KeyError if it ever lands in the top-k.
wordsr = [
    [reverse_word_map.get(index, '<pad>') for index in row]
    for row in results
]

In [140]:
# True class id per held-out sample, recovered from the one-hot rows in a single
# vectorised call (instead of one Python-level argmax per row); kept as a list.
testy = np.argmax(y_test, axis=1).tolist()

In [141]:
# Number of held-out labels; matches the held-out row count printed earlier.
len(testy)

75657

In [142]:
# Recall@k for k = 1, 2, 3: how often the true class appears among the first k
# ranked predictions, as a fraction of all held-out labels.
pairs = [(results[i], testy[i]) for i in range(len(results))]
acc1 = sum(1 for ranked, truth in pairs if ranked[0] == truth)
acc2 = sum(1 for ranked, truth in pairs if truth in ranked[:2])
acc3 = sum(1 for ranked, truth in pairs if truth in ranked[:3])
total = len(testy)
print ('R1:', acc1 / total)
print ('R2:', acc2 / total)
print ('R3:', acc3 / total)

R1: 0.1750796357243877
R2: 0.24408845182864772
R3: 0.2886976750333743


In [159]:
# Two hand-written contexts for a manual next-word check: a 4-word one and a 2-word one.
aux = "salut ce faci in"
example = "imi place"

In [161]:
# Encode both contexts with the fitted tokenizer (order: aux, then example) and show the ids.
example_seq = tokenizer.texts_to_sequences([aux, example])
print (example_seq)

[[1, 25, 726, 5], [99, 138]]


In [162]:
# Left-pad (or truncate) each context to exactly 4 tokens, the model's fixed input width.
# Without an explicit maxlen the width would follow the longest example in the batch and
# would only match the model by coincidence.
example_padded = pad_sequences(example_seq, maxlen=4, padding='pre')
print (example_padded)

[[  1  25 726   5]
 [  0   0  99 138]]


In [167]:
# Score both contexts and keep the class distribution of the second one (index 1, i.e. `example`).
pred = model.predict(example_padded)[1]



In [168]:
# Ids of the three highest-scoring classes for the selected example, best first,
# followed by the corresponding words.
ar = pred.argsort()[-3:][::-1]
res = [ar[rank] for rank in range(3)]
words_pred = [reverse_word_map[index] for index in res]

In [169]:
# Top-3 predicted class ids, best first.
res

[7, 20, 15]

In [170]:
# Top-3 predicted next words for the `example` context, best first.
words_pred

['sa', 'acest', 'la']