In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from cleaner import to_dotless_text, clean_text
import pandas as pd
import numpy as np


In [None]:
# Load the training table from parquet (the file name suggests it has already
# been cleaned). Later cells read its 'dotless' and 'text' columns.
train_parquet_path = '../dataset/cleaned_train.parquet'
data = pd.read_parquet(train_parquet_path)

In [None]:
# Word-level vocabulary for the 'dotless' column, which becomes the model input.
# num_words limits which word ids are emitted when encoding; every other word
# is encoded as the '<OOV>' token.
dotless_texts = data['dotless']
tokenizer = Tokenizer(oov_token='<OOV>', num_words=10000)
tokenizer.fit_on_texts(dotless_texts)

# word_index holds every word seen while fitting, so this count is not capped
# by num_words.
word_index = tokenizer.word_index
print(f"word index: {len(word_index)}")

# Encode each row as a list of word ids, then force a fixed length of 100 ids:
# shorter rows are padded at the end, longer rows are cut at the end.
dotless_sequences = tokenizer.texts_to_sequences(dotless_texts)
padded_dotless = pad_sequences(
    dotless_sequences,
    maxlen=100,
    padding='post',
    truncating='post',
)

print(f"padded shape: {padded_dotless.shape}")


In [None]:
# Word-level vocabulary for the 'text' column, which is the target side (y).
#
# This cell uses its own names (`text_tokenizer`, `text_word_index`) on purpose:
# rebinding `tokenizer` / `word_index` here would replace the tokenizer fitted
# on data['dotless'], and any later code that encodes model inputs through
# `tokenizer` would then look words up in the wrong vocabulary.
text_tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
text_tokenizer.fit_on_texts(data['text'])

# word_index holds every word seen while fitting, so this count is not capped
# by num_words.
text_word_index = text_tokenizer.word_index
print(f"word index: {len(text_word_index)}")

# Encode the targets with the same fixed length and end-padding / end-truncation
# as the inputs so both arrays have the same number of positions per row.
text_sequences = text_tokenizer.texts_to_sequences(data['text'])
padded_text = pad_sequences(text_sequences, maxlen=100, padding='post', truncating='post')

print(f"padded shape: {padded_text.shape}")

In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the padded dotless id sequences; targets are the padded id
# sequences of the original text.
X, y = padded_dotless, padded_text

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Build and compile the sequence model (training happens in a later cell).
# Layers, in order:
#   Embedding(10000 -> 256) -> 3 x Bidirectional(LSTM(256, return_sequences=True))
#   -> Dense(1024, relu) -> Dropout(0.5) -> Dense(10000, softmax)
# Every recurrent layer returns the full sequence, so the final softmax is
# produced for each of the 100 input positions.
# NOTE(review): an earlier description of this cell called for two BiGRU
# layers; the code builds three BiLSTM layers. Confirm which one is intended.
# NOTE(review): the Embedding layer does not mask padding, so padded positions
# take part in the loss and the accuracy metric. Confirm that is acceptable.

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

model = Sequential(
    [Embedding(10000, 256, input_length=100)]
    + [Bidirectional(LSTM(256, return_sequences=True)) for _ in range(3)]
    + [
        Dense(1024, activation='relu'),
        Dropout(0.5),
        Dense(10000, activation='softmax'),
    ]
)

# Targets are integer ids rather than one-hot vectors, hence the sparse loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Sequences are padded/truncated to 100 ids upstream.
model.build(input_shape=(None, 100))
model.summary()


In [None]:
from tensorflow.keras.models import load_model

# Train for 10 epochs; the held-out split is passed only as validation data.
model.fit(x=X_train, y=y_train, epochs=10, validation_data=(X_test, y_test))

# Write the trained network to disk and read it straight back, so the code
# that follows runs against exactly what was saved.
model_path = 'rnn_model.h5'
model.save(model_path)
model = load_model(model_path)

def predict_text(text):
    """Run the loaded model on a single text string.

    The text goes through clean_text and then to_dotless_text, is encoded with
    the module-level `tokenizer`, padded or truncated at the end to 100 ids,
    and passed to the module-level `model`.

    NOTE(review): `tokenizer` has to be the tokenizer fitted on
    data['dotless'], since that vocabulary produced the training inputs.
    Confirm that nothing rebinds the name before this function is called.

    Args:
        text: the input string; it does not need to be cleaned or converted
            beforehand, because both steps are applied here.

    Returns:
        Whatever model.predict returns for the one-row batch (presumably
        per-position probabilities over the output vocabulary).
    """
    dotless = to_dotless_text(clean_text(text))
    encoded = tokenizer.texts_to_sequences([dotless])
    model_input = pad_sequences(encoded, maxlen=100, padding='post', truncating='post')
    return model.predict(model_input)

# predict_text does the cleaning and the dotless conversion itself, so it is
# given the raw sentence. Converting the sentence first would make clean_text
# run on text that has already been made dotless.
text_arabic = "ويكيبيديا مشروع تعاوني متعدد اللغات يضم ويكيات بأكثر من"
prediction = predict_text(text_arabic)
print(f"prediction: {prediction}")