# In [None]:

# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, Dropout, Bidirectional

# Step 1: Load the dataset (your dataset with Harakat)
# The file is expected to provide two columns: "Text_without_Harakat" and "Text_with_Harakat".
df = pd.read_csv('harakat_data.csv')

# Empty cells are read as NaN (a float), which has no string methods and would make
# the text preprocessing step fail. A pair with a missing side cannot be used for
# training anyway, so drop such rows up front and renumber the index.
df = df.dropna(subset=['Text_without_Harakat', 'Text_with_Harakat']).reset_index(drop=True)

# Step 2: Data Preprocessing
def preprocess_text(text):
    """Collapse line breaks into spaces and trim surrounding whitespace.

    Only line-break handling and stripping happen here; diacritics (harakat)
    and punctuation are left untouched, so the same function can be applied
    to both the plain and the vocalised column.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every newline character replaced by a single space and
        leading/trailing whitespace removed.
    """
    # Use the escape sequence: a raw line break inside a single-quoted
    # string literal is a SyntaxError.
    return text.replace("\n", " ").strip()

# Run the same cleaning routine over the plain and the vocalised column
for column_name in ('Text_without_Harakat', 'Text_with_Harakat'):
    df[column_name] = df[column_name].apply(preprocess_text)

# Step 3: Tokenization
# One character-level vocabulary shared by the inputs and the targets.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(df['Text_without_Harakat'].tolist() + df['Text_with_Harakat'].tolist())

# Get the number of unique characters
num_tokens = len(tokenizer.word_index) + 1  # Add 1 for padding (index 0)

# Convert text to sequence of tokens (characters)
X = tokenizer.texts_to_sequences(df['Text_without_Harakat'].tolist())
Y = tokenizer.texts_to_sequences(df['Text_with_Harakat'].tolist())

# Padding the sequences to have the same length.
# The length has to cover the targets as well: the vocalised text carries extra
# diacritic characters, so sizing on X alone makes pad_sequences truncate Y
# (by default from the front), silently corrupting the training targets.
max_len = max(len(seq) for seq in X + Y)
X_pad = tf.keras.preprocessing.sequence.pad_sequences(
    X, maxlen=max_len, padding='post', truncating='post')
Y_pad = tf.keras.preprocessing.sequence.pad_sequences(
    Y, maxlen=max_len, padding='post', truncating='post')
# NOTE(review): X and Y are now the same padded length but are presumably still
# not aligned position-by-position (Y contains additional diacritic tokens);
# consider re-framing the targets as one diacritic label per input character.

# Step 4: Data Splitting (Train and Validation)
X_train, X_val, Y_train, Y_val = train_test_split(X_pad, Y_pad, test_size=0.2, random_state=42)

# Step 5: Model Architecture
# Character embedding -> bidirectional LSTM emitting one vector per time step ->
# dropout -> per-time-step softmax over the character vocabulary.
model = Sequential()
model.add(Embedding(num_tokens, 128, input_length=max_len))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Dense(num_tokens, activation='softmax'))

# Step 6: Compile the Model
# The sparse variant of the loss consumes integer token ids directly, so the
# targets do not have to be expanded into a dense
# (samples, max_len, num_tokens) one-hot tensor that can exhaust memory.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 7: Model Training with Early Stopping
# restore_best_weights keeps the weights from the best validation epoch rather
# than the ones left after the `patience` non-improving epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train, Y_train, epochs=10, batch_size=64,
                    validation_data=(X_val, Y_val),
                    callbacks=[early_stopping])

# Step 8: Model Evaluation
# Draw one figure per tracked metric, each comparing the training curve with
# the validation curve across epochs.
for metric_key, metric_title in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    train_curve = history.history[metric_key]
    val_curve = history.history[f'val_{metric_key}']
    plt.plot(train_curve, label=f'train {metric_key}')
    plt.plot(val_curve, label=f'val {metric_key}')
    plt.title(f'Model {metric_title}')
    plt.xlabel('Epochs')
    plt.ylabel(metric_title)
    plt.legend()
    plt.show()

# Step 9: Inference (Prediction)
def predict_harakat(text):
    """Predict the vocalised (with Harakat) form of ``text``.

    The input is cleaned with ``preprocess_text`` (the same routine applied to
    the training data), encoded with the module-level ``tokenizer``, padded to
    ``max_len`` and passed through the module-level ``model``. The most likely
    token at every position is decoded back into characters.

    Args:
        text: Text without Harakat.

    Returns:
        The predicted string. Positions predicted as the padding index (0) or
        as an id unknown to the tokenizer contribute no character.
    """
    text_seq = tokenizer.texts_to_sequences([preprocess_text(text)])
    # Over-long input keeps its beginning instead of being cut from the front
    # (the pad_sequences default).
    text_pad = tf.keras.preprocessing.sequence.pad_sequences(
        text_seq, maxlen=max_len, padding='post', truncating='post')
    pred = model.predict(text_pad)
    token_ids = np.argmax(pred, axis=-1)[0]
    # Decode manually: Tokenizer.sequences_to_texts joins tokens with spaces,
    # which for a character-level tokenizer puts a space between every
    # predicted character.
    return "".join(tokenizer.index_word.get(int(token_id), "") for token_id in token_ids)

# Quick manual check: vocalise a single word and show it next to the input
input_text = "المدرسة"
predicted_text = predict_harakat(input_text)
print(
    f"Input: {input_text}",
    f"Predicted with Harakat: {predicted_text}",
    sep="\n",
)
