In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GRU, Conv1D, GlobalMaxPooling1D, TimeDistributed, Dropout, RepeatVector, Concatenate
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
import pickle
import os
import typing
from typing import Any, Tuple
import einops
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from collections import Counter
import string
import re
import pathlib
data = pd.read_csv("ara_eng.txt", sep="\t", header=None, names=["english", "arabic"])
data.head()
num_duplicates = data.duplicated().sum()
print(f"Number of Duplicate Rows: {num_duplicates}")
# Remove duplicates from data
data = data.drop_duplicates()

num_duplicates = data.duplicated().sum()
print(f"Number of Duplicate Rows: {num_duplicates}")
import plotly.subplots as sp
import plotly.graph_objects as go

input_lengths = [len(seq.split()) for seq in data['english']]
output_lengths = [len(seq.split()) for seq in data['arabic']]

fig = sp.make_subplots(rows=1, cols=2, subplot_titles=('English Sentence Lengths', 'Arabic Sentence Lengths'))

hist_input = go.Histogram(x=input_lengths, nbinsx=50, name='English')
hist_output = go.Histogram(x=output_lengths, nbinsx=50, name='Arabic')

fig.add_trace(hist_input, row=1, col=1)
fig.add_trace(hist_output, row=1, col=2)

fig.update_layout(showlegend=False, title_text='Distribution of Sentence Lengths')
fig.update_xaxes(title_text='Sentence Length', row=1, col=1)
fig.update_xaxes(title_text='Sentence Length', row=1, col=2)

fig.show()
import plotly.graph_objects as go

unique_words_input = len(set(word for seq in data['english'] for word in seq.split()))
unique_words_output = len(set(word for seq in data['arabic'] for word in seq.split()))

fig = go.Figure()

fig.add_trace(go.Bar(x=['English'], y=[unique_words_input], name='English'))
fig.add_trace(go.Bar(x=['Arabic'], y=[unique_words_output], name='Arabic'))

fig.update_layout(title_text='Total Number of Unique Words in Each Language', barmode='group', xaxis_title='Language', yaxis_title='Total Unique Words')

fig.show()
import plotly.express as px

# Count the number of entries with only less than 5 words for English
english_one_word_count = data['english'].apply(lambda x: 1 if len(str(x).split()) < 5 else 0).sum()

# Count the number of entries with only less than 5 words for Arabic
arabic_one_word_count = data['arabic'].apply(lambda x: 1 if len(str(x).split()) < 5 else 0).sum()

# Create a DataFrame for the plot
info = {'Language': ['English', 'Arabic'], 'Entries with less than 5 words': [english_one_word_count, arabic_one_word_count]}
df_plot = pd.DataFrame(info)

# Plot the graph using Plotly Express
fig = px.bar(df_plot, x='Language', y='Entries with less than 5 words', text='Entries with less than 5 words',
             title='Number of Entries with less than 5 words',
             labels={'Entries with less than 5 words': 'Number of Entries'})

# Show the plot
fig.show()
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1) 

  return tf.cast(pos_encoding, dtype=tf.float32)
pos_encoding = positional_encoding(length=2048, depth=512)

# Check the shape.
print(pos_encoding.shape)

# Plot the dimensions.
plt.pcolormesh(pos_encoding.numpy().T, cmap='RdBu')
plt.ylabel('Depth')
plt.xlabel('Position')
plt.colorbar()
plt.show()


In [77]:
def load_data(data):
  context = np.array([context for context in data["english"]])
  target = np.array([target for target in data["arabic"]])

  return target, context

target_raw, context_raw = load_data(data)
print(context_raw[-1])

print(target_raw[-1])

you will also find links to some free web based platforms to create and save your creations in the week prior to international mother language day we ll be sharing retweeting and liking contributions from around the world and featuring some of our favorites here on rising voices.
ستجد ايضا روابط لمجموعة من منصات ابداع الميم المجانية لمساعدتك على ابتكار وحفظ ما تنتجه خلال الشهر السابق لليوم العالمي للغة الام سنواصل تعزيز الفكرة من خلال نشر واعادة تغريد المشاركات من جميع انحاء العالم وتخصيص مساحة لتلك المفضلة لدينا هنا على الاصوات الصاعدة


In [80]:
BUFFER_SIZE = len(context_raw)
BATCH_SIZE = 64
is_train = np.random.uniform(size=(len(target_raw),)) < 0.8
train_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw, target_raw))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))

val_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw[~is_train], target_raw[~is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))

In [None]:
def tf_lower_and_split_punct(text):
  # Split accented characters.
  text = tf_text.normalize_utf8(text, 'NFKD')
  text = tf.strings.lower(text)
  # Keep space, a to z, and select punctuation.
  text = tf.strings.regex_replace(text, '[^ a-zا-ي.?!,¿]', '')
  # Add spaces around punctuation.
  text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
  # Strip whitespace.
  text = tf.strings.strip(text)

  text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
  return text

In [81]:
max_vocab_size = 60000

context_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_vocab_size,
    ragged=True)

context_text_processor.adapt(train_raw.map(lambda context, target: context))

# Here are the first 10 words from the vocabulary:
print(context_text_processor.get_vocabulary()[:10])
print(context_text_processor.get_vocabulary()[-10:])
print(len(context_text_processor.get_vocabulary()))
target_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_vocab_size,
    ragged=True)

target_text_processor.adapt(train_raw.map(lambda context, target: target))
print(target_text_processor.get_vocabulary()[:10])
print(len(target_text_processor.get_vocabulary()))

NameError: name 'tf_lower_and_split_punct' is not defined

In [82]:
def process_text(context, target):
  context = context_text_processor(context).to_tensor()
  target = target_text_processor(target)
  targ_in = target[:,:-1].to_tensor()
  targ_out = target[:,1:].to_tensor()
  return (context, targ_in), targ_out


train_ds = train_raw.map(process_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, tf.data.AUTOTUNE)

NameError: in user code:

    File "C:\Users\Abdelrahman Abdullah\AppData\Local\Temp\ipykernel_1032\2396223496.py", line 2, in process_text  *
        context = context_text_processor(context).to_tensor()

    NameError: name 'context_text_processor' is not defined


In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True) 
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()
class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x
class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x) 
    return x
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x
class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

    # Add dropout.
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x  # Shape `(batch_size, seq_len, d_model)`.
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x
class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x
class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    context, x = inputs  # Unpack the context and target
    context = self.encoder(context)  # (batch_size, context_len, d_model)
    x = self.decoder(x, context)  # (batch_size, target_len, d_model)

    logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

    try:
        del logits._keras_mask
    except AttributeError:
        pass

    return logits



In [None]:
bert_tokenizer_params=dict(lower_case=True)
reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = dict(
    # The target vocabulary size
    vocab_size = 40000,
    # Reserved tokens that must be included in the vocabulary
    reserved_tokens=reserved_tokens,
    # Arguments for `text.BertTokenizer`
    bert_tokenizer_params=bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    learn_params={},
)

In [None]:
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
train_en_raw = tf.constant(train_df['english'].values)
train_ar_raw = tf.constant(train_df['arabic'].values)

NameError: name 'tf' is not defined

In [None]:
is_train = np.random.uniform(size=(len(train_en_raw),)) < 0.8
BUFFER_SIZE = len(train_en_raw)
BATCH_SIZE = 16

# Train And Validation Split
train_ds = (tf.data.Dataset
                    .from_tensor_slices((train_en_raw, train_ar_raw)))

validation_ds = (tf.data.Dataset
                    .from_tensor_slices((train_en_raw[~is_train], train_ar_raw[~is_train])))

In [None]:
# Extract English And Arabic sentences for tokenization
train_en = train_ds.map(lambda en, ar: en)
train_ar = train_ds.map(lambda en, ar: ar)

In [None]:
def write_vocab_file(filepath, vocab):
  with open(filepath, 'w') as f:
    for token in vocab:
      print(token, file=f)

In [None]:
# Train the tokenizers on the data
ar_vocab = bert_vocab.bert_vocab_from_dataset(
    train_ar.prefetch(tf.data.AUTOTUNE),
    **bert_vocab_args
)

en_vocab = bert_vocab.bert_vocab_from_dataset(
    train_en.prefetch(tf.data.AUTOTUNE),
    **bert_vocab_args
)

In [None]:
# Save the tokenizers' vocabulary
directory = './tokenizer/subword/'
if not os.path.exists(directory):
    os.makedirs(directory)

write_vocab_file('./tokenizer/subword/ar_vocab.txt', ar_vocab)
write_vocab_file('./tokenizer/subword/en_vocab.txt', en_vocab)

In [None]:
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

In [None]:
def add_start_end(ragged):
  count = ragged.bounding_shape()[0]
  starts = tf.fill([count,1], START)
  ends = tf.fill([count,1], END)
  return tf.concat([starts, ragged, ends], axis=1)

def cleanup_text(reserved_tokens, token_txt):
  # Drop the reserved tokens, except for "[UNK]".
  bad_tokens = [re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"]
  bad_token_re = "|".join(bad_tokens)

  bad_cells = tf.strings.regex_full_match(token_txt, bad_token_re)
  result = tf.ragged.boolean_mask(token_txt, ~bad_cells)

  # Join them into strings.
  result = tf.strings.reduce_join(result, separator=' ', axis=-1)

  return result

class CustomTokenizer(tf.Module):
  def __init__(self, reserved_tokens, vocab_path):
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    vocab = pathlib.Path(vocab_path).read_text().splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:   

    # Include a tokenize signature for a batch of strings. 
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shapes [tokens] and [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    return tf.constant(self._reserved_tokens)

In [None]:
tokenizers = tf.Module()
tokenizers.ar = CustomTokenizer(reserved_tokens, './tokenizer/subword/ar_vocab.txt')
tokenizers.en = CustomTokenizer(reserved_tokens, './tokenizer/subword/en_vocab.txt')

In [None]:
model_name = './tokenizer/subword/en_ar_tokenizer'
tf.saved_model.save(tokenizers, model_name)

In [None]:
tokenizers = tf.saved_model.load(model_name)
tokenizers.en.get_vocab_size().numpy()

In [None]:
tokens = tokenizers.en.tokenize(['Hello TensorFlow! ksdjfgsdjg'])
tokens.numpy()

In [None]:
text_tokens = tokenizers.en.lookup(tokens)
text_tokens

In [None]:
round_trip = tokenizers.en.detokenize(tokens)

print(round_trip.numpy()[0].decode('utf-8'))

In [None]:
MAX_TOKENS=128
def prepare_batch(en, ar):
    en = tokenizers.en.tokenize(en)      # Output is ragged.
    en = en[:, :MAX_TOKENS]    # Trim to MAX_TOKENS.
    en = en.to_tensor()  # Convert to 0-padded dense Tensor

    ar = tokenizers.ar.tokenize(ar)
    ar = ar[:, :(MAX_TOKENS+1)]
    ar_inputs = ar[:, :-1].to_tensor()  # Drop the [END] tokens
    ar_labels = ar[:, 1:].to_tensor()   # Drop the [START] tokens

    return (en, ar_inputs), ar_labels

In [None]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

def make_batches(ds):
  return (
      ds
      .shuffle(BUFFER_SIZE)
      .batch(BATCH_SIZE)
      .map(prepare_batch, tf.data.AUTOTUNE)
      .prefetch(buffer_size=tf.data.AUTOTUNE)
  )

In [None]:
# Create training and validation set batches.
train_batches = make_batches(train_ds)
val_batches = make_batches(validation_ds)

In [None]:
for (en_i, ar_i), ar_o in train_batches.take(1):
    print(tokenizers.en.detokenize(en_i)[0])
    print(tokenizers.ar.detokenize(ar_i)[0].numpy().decode())    
    print(tokenizers.ar.detokenize(ar_o)[0].numpy().decode())
    print(tokenizers.en.lookup(en_i)[0])
    print(tokenizers.ar.lookup(ar_i)[0])    
    print(tokenizers.ar.lookup(ar_o)[0])