In [1]:
import pandas as pd
import torch
import re

In [2]:
def filter_arabic_only(text):
    if not isinstance(text, str):
        return ""
    return re.sub(r'[^\u0600-\u06FF\s0-9]', '', text)

In [3]:
datasetsCount = 1
combinedText = []
datasetsFeatures = ["text","story"]
for i in range(datasetsCount):
    df = pd.read_csv(f"../Data/RAW_dataset/ArabicLLM{i + 1}")
    df = df.dropna()
    df = df.drop_duplicates()
    df[datasetsFeatures[i]] = df[datasetsFeatures[i]].astype(str)
    df[datasetsFeatures[i]] = df[datasetsFeatures[i]].apply(filter_arabic_only)
    df[datasetsFeatures[i]] = df[datasetsFeatures[i]] + " <|endoftext|> "
    
   
    for j in df[datasetsFeatures[i]]:
        combinedText.append(j)

In [4]:
combinedText = combinedText[:60000]

In [5]:
len(combinedText)

60000

In [6]:


def clean_arabic_for_tiny_model(text):
    if not text or not isinstance(text, str):
        return ""
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'ـ', '', text)
    text = re.sub(r'[\u064B-\u065F]', '', text)
    text = re.sub(r'[أإآ]', 'ا', text)
    numbersMapping = str.maketrans('٠١٢٣٤٥٦٧٨٩', '0123456789')
    text = text.translate(numbersMapping)
    text = text.replace(',', '،').replace('?', '؟')
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def is_valid_line(text, min_words=10):
    if len(text.split()) < min_words:
        return False
    return True

cleanedText = []
for i in combinedText:
    if is_valid_line(i):
        cleanedText.append(clean_arabic_for_tiny_model(i))
        

combinedText = cleanedText

In [7]:
def countOfWords(lisOFWords:list) -> int:
    counter = 0
    for i in lisOFWords:
        text = i.split()
        counter = len(text) + counter
    return counter

In [8]:
countOfWords(combinedText)

41449683

In [None]:
combinedText

In [10]:
from transformers import AutoTokenizer


tokenizerGPT = AutoTokenizer.from_pretrained("gpt2")

customTokenizer = tokenizerGPT.train_new_from_iterator(combinedText, vocab_size=10000,new_special_tokens=["<QUESTION>","<ANSWER>","<|endoftext|>","[PAD]"],
                                                       initial_alphabet=[])

customTokenizer.add_special_tokens({'pad_token': '[PAD]'})

customTokenizer.save_pretrained("./TOKENIZER")

('./TOKENIZER\\tokenizer_config.json',
 './TOKENIZER\\special_tokens_map.json',
 './TOKENIZER\\vocab.json',
 './TOKENIZER\\merges.txt',
 './TOKENIZER\\added_tokens.json',
 './TOKENIZER\\tokenizer.json')

In [11]:
import shutil
import os

save_path = "./TOKENIZER/"

if os.path.exists(save_path):
    shutil.rmtree(save_path)
    print("Deleted")

os.makedirs(save_path, exist_ok=True)
customTokenizer.save_pretrained(save_path)
print("Saved")

Deleted
Saved


In [12]:
retrivedTokenizer = AutoTokenizer.from_pretrained("./TOKENIZER/")

In [None]:
ids = customTokenizer.encode(" الذهب")
decoded = customTokenizer.decode(ids)
print(ids)
print("Decoded Text:", decoded)


In [14]:
outputTokenizer = retrivedTokenizer(
    combinedText,
    truncation=True,
    padding="max_length",
    max_length=512,
    return_overflowing_tokens=True,
    return_tensors="pt"
)

In [15]:
IDs = outputTokenizer["input_ids"]

masks = outputTokenizer["attention_mask"]

In [16]:
torch.save(IDs,"../Data/preproccessedData.pt")
torch.save(masks,"../Data/preproccessedDataMASKS.pt")

tokenCOUNT = outputTokenizer["attention_mask"].sum().item()
tokenCOUNT

58588589