# **Reading the datasets**

In [None]:
dataPath = "ENTER YOUR TRAIN/VALIDATION DATA PATH"
dictionary_path = 'ENTER YOUR Dictionary PATH'

In [None]:
!wget https://raw.githubusercontent.com/mohataher/arabic-stop-words/master/list.txt -O arabic_stopwords.txt



In [None]:
from google.colab import drive
# Mount Google Drive (follow the link and enter the authorization code)
drive.mount('/content/drive')

In [None]:
!pip install pyspark



In [None]:
import os

# PYTHONHASHSEED has to be fixed before the SparkContext is created so that
# every Python worker hashes strings the same way (PySpark requires this for
# hash-partitioned operations on string keys such as reduceByKey).
os.environ['PYTHONHASHSEED'] = '0'

import pyspark
from pyspark import SparkContext

# getOrCreate() reuses a context that is already running, so re-running this
# cell no longer fails with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

# RDD of text lines read from the path configured in `dataPath`.
train_data = sc.textFile(dataPath)




In [None]:
!pip install pyspark findspark


# **Creating a dataset**


In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used for the DataFrame work below.
spark = (
    SparkSession.builder
    .appName("excel_to_rdd")
    .getOrCreate()
)

# Load the dictionary CSV: first row is the header, column types are inferred.
df = spark.read.csv(
    dictionary_path,
    header=True,
    inferSchema=True,
    encoding="UTF-8",
)
rdd = df.rdd


In [None]:
rdd.take(10)

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import re

def strip_arabic_diacritics(word):
    """Return `word` with every character in U+064B..U+065F removed.

    Falsy values (None, empty string) and non-string values are returned
    unchanged.
    """
    if not word or not isinstance(word, str):
        return word
    return re.sub(r'[\u064B-\u065F]', '', word)

strip_arabic_diacritics_udf = udf(strip_arabic_diacritics, StringType())



In [None]:
# Add a "word_stripped" column holding the diacritic-free form of "word".
df_with_stripped = df.withColumn(
    "word_stripped",
    strip_arabic_diacritics_udf(df["word"]),
)

# Row-level RDD view of the extended DataFrame.
rdd = df_with_stripped.rdd


In [None]:
def map_function(row):
    """Key a row by its `word_stripped` attribute: returns (row.word_stripped, row)."""
    key = row.word_stripped
    return key, row
def reduce_function(value1, value2):
    """Merge two values for the same key into one flat list.

    Either argument may be a single record or a list of records produced by
    an earlier merge. Both sides are normalised to lists before they are
    concatenated, so the result is the same flat list regardless of how the
    merges are grouped. The previous version nested or wrapped `value2` when
    it was already a list.

    Returns:
        A new list containing the records of `value1` followed by those of
        `value2`.
    """
    left = value1 if isinstance(value1, list) else [value1]
    right = value2 if isinstance(value2, list) else [value2]
    return left + right

dictionary = rdd.map(map_function).reduceByKey(reduce_function)


In [None]:
dictionary.collect()



In [None]:
from pyspark.sql.functions import collect_list

# For every stripped word, gather all of its original (diacritized) spellings.
grouped_df = (
    df_with_stripped
    .groupBy("word_stripped")
    .agg(collect_list("word").alias("original_words"))
)

# Display the grouped rows without truncating long cell values.
grouped_df.show(truncate=False)


# **Preprocessing the stemmer**

In [None]:
!wget https://raw.githubusercontent.com/mohataher/arabic-stop-words/master/list.txt -O arabic_stopwords.txt


In [None]:
import re

# Assuming 'train_data' is your dataset that you want to process

# Read the downloaded stop-word list (one entry per line) into a set.
with open('arabic_stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = set(
        file.read().splitlines()
    )

# Assuming 'train_data' is a collection of sentences
def remove_diacritics(sentence):
    """Split `sentence` on whitespace and strip the listed diacritic marks from each token.

    Returns a pair: a list of (stripped_token, position) tuples, where
    position is the token's index in the split sentence, and a fresh change
    log containing only 'remove_diacritics'.
    """
    stripped = []
    for position, token in enumerate(sentence.split()):
        stripped.append((re.sub('[ًٌٍَُِّْ]', '', token), position))
    return stripped, ['remove_diacritics']

def remove_numbers(sentence):
    """Delete every run of digits from each token.

    `sentence` is a (tokens, changes) pair where tokens is a list of
    (word, index) tuples. Tokens that become empty are kept, so the list
    length and indices are unchanged. Returns the new token list and a copy
    of the change log with 'remove_numbers' appended.
    """
    tokens, history = sentence
    digitless = []
    for token, position in tokens:
        digitless.append((re.sub(r'\d+', '', token), position))
    return digitless, history + ['remove_numbers']

def remove_punctuation(sentence):
    """Drop every character that is neither a word character nor whitespace.

    `sentence` is a (tokens, changes) pair where tokens is a list of
    (word, index) tuples. Returns the cleaned token list (same length and
    indices) and a copy of the change log with 'remove_punctuation' appended.
    """
    tokens, history = sentence
    cleaned = []
    for token, position in tokens:
        cleaned.append((re.sub(r'[^\w\s]', '', token), position))
    return cleaned, history + ['remove_punctuation']

def remove_stopwords(sentence, stopwords):
    """Blank out every token that appears in `stopwords`.

    `sentence` is a (tokens, changes) pair where tokens is a list of
    (word, index) tuples. Stop words are replaced by '' rather than removed,
    so indices stay aligned. Returns the new token list and a copy of the
    change log with 'remove_stopwords' appended.
    """
    tokens, history = sentence
    kept = []
    for token, position in tokens:
        if token in stopwords:
            kept.append(('', position))
        else:
            kept.append((token, position))
    return kept, history + ['remove_stopwords']

def remove_extra_spaces_and_reconstruct(sentence):
    """Drop blank tokens and rebuild the sentence text.

    `sentence` is a (tokens, changes) pair where tokens is a list of
    (word, index) tuples. Returns a triple: the surviving words joined by
    single spaces, the surviving (word, index) tuples with their original
    indices, and a copy of the change log with 'remove_extra_spaces' appended.
    """
    tokens, history = sentence
    # A token survives only if it has non-whitespace content.
    rebuilt = ' '.join(token for token, _ in tokens if token.strip() != '')
    surviving = [pair for pair in tokens if pair[0].strip() != '']
    return rebuilt, surviving, history + ['remove_extra_spaces']

# Run every line through the cleaning steps, in order.
train_data_transformed = (
    train_data
    .map(remove_diacritics)
    .map(remove_punctuation)
    .map(remove_numbers)
    .map(remove_extra_spaces_and_reconstruct)
)
# Stop-word removal is currently disabled:
#                                  .map(lambda s: remove_stopwords(s, stopwords)) \

# Bring all (sentence, word_mappings, changes) triples to the driver.
results = train_data_transformed.collect()



In [None]:
# Print the first 10 original/transformed pairs for inspection.
# take(10) fetches only the rows that are shown instead of collecting the
# whole RDD to the driver a second time; zip() stops at the shorter input.
for original, (transformed_sentence, word_mappings, changes) in zip(train_data.take(10), results):
    print(f"Original: {original}")
    print(f"Transformed: {transformed_sentence}")
    # The token mappings and the change log are deliberately not printed.
    print()


In [None]:
transformed_sentences = [transformed_sentence for transformed_sentence, _, _ in results]

# Now, `transformed_sentences` contains all the full sentences after processing.
# You can pass this list to your stemming tool or further processing steps.


In [None]:
transformed_sentences

# **Stemmer**

In [None]:
pip install farasapy


In [None]:
from farasa.pos import FarasaPOSTagger
from farasa.ner import FarasaNamedEntityRecognizer
from farasa.diacratizer import FarasaDiacritizer
from farasa.segmenter import FarasaSegmenter
from farasa.stemmer import FarasaStemmer

stemmer = FarasaStemmer()


In [None]:
!pip install tqdm

# **Diacritics Generation**

In [None]:
!pip install openai
import openai

## **one item test**

In [None]:
index= 0
#train_data.take(50)[index],
test = remove_extra_spaces_and_reconstruct(remove_diacritics(train_data.take(50)[index]))[0], results[index][0], results[index][1]
test

In [None]:
from farasa.stemmer import FarasaStemmer
import tqdm

# Initialize the Farasa stemmer
stemmer = FarasaStemmer()


def stem_sentence(sentence, filtered_sentence, wordlist):
    """Pair every (word, index) entry of `wordlist` with the token at that index in `sentence`.

    No stemmer is invoked here: the "stemmed" token is simply the
    whitespace-separated token of `sentence` at the word's index.

    Args:
        sentence: Text whose tokens are looked up by index.
        filtered_sentence: Passed through to the result untouched.
        wordlist: Iterable of (word, word_index) tuples.

    Returns:
        (sentence, filtered_sentence, tuples) where every tuple has the form
        (word_index, stemmed_word, word). `stemmed_word` is None when
        `word_index` lies beyond the last token of `sentence`. The previous
        version emitted (word, word_index, None) in that case, which did not
        match the order callers unpack.
    """
    stemmed_words = sentence.split()

    new_tuples = []
    for word, word_index in wordlist:
        if word_index < len(stemmed_words):
            stemmed_word = stemmed_words[word_index]
        else:
            stemmed_word = None
        # Same field order in both cases.
        new_tuples.append((word_index, stemmed_word, word))
    return sentence, filtered_sentence, new_tuples

# `stem_sentence` returns, as its third element, tuples of the form (word index, stemmed word, original word)
#test = train_data.take(500)[index], remove_extra_spaces_and_reconstruct(remove_diacritics(train_data.take(50)[index]))[0], results[index][0], new_tuples

#test
#stem_sentence(remove_extra_spaces_and_reconstruct(remove_diacritics(train_data.take(50)[index]))[0], results[index][0],  results[index][1])

In [None]:
dictionary.lookup("أول")

In [None]:
dictionary.take(1)

In [None]:
#test[3][15][2]

In [None]:


def generate_oneWord_diacritics(word,stemmed_word, sentence, dictionary,target):
    """Build one chat-format example asking for the diacritization of `word` in `sentence`.

    No API request is made; the function only assembles and returns a dict.
    `stemmed_word` and `dictionary` are accepted but not used.

    Args:
        word: The word the prompt asks about.
        stemmed_word: Unused.
        sentence: The sentence giving the word its context.
        dictionary: Unused.
        target: Content of the assistant message (the expected answer).

    Returns:
        {"messages": [system, user, assistant]}; each message is a dict with
        "role" and "content" keys.
    """
    # User prompt (Arabic): asks to analyse the word's use in the sentence and
    # add the diacritics that match its meaning and grammatical role.
    user_prompt = "بالنظر إلى الجملة التالية: '{}', يرجى تحليل استخدام كلمة '{}' وإضافة التشكيلات الصوتية اللازمة لها بناءً على معناها في هذا السياق. تأكد من أن التشكيل يعكس النطق الدقيق للكلمة ويتوافق مع دورها النحوي والمعنوي في الجملة.".format(sentence, word)

    system_message = {"role": "system", "content": "أنت خبير لغوي، يرجى تقديم توضيح للمعنى وتشكيل الكلمة بشكل مباشر."}
    user_message = {"role": "user", "content": user_prompt}
    assistant_message = {"role": "assistant", "content": target}

    return {"messages": [system_message, user_message, assistant_message]}

# Example usage (make sure to define 'test' and 'dictionary' appropriately)
#x = generate_oneWord_diacritics(test[3][15], test[1], dictionary)


In [None]:
import string


def replace_word(word,sentence,word_index):
    """Return `sentence` with the token at `word_index` replaced by `word`.

    The sentence is split on whitespace and re-joined with single spaces, so
    runs of whitespace in the input are collapsed.
    """
    tokens = sentence.split()
    tokens[word_index] = word
    rebuilt = " ".join(tokens)
    return rebuilt

def update_diacritcs(LLM_output, word, sentence, word_index):
  """Insert the diacritized form of `word` found in `LLM_output` into `sentence`.

  ASCII punctuation is removed from `LLM_output`, then each token is tested:
  the first one whose diacritic-free form equals `word` and that is longer
  than that form (i.e. it actually carries diacritics) replaces the token at
  `word_index` in `sentence`. If no token qualifies, `sentence` is returned
  unchanged.
  """
  punctuation_free = LLM_output.translate(str.maketrans('', '', string.punctuation))
  for candidate in punctuation_free.split():
    # remove_diacritics returns ([(token, idx), ...], log); take the token text.
    bare = remove_diacritics(candidate)[0][0][0]
    if bare == word and len(bare) != len(candidate):
      return replace_word(candidate, sentence, word_index)
  return sentence

def create_diacritics(inputSentence, dictionary):
  """Call generate_oneWord_diacritics for every word of a pre-processed sentence.

  `inputSentence[0]` is used as the sentence text and `inputSentence[2]` is
  iterated as (word_index, stemmed_word, word) triples. The value returned is
  the result of the last call only.

  NOTE(review): generate_oneWord_diacritics, as defined in this notebook,
  takes five required arguments (word, stemmed_word, sentence, dictionary,
  target); the call below passes four, so it raises TypeError as written.
  NOTE(review): if `inputSentence[2]` is empty, `LLM_output` is never
  assigned and the return statement raises UnboundLocalError.
  """
  # The sentence text; the line that would update it per word is commented out below.
  output = inputSentence[0]
  for word_index, stemmed_word, word in inputSentence[2]:
    # NOTE(review): missing the `target` argument — confirm the intended value.
    LLM_output = generate_oneWord_diacritics(word,stemmed_word, output, dictionary)
    #output = update_diacritcs(LLM_output, word, output, word_index)
  return LLM_output



In [None]:
#create_diacritics(test[1:], dictionary)

In [None]:
#test[1:][2][0][2]

In [None]:
#test[3][15][0]

In [None]:
#remove_punctuation(remove_diacritics(diacriticized_word[4]))[0][0][0] == test[3][15][2]


## **Bulk test**

In [None]:
#test = train_data.take(500)[index], remove_extra_spaces_and_reconstruct(remove_diacritics(train_data.take(500)[index]))[0], results[index][0], new_tuples
#stem_sentence(remove_extra_spaces_and_reconstruct(remove_diacritics(train_data.take(50)[index]))[0], results[index][0],  results[index][1])


def save_checkpoint(data, filename):
    """Append every string in `data` to `filename`, one per line.

    The file is opened in append mode, so earlier checkpoints written to the
    same path are preserved. It is written as UTF-8 explicitly so that Arabic
    text is stored the same way regardless of the platform's default
    encoding.

    Args:
        data: Iterable of strings.
        filename: Path of the checkpoint file.
    """
    with open(filename, 'a', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in data)
dataset = []
# Kept for the (currently disabled) checkpointing code further down.
outputs = []
data = train_data.take(501)
# Process at most 500 sentences, but never more than were actually read or
# pre-processed; the fixed range(500) raised IndexError on smaller inputs.
for index in range(min(500, len(data), len(results))):
    target = data[index]
    # Original (diacritized) tokens; the token at a word's index is the
    # expected answer for that word.
    target_words = target.split()
    filtered_text = remove_extra_spaces_and_reconstruct(remove_diacritics(target))[0]
    sample = stem_sentence(filtered_text, results[index][0], results[index][1])
    output = sample[0]
    for word_index, stemmed_word, word in sample[2]:
        LLM_output = generate_oneWord_diacritics(word, stemmed_word, output, dictionary, target_words[word_index])
        dataset.append(LLM_output)

#    generated_text = create_diacritics(sample, dictionary)

    # Collect the current iteration's output

#    # Checkpoint every 10 iterations
#    if (index + 1) % 10 == 0:
#        save_checkpoint(outputs, f'/content/drive/MyDrive/BulkTestGPT35AlmaanyDictionarySinglePromptFinetunedDataset/Dataset_checkpoint_{index // 10}.txt')
#        outputs = []  # Reset the outputs list for the next batch
#
## Save any remaining outputs after the final iteration
#if outputs:
#    save_checkpoint(outputs, f'/content/drive/MyDrive/BulkTestGPT35AlmaanyDictionarySinglePromptFinetuned/Dataset_checkpoint_final.txt')



In [None]:
dataset

## error check the data

In [None]:
!pip -q install datasets tiktoken openai

In [None]:
# Format error checks
import json
import os
import tiktoken
import numpy as np
from collections import defaultdict

# Tally of structural problems found in `dataset`, keyed by problem name.
format_errors = defaultdict(int)

for ex in dataset:
    # Every example must be a dict ...
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # ... carrying a non-empty "messages" list.
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        if not all(k in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        if not (content and isinstance(content, str)):
            format_errors["missing_content"] += 1

    # An example without any assistant message has nothing to train on.
    if all(message.get("role", None) != "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")

In [None]:
# Token counting functions
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Approximate the token count of a whole conversation.

    For each message this adds `tokens_per_message`, the encoded length of
    every value in the message dict, and `tokens_per_name` when a "name" key
    is present. A constant 3 is added once per conversation.
    """
    total = 3  # fixed per-conversation overhead
    for message in messages:
        total += tokens_per_message
        for field, text in message.items():
            total += len(encoding.encode(text))
            if field == "name":
                total += tokens_per_name
    return total

def num_assistant_tokens_from_messages(messages):
    """Return the total encoded length of the content of all assistant messages."""
    return sum(
        len(encoding.encode(message["content"]))
        for message in messages
        if message["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print min/max, mean/median and the 5th/95th percentiles of `values`.

    The percentile line is labelled "p5 / p95", so the 0.05 and 0.95
    quantiles are used; the previous version printed the 0.1 and 0.9
    quantiles under that label. An empty input prints "no values" instead of
    raising from min().

    Args:
        values: Sequence of numbers.
        name: Label printed in the heading.
    """
    print(f"\n#### Distribution of {name}:")
    if len(values) == 0:
        print("no values")
        return
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

In [None]:
# Warnings and tokens counts
# Per-example statistics gathered over the whole dataset.
n_messages = []
convo_lens = []
assistant_message_lens = []
n_missing_system = 0
n_missing_user = 0

for ex in dataset:
    messages = ex["messages"]
    # Count examples that have no system / no user message at all.
    if all(message["role"] != "system" for message in messages):
        n_missing_system += 1
    if all(message["role"] != "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
# Number of conversations whose token count exceeds 4096.
n_too_long = sum(1 for l in convo_lens if l > 4096)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

In [None]:
# Pricing and default n_epochs estimate
# Tokens beyond this count are not billed for a single example.
MAX_TOKENS_PER_EXAMPLE = 4096

# Bounds used to derive the epoch estimate below.
TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_train_examples = len(dataset)
n_epochs = TARGET_EPOCHS
# Scale the epoch count so that examples * epochs stays within the target range.
if n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)

# Each example is billed for at most MAX_TOKENS_PER_EXAMPLE tokens.
n_billing_tokens_in_dataset = sum(
    min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens
)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")
print("See pricing page to estimate total costs")


In [None]:
import json

def save_to_jsonl(conversations, file_path):
    """Write each item of `conversations` to `file_path` as one JSON document per line.

    The file is overwritten if it already exists.
    """
    with open(file_path, 'w') as file:
        file.writelines(
            json.dumps(conversation) + '\n' for conversation in conversations
        )

In [None]:
# Specify the full path including the file name and extension
file_path = '/content/drive/MyDrive/BulkTestGPT35AlmaanyDictionarySinglePromptFinetunedDataset/finetuning_dataset.jsonl'

# Now, call your function with the corrected path
save_to_jsonl(dataset, file_path)


## Create Train Test Data


In [None]:
pip install pandas scikit-learn

In [None]:
from google.colab import drive
# Mount Google Drive (follow the link and enter the authorization code)
drive.mount('/content/drive')

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the fine-tuning dataset (one JSON document per line).
df = pd.read_json('/content/drive/MyDrive/BulkTestGPT35AlmaanyDictionarySinglePromptFinetunedDataset/finetuning_dataset.jsonl', lines=True)

# Display the first few rows of the dataframe
print(df.head())

# Hold out 10% of the examples for validation (test_size=0.1 -> 90% training,
# 10% validation). A fixed random_state makes the split reproducible when the
# notebook is re-run; adjust `test_size` as needed.
train_df, validation_df = train_test_split(df, test_size=0.1, random_state=42)

# Write both splits next to the source file, again one JSON document per line.
train_df.to_json('/content/drive/MyDrive/BulkTestGPT35AlmaanyDictionarySinglePromptFinetunedDataset/train_finetuning_dataset.jsonl', orient='records', lines=True)
validation_df.to_json('/content/drive/MyDrive/BulkTestGPT35AlmaanyDictionarySinglePromptFinetunedDataset/val_finetuning_dataset.jsonl', orient='records', lines=True)
