In [12]:
!pip install pinecone-client==3.1.0 \
  pinecone-datasets==0.7.0 \
  sentence-transformers==2.2.2 \
  pinecone-notebooks==0.1.1 \
  datasets \
  bert_score \
  rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... done
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=08ceba23fbbcbe926a7f96aa5169915958c0e254be1742e3e0318b38d1b4b8f3
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [1]:
import requests
from pinecone import Pinecone, ServerlessSpec
from transformers import AutoTokenizer, AutoModel
import torch

import os
from getpass import getpass

# Pinecone Initialization
# The API key is a secret and must never be stored in the notebook: it is read from
# the PINECONE_API_KEY environment variable, with an interactive prompt as fallback.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("arabic-dialect-translation")


# Load embedding model
# Tokenizer and encoder for intfloat/e5-base; both are used by generate_embedding.
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-base")
model = AutoModel.from_pretrained("intfloat/e5-base")

def generate_embedding(text):
    """
    Embeds text with the module-level e5 tokenizer/model using masked mean pooling.

    Args:
    - text (str or list of str): The text(s) to embed.

    Returns:
    - list: The pooled embedding as a list of floats for a single string
      (a list of such lists when several strings are passed).
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state

    # Average only over real tokens: a plain .mean(dim=1) would also average the
    # padding positions that appear whenever a batch of unequal-length texts is passed.
    # For a single string the mask is all ones, so this equals the plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (hidden_states * mask).sum(dim=1) / token_counts
    return embeddings.squeeze().tolist()

def query_pinecone(query_text, top_k=20):
    """
    Embeds the query and looks up its nearest neighbours in the Pinecone index.

    Args:
    - query_text (str): The sentence to search for.
    - top_k (int): Number of matches to request from the index.

    Returns:
    - The raw result of `index.query`, with metadata included for every match.
    """
    return index.query(
        vector=generate_embedding(query_text),
        top_k=top_k,
        include_metadata=True,
    )

def generate_prompt(query_text, pinecone_results):
  """
    Builds the LLM prompt from fixed few-shot examples, the retrieved matches and the query.

    Args:
    - query_text (str): The input query in dialectal Arabic.
    - pinecone_results (mapping): Query result exposing a `matches` sequence; every
      match must carry `metadata` with `text`, `label` and `translation` entries.

    Returns:
    - str: The formatted prompt, stripped of leading/trailing whitespace.
  """
  few_shot_examples = """
    - Sentence: "وينك؟ ما شفناك من زمان"
      Dialect: [Gulf]
      Translation: "أين أنت؟ لم نرك منذ فترة طويلة"

    - Sentence: "شو عم تعمل؟ ما شفتك من زمان"
      Dialect: [Levantine]
      Translation: "ماذا تفعل؟ لم أرك منذ فترة طويلة"

    - Sentence: "واش راك؟ ما شفتكش من مدة "
      Dialect: [North Africa]
      Translation: "كيف حالك؟ لم أرك منذ فترة "

    - Sentence: "مّا وبّا ... شحال من عام و هوما يحّجرو ( يْلَّمو ) ف الدراهم باش ينجمو يشرو ديك الماشينة ، و ف النهار اللول اللي كانو غادي يخدمو بيها ، حتى حّنّا ( الجدة ) كانت معروضة باش تجي تشوف الماشينة .  . "
      Dialect: [North Africa]
      Translation: "كان والدي ووالدتي يدخرون المال لهذا اليوم منذ سنوات لكي يستطيعوا شراء هذه الغسالة وفي اول يوم لاستخدامها دعيت جدتي لحضور ذلك الحدث لكي ترى هذه الآلة  "

    - Sentence: "إزيّك؟ ما شفتكش بقالك كتير"
      Dialect: [Egyptian]
      Translation: "كيف حالك؟ لم أرك منذ فترة طويلة"

    - Sentence: "كيف حالك؟ لم أرك منذ فترة طويلة"
      Dialect: [Stand arabic]
      Translation: "كيف حالك؟ لم أرك منذ فترة طويلة"

    """

  # One numbered line per retrieved match, numbered from 1.
  numbered_matches = []
  for position, match in enumerate(pinecone_results['matches'], start=1):
    meta = match['metadata']
    numbered_matches.append(
        '{}. "{}" - Dialect: [{}], Translation: "{}"'.format(
            position, meta['text'], meta['label'], meta['translation']
        )
    )
  retrieved_examples = "\n".join(numbered_matches)

  template = """
  Your are dialect translator assistant you have the following examples of Arabic dialect sentences and their translations in standard Arabic:
  {few_shot}

  Based on the following retrieved examples:
  {retrieved}

  Identify the dialect and provide the translation for the following sentence to only and only standard Arabic giving its dialect:

  Sentence: "{query}"
  """

  prompt = template.format(
      few_shot=few_shot_examples,
      retrieved=retrieved_examples,
      query=query_text,
  )
  return prompt.strip()

def parse_response(response):
    """
    Parses the model's response and returns a well-organized string with non-empty components.

    Each line is inspected for a `Sentence:`, `Dialect:` or `Translation:` marker
    (checked in that order); when a marker occurs on several lines the last one wins.

    Args:
    - response (str or None): The model's response. `None` or an empty string is
      treated as "nothing to parse".

    Returns:
    - str: An organized string with only the non-empty components (Sentence, Dialect, Translation),
      or an empty string when none was found.
    """
    # The generation call may hand back None/"" when the model produced nothing.
    if not response:
        return ""

    # Initialize components as empty strings
    sentence, dialect, translation = "", "", ""

    # splitlines() also handles "\r\n" endings; with split("\n") the trailing "\r"
    # would shield the closing quote/bracket from the strip calls below.
    for line in response.strip().splitlines():
        if "Sentence:" in line:
            sentence = line.split(":", 1)[1].strip('" ').strip()
        elif "Dialect:" in line:
            dialect = line.split(":", 1)[1].strip("[] ").strip()
        elif "Translation:" in line:
            translation = line.split(":", 1)[1].strip('" ').strip()

    # Build the organized response string with non-empty components
    parts = []
    if sentence:
        parts.append(f"Sentence: \"{sentence}\"")
    if dialect:
        parts.append(f"Dialect: [{dialect}]")
    if translation:
        parts.append(f"Translation: \"{translation}\"")

    return "\n".join(parts)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [2]:
import os
from getpass import getpass

# The IBM Cloud API key is a secret and must never be stored in the notebook: it is
# read from the WATSONX_API_KEY environment variable, with an interactive prompt as fallback.
API_KEY = os.environ.get("WATSONX_API_KEY") or getpass("IBM Cloud API key: ")
# watsonx.ai text-generation endpoint (eu-de region).
ENDPOINT = "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"

In [3]:
import sys
import requests
from requests.exceptions import RequestException

def get_access_token():
    """
    Exchanges the module-level API_KEY for an IAM bearer token.

    Returns:
    - str: The access token on HTTP 200.
    - None: On any failure (network error, timeout or non-200 status); the reason
      is printed.
    """
    token_url = "https://iam.cloud.ibm.com/identity/token"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = {
            "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
            "apikey": API_KEY
    }
    try:
        # requests has no default timeout; without one a stalled connection hangs the cell forever.
        response = requests.post(token_url, headers=headers, data=data, timeout=30)
    except RequestException as exc:
        # Report transport failures the same way as HTTP-level failures below.
        print(f"Error obtaining access token: {exc}")
        return None

    if response.status_code == 200:
        return response.json()["access_token"]

    print(f"Error obtaining access token: {response.text}")
    return None

ACCESS_TOKEN = get_access_token()
# Never echo the bearer token: cell outputs are saved with the notebook, and the
# token is a credential. Report only whether one was obtained.
print("Access token obtained." if ACCESS_TOKEN else "Failed to obtain access token.")

[output redacted: this cell printed an IAM bearer token containing account details]

In [4]:
def query_watsonx(prompt):
    """
    Sends the prompt to the watsonx.ai text-generation endpoint and returns the generated text.

    Args:
    - prompt (str): The full prompt to submit as the model input.

    Returns:
    - str: The first result's `generated_text`, or "No response available." when the
      service returns no results, no generated text, or a body that is not valid JSON.

    Raises:
    - requests.RequestException: On network failure or timeout.
    """
    headers = {
        "Authorization": f"Bearer {ACCESS_TOKEN}",
        "Content-Type": "application/json"
    }
    body = {
            "input": prompt,
            "parameters": {
                "decoding_method": "greedy",
                "max_new_tokens": 900,
                "repetition_penalty": 1.05,
            },
            "model_id": "sdaia/allam-1-13b-instruct",
            "project_id": "4d029c5c-71f8-459d-aa75-19f113997547"
        }

    no_response = "No response available."

    # requests has no default timeout; bound the wait so a stalled call cannot hang the notebook.
    response = requests.post(ENDPOINT, headers=headers, json=body, timeout=120)

    # Decode the body once; error pages (e.g. from a gateway) may not be JSON at all.
    try:
        payload = response.json()
    except ValueError:
        return no_response

    results = payload.get("results")
    if not results:
        return no_response

    generated_text = results[0].get("generated_text")
    # Keep the return type a string so downstream parsing never receives None.
    return generated_text if generated_text is not None else no_response


In [5]:
def process_query(query_text):
    """
    Runs the whole pipeline for one sentence: retrieve, build prompt, generate, parse.

    Args:
    - query_text (str): The sentence to identify and translate.

    Returns:
    - str: The parsed model answer as produced by `parse_response`.
    """
    matches = query_pinecone(query_text)
    raw_answer = query_watsonx(generate_prompt(query_text, matches))
    return parse_response(raw_answer)


In [6]:
# Smoke test: push a single sentence through the whole pipeline and show the parsed answer.
sample_query = "عمري ما سامع بهيج عنوان بهل المنطقة"
response = process_query(sample_query)
print(response)


Dialect: [Levantine]
Translation: "لم أسمع بهذا العنوان في هذه المنطقة من قبل"


In [23]:


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

import json
import random
# Load the evaluation set from the JSON file next to the notebook.
with open('./test_dataset_2.json', mode='r', encoding='utf-8') as dataset_file:
    test_data = json.load(dataset_file)


def evaluate_pipeline(data):
    """
    Evaluates the pipeline on a given dataset, ensuring robust handling of incomplete responses.

    Every item is run through `process_query`; the predicted dialect and translation
    are read back from the parsed answer (missing components count as empty strings).

    Args:
    - data (list): A list of dictionaries containing `text`, `translation`, and `label`.

    Prints accuracy and macro-averaged precision/recall/F1 for dialect identification,
    and the word-level corpus BLEU score for translation quality. Returns nothing.
    """
    if not data:
        # The metric functions below are undefined on empty input.
        print("No evaluation data provided; nothing to evaluate.")
        return

    true_labels = []
    predicted_labels = []
    reference_translations = []
    candidate_translations = []

    for item in data:
        query_text = item['text']
        true_labels.append(item['label'])  # True dialect label
        # corpus_bleu works on token sequences: passing raw strings would make it
        # count character n-grams, so split references and candidates into words.
        reference_translations.append([item['translation'].split()])

        response = process_query(query_text)

        # Parse response and extract components
        predicted_label = ""
        predicted_translation = ""

        for line in response.split("\n"):
            # maxsplit=1 keeps any further colons inside the value instead of truncating it.
            if "Dialect:" in line:
                predicted_label = line.split(":", 1)[1].strip("[] ").strip()
            elif "Translation:" in line:
                predicted_translation = line.split(":", 1)[1].strip('" ').strip()

        predicted_labels.append(predicted_label)
        candidate_translations.append(predicted_translation.split())

    # Summarized metrics for dialect Detection
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average='macro', zero_division=0)
    recall = recall_score(true_labels, predicted_labels, average='macro', zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, average='macro', zero_division=0)

    print("\nOverall Dialect Detection Metrics:")
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"Precision: {precision * 100:.2f}%")
    print(f"Recall: {recall * 100:.2f}%")
    print(f"F1 Score: {f1 * 100:.2f}%")

    # Evaluate Translation Quality
    print("\nEvaluate Translation Quality")
    bleu_score = corpus_bleu(reference_translations, candidate_translations) * 100
    print(f"\nBLEU Score: {bleu_score:.2f}")

    # NOTE(review): ROUGE is not reported. The previous scorer was never used and was
    # built with metric names ('rouge-1', ...) that rouge_score does not appear to
    # accept (it documents 'rouge1', 'rouge2', 'rougeL'); its default tokenizer also
    # looks ASCII-only, so Arabic would need a custom tokenizer — confirm before re-adding.




In [None]:
# Run the evaluation over the loaded test set; every item triggers one Pinecone
# query and one watsonx generation request via process_query.
evaluate_pipeline(test_data)