In [None]:
# Install the sentence-transformers library
!pip install -q sentence-transformers

# Import necessary libraries
import getpass
import json
import os
import time
import uuid

import pandas as pd
import requests


# Load the Flickr8k dataset containing the best captions into a Pandas DataFrame.
# Later cells read its 'best_caption' column as the English source text.
# NOTE(review): the /content/drive path presumably needs Google Drive to be
# mounted first; no mount call is visible in this notebook -- confirm.
flickr8k = pd.read_csv('/content/drive/MyDrive/MMAD/flickr8k_best_captions.csv')

# Display the first few rows of the dataset to ensure it loaded correctly
flickr8k.head()


In [None]:
# Your API subscription key for Microsoft Translator.
# The key is read from the AZURE_TRANSLATOR_KEY environment variable and, if
# that is unset, requested interactively, so the secret is never stored in the
# notebook and the cell still runs on a fresh kernel.
key = os.environ.get("AZURE_TRANSLATOR_KEY") or getpass.getpass("Azure Translator subscription key: ")

# Endpoint for the Microsoft Translator Text API
endpoint = "https://api.cognitive.microsofttranslator.com"

# The Azure region where your resource is located
location = "southafricanorth"

# API path for the translation service
path = '/translate'

# Construct the full URL for the API request
constructed_url = endpoint + path

# List of African language codes to which English text will be translated
# African Languages:
#   1. Afrikaans: af
#   2. Amharic: am
#   3. Kinyarwanda: rw
#   4. Igbo: ig
#   5. Hausa: ha
#   6. Lingala: ln
#   7. Luganda: lug
#   8. Somali: so
#   9. Zulu: zu
#  10. Shona: sn
#  11. Chichewa (Nyanja): nya
#  12. Rundi (Kirundi): run
#  13. Sesotho: st
#  14. Northern Sotho (Sepedi): nso
#  15. Tswana: tn
#  16. Xhosa: xh
#  17. Yoruba: yo

african_languages_code = [
    "af", "am", "rw", "ig", "ha", "ln", "lug", "so",
    "zu", "sn", "nya", "run", "st", "nso", "tn",
    "xh", "yo"
]

# Parameters for the API request.
# 'to' starts out as the full list of target codes; the translation loop later
# in the notebook supplies a single code per request.
params = {
    'api-version': '3.0',
    'from': 'en',
    'to': african_languages_code
}

# Headers for the API request
headers = {
    'Ocp-Apim-Subscription-Key': key,
    'Ocp-Apim-Subscription-Region': location,
    'Content-type': 'application/json',
    'X-ClientTraceId': str(uuid.uuid4())
}


In [None]:
def _translate_batch(params, texts):
    """Send one batch of strings to the translation endpoint and return the translations."""
    payload = [{'text': item} for item in texts]

    # The trace id is meant to identify a single request, so give every call
    # its own value instead of reusing the one stored in the shared headers.
    request_headers = {**headers, 'X-ClientTraceId': str(uuid.uuid4())}

    request = requests.post(constructed_url, params=params, headers=request_headers, json=payload)
    response = request.json()

    try:
        # A successful response is a list with one entry per input text; the
        # first translation of each entry is the one we keep.
        translated = [item["translations"][0]["text"] for item in response]
    except (TypeError, KeyError, IndexError) as e:
        # Error responses do not have the expected shape.
        raise RuntimeError(f"Unable to parse response: {response!r}") from e

    if len(translated) != len(texts):
        raise RuntimeError(
            f"Expected {len(texts)} translations but received {len(translated)}: {response!r}"
        )

    return translated


def get_batch_translate(params, body):
    """
    Translates a list of texts in batches using the Microsoft Translator Text API.

    Texts are converted to ``str`` and grouped, in order, into batches whose
    combined character count stays within 2500; each batch is sent as one
    request. A single text longer than the limit is sent in a batch of its own.

    Args:
        params (dict): Parameters for the API request, including source and target languages.
        body (iterable): The texts to be translated (e.g. a list or a pandas Series).

    Returns:
        list: The translated texts, one per input text and in the same order.

    Raises:
        RuntimeError: If a response cannot be parsed or does not contain one
            translation per submitted text. Raising here stops the run as soon
            as a batch fails, rather than returning a list that no longer
            lines up with the input.
    """
    char_limit = 2500  # Character budget per request, kept from the original implementation

    translations = []   # Translations gathered so far, in input order
    current_batch = []  # Texts waiting to be sent
    current_length = 0  # Total characters in current_batch

    for text in body:
        text = str(text)
        text_length = len(text)

        # Flush before adding a text that would overflow the budget. An empty
        # batch is never flushed, so an oversized first text does not trigger
        # a request with an empty body.
        if current_batch and current_length + text_length > char_limit:
            translations.extend(_translate_batch(params, current_batch))
            current_batch = []
            current_length = 0

        current_batch.append(text)
        current_length += text_length

    # Send whatever is left over after the loop
    if current_batch:
        translations.extend(_translate_batch(params, current_batch))

    return translations


In [None]:
def pprint(body):
    """
    Print a JSON-serialisable object as indented, key-sorted JSON.

    Args:
        body (dict or list): The object to render.

    Returns:
        None: The formatted text is written to standard output.
    """
    # Render first, then emit: 4-space indent, sorted keys, and non-ASCII
    # characters written as-is instead of \u escapes.
    formatted = json.dumps(
        body,
        indent=4,
        sort_keys=True,
        ensure_ascii=False,
        separators=(',', ': '),
    )
    print(formatted)


In [None]:
# Destination for the forward translations. The file is rewritten after every
# language so a failure late in the run does not discard finished columns.
translated_csv_path = '/content/drive/MyDrive/MMAD/translated_african_captions.csv'

# Create a DataFrame with the original English captions; one
# 'translated_<code>' column is added per target language below.
df = pd.DataFrame(
    {'captions': flickr8k['best_caption']}
)

# Iterate over each African language code
for position, code in enumerate(african_languages_code, start=1):
    print(f"Running translations of captions to {code}\n......")

    # Build the request parameters for this language without mutating the
    # shared `params` dict, so re-running the cell never depends on leftovers.
    response = get_batch_translate({**params, 'to': code}, body=flickr8k['best_caption'])

    # Store the translated captions in a new column of the DataFrame
    df[f'translated_{code}'] = response
    print(f"Translated English captions to {code}")

    # Checkpoint everything translated so far
    df.to_csv(translated_csv_path, index=False)

    # Pause for 2 minutes between languages to avoid hitting the API rate
    # limit; no request follows the last language, so skip the wait there.
    if position < len(african_languages_code):
        time.sleep(120)

# Save the DataFrame with translated captions to a CSV file
print(f"Translated English captions to {len(african_languages_code)} African languages.")
df.to_csv(translated_csv_path, index=False)

In [None]:
# Destination for the back-translations. The file is rewritten after every
# language so a failure late in the run does not discard finished columns.
back_translated_csv_path = '/content/drive/MyDrive/MMAD/back_translated_african_captions.csv'

# Set up parameters for back-translation to English
params_en = {
    'api-version': '3.0',
    'from': '',  # Placeholder; the source language is supplied per request in the loop
    'to': 'en'  # Target language is always English
}

# Create a new DataFrame to store the original captions and back-translations
new_df = pd.DataFrame(
    {'captions': df['captions']}
)

# Iterate over each African language code to back-translate the captions
for position, code in enumerate(african_languages_code, start=1):
    # Perform batch translation from the African language back to English,
    # using a per-language copy of the parameters with the source code filled in
    new_df[f'{code}_to_en'] = get_batch_translate(
        {**params_en, 'from': code},
        body=df[f"translated_{code}"],
    )

    # Checkpoint everything back-translated so far
    new_df.to_csv(back_translated_csv_path, index=False)

    # Pause for 2 minutes between languages to avoid hitting the API rate
    # limit; no request follows the last language, so skip the wait there.
    if position < len(african_languages_code):
        time.sleep(120)

# Save the DataFrame with back-translated captions to a CSV file
new_df.to_csv(back_translated_csv_path, index=False)


In [None]:
# Import necessary modules from the sentence-transformers library
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained SentenceTransformer model by its checkpoint name.
# Later cells call `model.encode` on the original and back-translated captions
# to obtain the sentence embeddings that are compared with cosine similarity.
# NOTE(review): the published model card for this checkpoint reportedly marks
# it as deprecated -- confirm before reusing it for new experiments.
model = SentenceTransformer('bert-base-nli-mean-tokens')


In [None]:
def _to_embedding(sentence):
    """Encode a single caption with the shared `model`, requesting a tensor result."""
    return model.encode(sentence, convert_to_tensor=True)


# Embed the original English captions first; they become the 'captions' column
caption_embeddings = new_df['captions'].apply(_to_embedding)
embd_df = pd.DataFrame({'captions': caption_embeddings})

# Then embed every back-translated column, keeping the '<code>_to_en' names
for code in african_languages_code:
    print(f"Embedding {code} captions")

    column = f'{code}_to_en'
    embd_df[column] = new_df[column].apply(_to_embedding)

# Save the DataFrame with caption embeddings to a CSV file.
# NOTE(review): each cell holds an embedding object, so the CSV presumably
# stores its text representation rather than raw numbers -- confirm this file
# is only kept for inspection and is not meant to be reloaded.
embd_df.to_csv('/content/drive/MyDrive/MMAD/back_translated_african_captions_embd.csv', index=False)


In [None]:
# Collect one similarity column per language, keyed by '<code>_sim', next to
# the original captions kept for reference
cosine_similarities = {'captions': new_df['captions']}

for code in african_languages_code:
    # Walk the original and back-translated embeddings row by row and keep the
    # cosine similarity of each pair as a plain Python float
    cosine_similarities[f'{code}_sim'] = [
        util.cos_sim(original_embedding, back_translated_embedding).item()
        for original_embedding, back_translated_embedding in zip(
            embd_df['captions'], embd_df[f'{code}_to_en']
        )
    ]

# Create a DataFrame from the cosine similarities dictionary
cosine_df = pd.DataFrame(cosine_similarities)

# Save the cosine similarity index to a CSV file
cosine_df.to_csv('/content/drive/MyDrive/MMAD/back_translated_similarity_index.csv', index=False)
