Install necessary libraries

In [None]:
!pip install datasets --quiet
!pip install pandas --quiet
!pip install transformers --quiet

In [None]:
from datasets import load_dataset
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer

Load dataset

In [None]:
# Load the "tatsu-lab/alpaca" dataset; later cells read its 'train' split
# (52,002 examples in the run recorded below).
dataset = load_dataset("tatsu-lab/alpaca")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

Convert dataset to DataFrame

In [None]:
# Turn the 'train' split into a pandas DataFrame and keep a separate handle on
# its 'text' column, which the following cell removes from `df`.
train_split = dataset['train']
df = pd.DataFrame(train_split)
df_text = df['text']

Prepare DataFrame for translations

In [None]:
# Remove the 'text' column (still available as `df_text`) and add one empty
# target column per translated field.
# errors='ignore' keeps this cell re-runnable: on a second execution 'text' is
# already gone and a plain drop() would raise KeyError.
df = df.drop(columns=['text'], errors='ignore').assign(
    SeychellesCreole_instruction='',
    SeychellesCreole_input='',
    SeychellesCreole_output='',
)
df_text.head()

0    Below is an instruction that describes a task....
1    Below is an instruction that describes a task....
2    Below is an instruction that describes a task....
3    Below is an instruction that describes a task....
4    Below is an instruction that describes a task....
Name: text, dtype: object

Load the Hugging Face model and tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Checkpoint used for translation; `translate_opus` below reads the global
# `tokenizer` and `model` created here.
# NOTE(review): "en-crs" presumably means English -> Seychelles Creole — confirm on the model card.
model_name = "Helsinki-NLP/opus-mt-en-crs"  # Update with the specific model for your target language
# Both calls fetch the checkpoint files by name (see the download progress output below).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/557k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/922k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Define translation function using OpusMT/Marian model

In [None]:
def translate_opus(input_text, target_lang="crs"):  # Specify target language
    """Translate a single string with the globally loaded OPUS-MT/Marian model.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        input_text: Source text to translate.
        target_lang: Target language code. It is prepended to the source as a
            ``>>code<<`` token only when that token exists in the tokenizer's
            vocabulary, i.e. for checkpoints that support several target
            languages.

    Returns:
        The translated text, or ``''`` when ``input_text`` is empty or blank.
    """
    # Nothing to translate: skip the model call entirely.
    if input_text is None or not str(input_text).strip():
        return ''

    # Only multilingual-target checkpoints define >>lang<< tokens. For a
    # single-pair checkpoint the prefix is not a known token and would be
    # treated as ordinary source text, polluting the translation.
    lang_token = f">>{target_lang}<<"
    if lang_token in tokenizer.get_vocab():
        source_text = f"{lang_token} {input_text}"
    else:
        source_text = input_text

    # truncation=True caps the input at the tokenizer's maximum length instead
    # of passing an over-length sequence to the model; text beyond that limit
    # is dropped and therefore not translated.
    encoded_input = tokenizer(
        source_text, return_tensors="pt", padding=True, truncation=True
    )
    # Generate translated token ids, then decode the single result to text.
    translated_tokens = model.generate(**encoded_input)
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

Calculate total characters in DataFrame

In [None]:
def total_characters_in_dataframe(df):
    """Return the summed length of ``str(value)`` over every cell of ``df``.

    Args:
        df: DataFrame whose cells are measured; each value is converted with
            ``str`` before its length is taken.

    Returns:
        Total number of characters as an ``int`` (``0`` for an empty frame).
    """
    return sum(len(str(cell)) for cell in df.values.ravel())

# Estimate the size of the translation job and its cost. The rate is zero
# because the model used here is free (open source).
price_per_million_chars = 0  # free model (OS)
characters = total_characters_in_dataframe(df)
price = characters / 1_000_000 * price_per_million_chars
print(f"Total characters: {characters}")
print(f"Total price ${price}")

Total characters: 18357013
Total price $0.0


Translate each row in the DataFrame (limited to the first 5 rows for demonstration; adjust the limit as needed)

In [None]:
# For demonstration, limit to the first 5 rows - change if necessary
# (set to None to translate the whole DataFrame).
MAX_ROWS = 5

# Bounding the range up front translates exactly MAX_ROWS rows; a trailing
# `if i >= 5: break` check would let row index 5 (a sixth row) through.
rows_to_translate = len(df) if MAX_ROWS is None else min(MAX_ROWS, len(df))

for i in range(rows_to_translate):
    # Each source column is written to its 'SeychellesCreole_<column>' twin.
    for source_column in ('instruction', 'input', 'output'):
        source_text = df.at[i, source_column]
        # Empty fields stay empty in the target column; no model call is made.
        if source_text != '':
            df.at[i, f'SeychellesCreole_{source_column}'] = translate_opus(source_text)

Display the DataFrame (only first few rows)

In [None]:
# Preview the first rows: source columns next to their SeychellesCreole_* translations.
df.head()

Unnamed: 0,instruction,input,output,SeychellesCreole_instruction,SeychellesCreole_input,SeychellesCreole_output
0,Give three tips for staying healthy.,,1.Eat a balanced diet and make sure to include...,Ganny de baz pour ki ou kapab reste an bonn sa...,,BANN avyon i manz byen e fer sir ki i enkli ba...
1,What are the three primary colors?,,"The three primary colors are red, blue, and ye...",Ki sa trwa kouler prensipal?,,"Sa trwa kouler prensipal i rouz, ble ek zonn."
2,Describe the structure of an atom.,,"An atom is made up of a nucleus, which contain...",Dekrir striktir striktir striktir striktir en ...,,Bokou bonm i ganny fer avek bann zarm ki annan...
3,How can we reduce air pollution?,,There are a number of ways to reduce air pollu...,Lo ki mannyer nou kapab redwir ler polisyon?,,Bann plant i annan plizyer fason pour redwir l...
4,Describe a time when you had to make a difficu...,,I had to make a difficult decision when I was ...,Dekrir en moman kot ou ti bezwen fer en desizy...,,Vi ki mon ti annan en desizyon difisil pour fe...


Save DataFrame to a CSV file

In [None]:
# Write the full frame (source and translation columns) to CSV; index=False
# leaves the row index out of the file.
df.to_csv('SeychelloisCreole.csv', index=False)