In [1]:
pip list

Package                   Version
------------------------- --------------
absl-py                   2.1.0
aiohappyeyeballs          2.4.0
aiohttp                   3.10.5
aiosignal                 1.3.1
anyio                     4.4.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 2.4.1
astunparse                1.6.3
async-lru                 2.0.4
attrs                     24.2.0
babel                     2.16.0
beautifulsoup4            4.12.3
bleach                    6.1.0
certifi                   2024.7.4
cffi                      1.17.0
charset-normalizer        3.3.2
colorama                  0.4.6
comm                      0.2.2
contourpy                 1.2.1
cycler                    0.12.1
datasets                  2.21.0
debugpy                   1.8.5
decorator                 5.1.1
defusedxml                0.7.1
dill                      0.3.8
executing                 2.0.1
fastjsonschema   

In [2]:
# Load the pretrained tokenizer and seq2seq model for one checkpoint.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single source of truth for the checkpoint name, so the tokenizer and the
# model can never be loaded from two different checkpoints by accident.
MODEL_CHECKPOINT = "google/mt5-small"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
import torch

# Prefer the GPU when one is available, but fall back to the CPU instead of
# raising on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [4]:
# Encode a sample sentence into a batch of token ids (PyTorch tensor).
input_sentence = "This is a test text."
# Put the ids on whatever device the model currently lives on rather than
# hard-coding CUDA, so model and inputs can never end up on different devices.
token_ids = tokenizer.encode(input_sentence, return_tensors="pt").to(model.device)
token_ids

tensor([[1494,  339,  259,  262, 2978, 7461,  260,    1]], device='cuda:0')

In [5]:
# Run generation on the encoded sample and print the raw output token ids.
model_out = model.generate(inputs=token_ids)
print(model_out)

tensor([[     0, 250099,      1]], device='cuda:0')




In [6]:
# Inspect how the tokenizer splits a sentence that starts with a "<jp>" prefix.
example_input_str = "<jp> This is just a test nbuig."
# Alternative (Japanese) input to try instead:
# example_input_str = 'これは普通のテスト'
input_ids = tokenizer.encode(example_input_str, return_tensors="pt")
print("Input IDs:", input_ids)

# Map the ids of the single encoded sequence back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print("Tokens:", tokens)

Input IDs: tensor([[1042, 3889,  669, 1494,  339, 1627,  259,  262, 2978,  259,  272, 1982,
         1315,  260,    1]])
Tokens: ['▁<', 'jp', '>', '▁This', '▁is', '▁just', '▁', 'a', '▁test', '▁', 'n', 'bu', 'ig', '.', '</s>']


In [7]:
# Debug aid, intentionally disabled: lists the tokenizer vocabulary entries
# ordered by their value (presumably the token id; confirm before relying on it).
# sorted(tokenizer.vocab.items(), key=lambda x: x[1])

In [8]:
from datasets import load_dataset

# Fetch the Spanish-English sentence-pair dataset by its repository id.
ds = load_dataset(path="odunola/spanish-english-pairs")

In [9]:
# Check which container type load_dataset returned.
type(ds)

datasets.dataset_dict.DatasetDict

In [10]:
# Display the dataset's splits with their feature names and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['english', 'spanish'],
        num_rows: 139013
    })
})

In [1]:
import pandas as pd

# Read the same sentence-pair data as a DataFrame straight from its CSV file.
# NOTE(review): the hf:// scheme presumably needs huggingface_hub installed; confirm.
df = pd.read_csv(filepath_or_buffer="hf://datasets/odunola/spanish-english-pairs/data.csv")

In [2]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139013 entries, 0 to 139012
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   english  139013 non-null  object
 1   spanish  139013 non-null  object
dtypes: object(2)
memory usage: 2.1+ MB


In [3]:
# Preview the English side of the sentence pairs.
df["english"]

0                                                       Go.
1                                                       Go.
2                                                       Go.
3                                                       Go.
4                                                       Hi.
                                ...                        
139008    A carbon footprint is the amount of carbon dio...
139009    Since there are usually multiple websites on a...
139010    If you want to sound like a native speaker, yo...
139011    It may be impossible to get a completely error...
139012    One day, I woke up to find that God had put ha...
Name: english, Length: 139013, dtype: object

In [6]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the sentence pairs for testing; the fixed seed keeps the
# split reproducible across runs. English is the input, Spanish the target.
X_train, X_test, y_train, y_test = train_test_split(
    df["english"],
    df["spanish"],
    random_state=42,
    test_size=0.2,
)

In [11]:
# Inspect the English sentences that ended up in the training split.
X_train

91785                    She's concerned about your safety.
63860                          I'll be there rain or shine.
137495    There are usually never many cars on the road ...
132868    Tom's French improved after he started studyin...
114924           It's getting harder for me to concentrate.
                                ...                        
110268             No matter where you go, I'll follow you.
119879         You can see the setting sun from the window.
103694               He does not have any relatives at all.
131932    Is there something in particular that you want...
121958       His aunt takes care of his dog during the day.
Name: english, Length: 111210, dtype: object