# Libraries

In [None]:
%%capture
!pip install datasets yandexfreetranslate PySocks pandarallel

In [None]:
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from tqdm.auto import tqdm
from yandexfreetranslate import YandexFreeTranslate
import pandas as pd
from pandarallel import pandarallel
# Start pandarallel with two workers and a progress bar; the translation cells
# in this notebook call `parallel_apply` on their DataFrames.
pandarallel.initialize(progress_bar=True, nb_workers=2)
# Enable tqdm's pandas integration with a custom progress-bar label.
tqdm.pandas(desc='My bar!')

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# MRPC

In [None]:
# Load the 'mrpc' configuration of the 'glue' dataset (train/validation/test splits).
dataset = load_dataset('glue', 'mrpc')

In [None]:
# Show the splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [None]:
from collections import Counter

# Label distribution per split: one Counter printed for each split, in split order.
for split, split_data in dataset.items():
    print(Counter(split_data['label']))


Counter({1: 2474, 0: 1194})
Counter({1: 279, 0: 129})
Counter({1: 1147, 0: 578})


In [None]:
# Materialise each split as a pandas DataFrame (train, validation, test order).
df_train, df_val, df_test = (
    pd.DataFrame(dataset[name]) for name in ('train', 'validation', 'test')
)

In [None]:
# Shared translator client used by `translate`; created with the "ios" API option.
yt = YandexFreeTranslate(api = "ios")
def translate(row, source_lang='en', target_lang='kk'):
    """Translate the sentence pair of a single row using the module-level ``yt`` client.

    Parameters
    ----------
    row : mapping-like
        One record, e.g. the pandas Series handed over by
        ``DataFrame.apply(..., axis=1)`` / ``parallel_apply``. It must provide the
        keys ``'sentence1'`` and ``'sentence2'`` and a ``copy()`` method.
    source_lang : str, default 'en'
        Language code of the input sentences, forwarded to ``yt.translate``.
    target_lang : str, default 'kk'
        Language code to translate into (Kazakh by default), forwarded to
        ``yt.translate``.

    Returns
    -------
    A copy of ``row`` whose ``'sentence1'`` and ``'sentence2'`` values are replaced
    by their translations; every other field is carried over unchanged.
    """
    # Work on a copy: pandas does not support apply-functions that mutate the
    # object they are given, and the caller's row should stay untouched.
    translated = row.copy()
    for column in ('sentence1', 'sentence2'):
        translated[column] = yt.translate(source_lang, target_lang, row[column])
    return translated

In [None]:
# Translate dataset to Kazakh.
# Each split is translated row by row (axis=1) through `translate`, in its own
# statement, so a split that has already finished stays available if a later
# one fails.
kk_train_df = df_train.parallel_apply(translate, axis=1)
kk_val_df = df_val.parallel_apply(translate, axis=1)
kk_test_df = df_test.parallel_apply(translate, axis=1)

My bar!:   0%|          | 0/10 [00:00<?, ?it/s]

My bar!:   0%|          | 0/10 [00:00<?, ?it/s]

My bar!:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Convert the translated frames back into a DatasetDict and persist it to disk.
kk_mrpc = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (
        ('train', kk_train_df),
        ('validation', kk_val_df),
        ('test', kk_test_df),
    )
})
kk_mrpc.save_to_disk('kk_mrpc')

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 10
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 10
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 10
    })
})

# Quora Question Pairs

In [None]:
# Load the Quora question-pairs dataset.
quora = load_dataset("quora")

Downloading data:   0%|          | 0.00/35.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/404290 [00:00<?, ? examples/s]

In [None]:
from collections import Counter
# Count how many question pairs are flagged as duplicates vs. not.
Counter(quora['train']['is_duplicate'])

Counter({False: 255027, True: 149263})

In [None]:
# Shuffle the train split before sampling.
quora_shuffled = quora['train'].shuffle(seed=42)  # Use a seed for reproducibility

# Select the first 10,000 rows of the shuffled data as the working sample
quora_sampled = quora_shuffled.select(range(10000))

In [None]:
# Flatten the nested Quora records into the same column names MRPC uses.
quora_new = {'sentence1': [], 'sentence2': [], 'label': [], 'idx': []}

for example in quora_sampled:
    pair = example['questions']
    texts = pair['text']
    quora_new['sentence1'].append(texts[0])
    quora_new['sentence2'].append(texts[1])
    # the boolean duplicate flag is stored as an integer label
    quora_new['label'].append(int(example['is_duplicate']))
    quora_new['idx'].append(pair['id'][0])

In [None]:
# Turn the collected column lists into a DataFrame.
quora_df = pd.DataFrame(quora_new)

In [None]:
# Overwrite 'idx' with consecutive positions 0 .. len(quora_df) - 1
quora_df['idx'] = range(len(quora_df))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the sample, then split that hold-out again: 67% of it becomes
# the test set and the remainder the validation set. Both splits use a fixed
# random_state.
quora_train, quora_val_test = train_test_split(quora_df, test_size=0.3, random_state=42)
quora_val, quora_test = train_test_split(quora_val_test, test_size=0.67, random_state=42)
# Shapes of the three resulting frames.
quora_train.shape, quora_val.shape, quora_test.shape

((7000, 4), (989, 4), (2011, 4))

In [None]:
# Translate dataset to Kazakh.
# Each split is translated row by row (axis=1) through `translate`, in its own
# statement, so a split that has already finished stays available if a later
# one fails.
kk_quora_train = quora_train.parallel_apply(translate, axis=1)
kk_quora_val = quora_val.parallel_apply(translate, axis=1)
kk_quora_test = quora_test.parallel_apply(translate, axis=1)

In [None]:
# Convert the translated Quora frames into a DatasetDict and persist it to disk.
kk_quora_10k = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (
        ('train', kk_quora_train),
        ('validation', kk_quora_val),
        ('test', kk_quora_test),
    )
})

kk_quora_10k.save_to_disk('kk_quora_10k')

# Concatenate the two datasets

In [None]:
from datasets import load_from_disk, DatasetDict, concatenate_datasets

In [None]:
# Reload both translated datasets from the directories they were saved to.
kk_mrpc = load_from_disk('kk_mrpc')
kk_quora = load_from_disk('kk_quora_10k')

In [None]:
# Drop the 'idx' column from every split. Only columns that are still present
# are removed, so running this cell a second time is a no-op rather than an
# error about a missing column.
kk_mrpc = DatasetDict({
    split_name: split_data.remove_columns(
        [col for col in ('idx',) if col in split_data.column_names]
    )
    for split_name, split_data in kk_mrpc.items()
})
kk_mrpc

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 1725
    })
})

In [None]:
# Drop the pandas index column and 'idx' from every split. Only columns that are
# still present are removed, so the cell can be re-run and does not depend on
# '__index_level_0__' having been written when the dataset was created.
kk_quora = DatasetDict({
    split_name: split_data.remove_columns(
        [col for col in ('__index_level_0__', 'idx') if col in split_data.column_names]
    )
    for split_name, split_data in kk_quora.items()
})
kk_quora

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 989
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2011
    })
})

In [None]:
# Sentence similarity dataset: for every split, the MRPC rows followed by the
# Quora rows.
ssd = DatasetDict({
    split_name: concatenate_datasets([kk_mrpc[split_name], kk_quora[split_name]])
    for split_name in ('train', 'validation', 'test')
})

ssd

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 10668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 1397
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 3736
    })
})

In [None]:
# Sanity check: print every sentence that contains a newline character,
# going through 'sentence1' and then 'sentence2' within each split.
for split in ssd.keys():
    for column in ('sentence1', 'sentence2'):
        for sent in ssd[split][column]:
            if '\n' in sent:
                print(split, sent)

In [None]:
# Persist the combined dataset to the 'sentence_similarity_dataset' directory.
ssd.save_to_disk('sentence_similarity_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/10668 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1397 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3736 [00:00<?, ? examples/s]

# STS-B

In [None]:
# Load the STS-B dataset from the "SetFit/stsb" repository.
stsb = load_dataset("SetFit/stsb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/417 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/298k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/238k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Rename the text columns so the validation split uses the same sentence column
# names as the other datasets in this notebook.
stsb_val = stsb['validation'].rename_columns({
    'text1': 'sentence1',
    'text2': 'sentence2',
})

In [None]:
# Convert the validation split to a DataFrame so it can go through `parallel_apply`.
stsb_val_df = pd.DataFrame(stsb_val)

In [None]:
# Inspect the English sentence pairs before translation.
stsb_val_df

Unnamed: 0,sentence1,sentence2,label,idx,label_text
0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,5.00,0,
1,A young child is riding a horse.,A child is riding a horse.,4.75,1,
2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,5.00,2,
3,A woman is playing the guitar.,A man is playing guitar.,2.40,3,
4,A woman is playing the flute.,A man is playing a flute.,2.75,4,
...,...,...,...,...,...
1495,Scientists prove there is water on Mars,Has Nasa discovered water on Mars?,2.00,1495,
1496,Pranab stresses need to strive for peace by na...,WTO: India regrets action of developed nations,0.00,1496,
1497,Volkswagen skids into red in wake of pollution...,"Volkswagen's ""gesture of goodwill"" to diesel o...",2.00,1497,
1498,Obama is right: Africa deserves better leadership,Obama waiting for midterm to name attorney gen...,0.00,1498,


In [33]:
# Translate both sentence columns row by row (axis=1) through `translate`.
kk_stsb_val = stsb_val_df.parallel_apply(translate, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=750), Label(value='0 / 750'))), HB…

In [34]:
# Inspect the translated sentence pairs.
kk_stsb_val

Unnamed: 0,sentence1,sentence2,label,idx,label_text
0,Шляпасы бар адам билеп жатыр.,Дулыға киген адам билеп жатыр.,5.00,0,
1,Кішкентай бала атқа мініп жүр.,Бала атқа мініп жүр.,4.75,1,
2,Ер адам тышқанды жыланға тамақтандырып жатыр.,Ер адам тышқанды жыланға тамақтандырып жатыр.,5.00,2,
3,Бір әйел гитарада ойнап жүр.,Ер адам гитарада ойнап жүр.,2.40,3,
4,Флейтада бір әйел ойнап жүр.,Ер адам флейтада ойнайды.,2.75,4,
...,...,...,...,...,...
1495,Ғалымдар Марста су бар екенін дәлелдеді,Nasa Марста су тапты ма?,2.00,1495,
1496,Пранаб ұлттар тарапынан бейбітшілікке ұмтылу қ...,ДСҰ: Үндістан дамыған елдердің әрекеттеріне өк...,0.00,1496,
1497,Фольксваген ластану жанжалына байланысты қызыл...,"Фольксвагеннің дизель отынының иелеріне ""ізгі ...",2.00,1497,
1498,Обама дұрыс айтады: Африка жақсы көшбасшылыққа...,Обама аралық сайлауда бас прокурорды тағайында...,0.00,1498,


In [35]:
# `kk_stsb_val` arrives here as the translated DataFrame and is replaced by a
# DatasetDict with a single 'validation' split. The conversion is guarded so that
# re-running this cell does not try to build a Dataset from an object that has
# already been converted.
if not isinstance(kk_stsb_val, DatasetDict):
    kk_stsb_val = DatasetDict({
        'validation': Dataset.from_pandas(kk_stsb_val)
    })

kk_stsb_val.save_to_disk('kk_stsb_val')

Saving the dataset (0/1 shards):   0%|          | 0/1500 [00:00<?, ? examples/s]

In [36]:
# Pack the saved dataset directory recursively into kk_stsb_val.zip.
!zip -r kk_stsb_val.zip kk_stsb_val

  adding: kk_stsb_val/ (stored 0%)
  adding: kk_stsb_val/dataset_dict.json (stored 0%)
  adding: kk_stsb_val/validation/ (stored 0%)
  adding: kk_stsb_val/validation/state.json (deflated 39%)
  adding: kk_stsb_val/validation/dataset_info.json (deflated 68%)
  adding: kk_stsb_val/validation/data-00000-of-00001.arrow (deflated 71%)
