In [None]:
import numpy as np
import pandas as pd 

from nltk.tokenize import sent_tokenize

from sklearn.model_selection import train_test_split

from tqdm import tqdm
tqdm.pandas()

In [2]:
# Every input file is read from this directory (relative to the notebook).
path = "../data/"
train = pd.read_parquet(f"{path}train.parquet")

# 80/10/10 split: hold out 20% of the rows, then halve that hold-out into
# an eval part and a test part. Both splits use random_state=1.
train, test = train_test_split(train, test_size=0.2, random_state=1)
# NOTE(review): `eval` shadows the Python builtin of the same name; the name
# is kept so that any existing reference to it keeps working.
eval, test = train_test_split(test, test_size=0.5, random_state=1)

sub = pd.read_csv(f"{path}sample_submission.csv")

In [3]:
# Sanity check: (rows, columns) of the training split, then a two-row preview.
print(train.shape)
train.head(2)

(38751, 8)


Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
18632,623e2aa50f13aece7fb1c60cf555d8c7d290692f679655...,Phân tích kết quả sản xuất (KQSX) về chất lượn...,Để phân tích kết quả sản xuất (KQSX) về chất l...,Phân tích kết quả sản xuất (KQSX) về chất lượn...,model_a,yi-lightning,internlm2_5-20b-chat,Vietnamese
34859,b87079eef09873032f653314cfddd05e7bf77e4e6d0449...,Write a sample of Old Uyghur,Providing a truly representative sample of Old...,"I apologize, but I cannot generate an authenti...",model_a,gemini-1.5-flash-002,claude-3-5-sonnet-20240620,English


In [32]:
import nltk
# Fetch the "punkt_tab" NLTK resource (presumably the data sent_tokenize
# relies on — confirm against the installed NLTK version). When the package
# is already present the call only reports that it is up to date.
nltk.download("punkt_tab")

def get_n_sents(line: str) -> int:
    """Count the sentences that ``sent_tokenize`` finds in ``line``.

    Args:
        line: Text to split. Non-string values (e.g. ``None`` or ``NaN`` from
            a missing DataFrame cell) are treated as containing no sentences.

    Returns:
        Number of sentences, or 0 for a non-string input.
    """
    # Missing cells reach this function as None/NaN when it is applied to a
    # pandas column; the tokenizer cannot handle them, so short-circuit.
    if not isinstance(line, str):
        return 0
    return len(sent_tokenize(line))

def get_sents(line: str) -> list[str]:
    """Split ``line`` into sentences with ``sent_tokenize``.

    Args:
        line: Text to split. Non-string values (e.g. ``None`` or ``NaN`` from
            a missing DataFrame cell) yield an empty list.

    Returns:
        The sentences in their original order, or ``[]`` for a non-string
        input.
    """
    # Missing cells reach this function as None/NaN when it is applied to a
    # pandas column; the tokenizer cannot handle them, so short-circuit.
    if not isinstance(line, str):
        return []
    return sent_tokenize(line)

# Tokenize the three text columns in order (prompt, response_a, response_b);
# each result is a Series whose values are lists of sentences.
prompt_sents, response_a_sents, response_b_sents = (
    train[column].progress_apply(get_sents)
    for column in ("prompt", "response_a", "response_b")
)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/ya.pristalov/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
100%|██████████| 38751/38751 [00:01<00:00, 20317.58it/s]
100%|██████████| 38751/38751 [00:05<00:00, 6673.76it/s]
100%|██████████| 38751/38751 [00:05<00:00, 6689.03it/s]


In [47]:
# A text must contain strictly more sentences than this to be kept.
N_SENTS_THRESHOLD = 1

# Apply the same length filter to all three Series, preserving their order.
_kept = [
    series[series.map(len) > N_SENTS_THRESHOLD]
    for series in (prompt_sents, response_a_sents, response_b_sents)
]
prompt_sents, response_a_sents, response_b_sents = _kept

print(*(len(series) for series in _kept))

# One Series of sentence lists: prompts first, then response_a, then response_b.
sentences = pd.concat(_kept)
print(sentences.shape)

# Flat accumulator; make_sents_plain extends it in place with each sentence list.
plain_sentences = []
def make_sents_plain(sents_list: list[str], plain_sentences: list[str]):
    """Append every item of ``sents_list`` to ``plain_sentences`` in place.

    Nothing is returned; the caller observes the result through the mutated
    ``plain_sentences`` list.
    """
    # Slice assignment at the tail is the documented equivalent of extend().
    plain_sentences[len(plain_sentences):] = sents_list
# Walk the per-text sentence lists in order and pour each into the flat list.
for sents in sentences:
    make_sents_plain(sents, plain_sentences)
print(len(plain_sentences))

18014 34784 34776
(87574,)
1566154


In [48]:
def build_positive_pairs(
        raw_sentences  # : Iterable[list[str]], e.g. a pd.Series of sentence lists
    ) -> list[list]:
    """Build positive (label 1) pairs from consecutive sentences.

    For every sentence list in ``raw_sentences``, each sentence is paired with
    the one that directly follows it in the same list. Lists with fewer than
    two sentences contribute no pairs.

    Args:
        raw_sentences: Any iterable of sentence lists. A ``pd.Series`` whose
            values are lists works, and so does a plain list of lists.

    Returns:
        A list of ``[sentence, next_sentence, 1]`` triples, ordered by input
        list and then by position inside that list.
    """
    # Iterating directly (rather than calling .apply for its side effect)
    # gives the same order for a Series and also accepts plain iterables.
    return [
        [current, following, 1]
        for sent_list in raw_sentences
        for current, following in zip(sent_list, sent_list[1:])
    ]

# Build the label-1 pairs from the filtered sentence lists, report how many
# there are, and preview the first three.
positive_pairs = build_positive_pairs(sentences)
print(len(positive_pairs))
positive_pairs[:3]

1478580


[['For this argument, consequentialism is like kinetic theory of gases.',
  "The point is not that it's wrong and doesn't work (where it should), but that it's not a relevant tool for many purposes.",
  1],
 ["The point is not that it's wrong and doesn't work (where it should), but that it's not a relevant tool for many purposes.",
  'I started giving up on consequentialism when thinking about concepts of alignment like corrigibility and then membranes (respect for autonomy).',
  1],
 ['I started giving up on consequentialism when thinking about concepts of alignment like corrigibility and then membranes (respect for autonomy).',
  "They could in principle be framed as particular preferences, but that doesn't apear to be a natural way of thinking about them, of formulating them more clearly.",
  1]]

In [49]:
# Size of the pool that negative examples are drawn from.
len(plain_sentences)

1566154

In [50]:
# Negative (label 0) pairs: the first sentence of every positive pair is
# matched with a sentence drawn uniformly at random from the flat pool.
NEGATIVE_SAMPLING_SEED = 42      # fixed so the generated dataset is reproducible
MAX_RESAMPLE_ATTEMPTS = 10       # bounds the retry loop below

negative_rng = np.random.default_rng(NEGATIVE_SAMPLING_SEED)

negative_pairs = []
choice_range_max = len(plain_sentences)
n_negative_per_one_positive = 1

for pair in tqdm(positive_pairs):
    start_sentence, true_next = pair[0], pair[1]
    for _ in range(n_negative_per_one_positive):
        candidate = plain_sentences[negative_rng.integers(0, choice_range_max)]
        # A random draw can hit the real continuation (short sentences such
        # as "2." repeat many times in the pool). Labelling that pair 0 would
        # contradict the positive example, so redraw a bounded number of times.
        for _attempt in range(MAX_RESAMPLE_ATTEMPTS):
            if candidate != true_next:
                break
            candidate = plain_sentences[negative_rng.integers(0, choice_range_max)]
        negative_pairs.append([start_sentence, candidate, 0])

  4%|▍         | 58449/1478580 [00:00<00:02, 584473.84it/s]

100%|██████████| 1478580/1478580 [00:02<00:00, 505089.12it/s]


In [51]:
# Eyeball the first ten negative pairs.
negative_pairs[:10]

[['For this argument, consequentialism is like kinetic theory of gases.',
  '2.',
  0],
 ["The point is not that it's wrong and doesn't work (where it should), but that it's not a relevant tool for many purposes.",
  'Questi file sono chiamati template files.',
  0],
 ['I started giving up on consequentialism when thinking about concepts of alignment like corrigibility and then membranes (respect for autonomy).',
  '**Smarty Cache Configuration**: Ensure that Smarty caching is properly configured.',
  0],
 ["They could in principle be framed as particular preferences, but that doesn't apear to be a natural way of thinking about them, of formulating them more clearly.",
  '86.',
  0],
 ['Even in decision theory, with the aim of getting certain outcomes to pass, my current preferred ontology of simulation-structure of things points more towards convincing other computations to move the world in certain ways than towards anticipating their behavior before they decide what it should be the

In [52]:
# Compare class sizes: number of positive pairs vs. number of negative pairs.
len(positive_pairs), len(negative_pairs)

(1478580, 1478580)

In [56]:
# Stack the positive pairs on top of the negative pairs and name the fields.
all_pairs: pd.DataFrame = pd.concat(
    [pd.DataFrame(pairs) for pairs in (positive_pairs, negative_pairs)]
).set_axis(['start', 'next', 'label'], axis=1)

# frac=1.0 draws every row, i.e. a full shuffle; random_state pins the order.
# The index is then renumbered from 0 and the old one discarded.
dataset = (
    all_pairs
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)
dataset

Unnamed: 0,start,next,label
0,FAUX : Lors du dépistage du VIH par les tests ...,3.,0
1,What worked well?,What could you have done better?,1
2,* **Customer Segmentation and Profiling:** Gen...,- The Slaver's speech to the crowd is particul...,0
3,"Luật này quy định về thủ tục, hồ sơ, thẩm quyề...",`using System;`: Dòng này khai báo sử dụng khô...,0
4,Ensure Reproducibility and Ethical Considerati...,8.,0
...,...,...,...
2957155,### Prompt:\nDiscuss the key differences and p...,Their work was physically demanding and often ...,0
2957156,知識渦** (Cerebral Vortex)\n肥大化した脳を高速回転させ、周囲に強烈な空...,U.N. and O.A.S.,0
2957157,Here are 10 (or more) to capture that bashful ...,final now = DateTime.now();\n const exitWar...,0
2957158,Астрид указала на полку с книгами о садоводстве.,**応用力（問3）**\n - **得点**: 8/10\n - **評価**: エ...,0


In [57]:
# Write the shuffled pairs to nsp.csv in the working directory; index=False
# leaves the row index out of the file.
dataset.to_csv('nsp.csv', index=False)