In [296]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings; consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [297]:
import pandas as pd
import json

# Data loading and preprocessing

In [298]:
# Column selections handed to pd.read_csv for each raw dataset.

# STS: columns are picked by position and then given these names, so the two
# lists pair up one-to-one.
sts_columns_ix = [
    4,
    5,
    6,
]
sts_columns = [
    'similarity',
    'Sentence1',
    'Sentence2',
]

# MSR: columns are picked by their header names.
msr_columns = [
    '#1 String',
    '#2 String',
    'Quality',
]

# QQP: columns are picked by their header names.
qqp_columns = [
    'question1',
    'question2',
    'is_duplicate',
]

In [319]:
# Read the train/test split of each corpus from its tab-separated file.
# NOTE(review): pandas' default quote handling is in effect; if the raw files
# contain unbalanced double quotes, rows may be merged — confirm against the files.

# STS: no header row, so columns are selected by position and named explicitly.
sts_train, sts_test = (
    pd.read_csv(path, sep="\t", header=None, names=sts_columns, usecols=sts_columns_ix)
    for path in ('sts-train.csv', 'sts-test.csv')
)

# MSR: header row present, columns selected by name.
msr_train, msr_test = (
    pd.read_csv(path, sep='\t', usecols=msr_columns)
    for path in ('msr_paraphrase_train.csv', 'msr_paraphrase_test.csv')
)

# QQP: header row present, columns selected by name.
qqp_train, qqp_test = (
    pd.read_csv(path, sep='\t', usecols=qqp_columns)
    for path in ('qqp-train.tsv', 'qqp-test.tsv')
)

## STS preprocess

In [300]:
# Binarise the similarity score into a string label:
# scores of 3.0 or more become '1', everything else becomes '0'.
sts_train['similar'] = sts_train['similarity'].ge(3.0).astype(int).astype(str)
sts_test['similar'] = sts_test['similarity'].ge(3.0).astype(int).astype(str)

In [301]:
# Discard every row that has a missing value in any column.
sts_train, sts_test = sts_train.dropna(how='any'), sts_test.dropna(how='any')

## MSR preprocess

In [302]:
# Give the MSR frames the same column names the STS frames use
# (Sentence1 / Sentence2 / similar).
msr_train, msr_test = (
    split.rename(columns={'#1 String': 'Sentence1', '#2 String': 'Sentence2', 'Quality': 'similar'})
    for split in (msr_train, msr_test)
)

In [303]:
# Store the MSR label as the strings '1' / '0', the form the training-text
# generator compares against.
#
# Rows with a missing label are dropped *before* the cast: turning a missing
# value into a string can yield the literal 'nan', which a later dropna() no
# longer recognises as missing. Casting through int keeps the text at '1'/'0'
# even when the column was read as float (which would otherwise give '1.0').
msr_train = msr_train.dropna(subset=['similar'])
msr_train = msr_train.assign(similar=msr_train['similar'].astype(int).astype(str))

msr_test = msr_test.dropna(subset=['similar'])
msr_test = msr_test.assign(similar=msr_test['similar'].astype(int).astype(str))

In [304]:
# Drop incomplete rows, then drop rows where either sentence contains a tab.
# NOTE(review): a tab inside a field of a tab-separated file presumably marks a
# mis-parsed row — confirm against the raw files.
#
# Both sentence columns are checked in both splits. Previously the training
# split only checked Sentence2 and the test split only checked Sentence1, so
# such rows could survive in the unchecked column.
msr_train, msr_test = (
    split[
        ~(
            split['Sentence1'].str.contains('\t', regex=False, na=False)
            | split['Sentence2'].str.contains('\t', regex=False, na=False)
        )
    ]
    for split in (msr_train.dropna(), msr_test.dropna())
)

## QQP preprocess

In [305]:
# Rename the QQP columns to the shared Sentence1 / Sentence2 / similar names.
qqp_train, qqp_test = (
    split.rename(columns={'question1': 'Sentence1', 'question2': 'Sentence2', 'is_duplicate': 'similar'})
    for split in (qqp_train, qqp_test)
)

In [306]:
# Balanced subsample of QQP: the same number of rows is drawn for each label
# (2500 per label for train, 500 per label for test).
#
# - Rows with missing values are dropped first, as is done for STS and MSR, so
#   no missing sentence ends up in the sample.
# - DataFrameGroupBy.sample replaces groupby(...).apply(lambda x: x.sample(n)).
#   It keeps every column (including the grouping column) and the original
#   index labels, and avoids apply-on-grouping-columns, which newer pandas
#   versions deprecate.
# - A fixed random_state makes the draw reproducible on a fresh kernel.
qqp_train = qqp_train.dropna().groupby('similar').sample(n=2500, random_state=42)
qqp_test = qqp_test.dropna().groupby('similar').sample(n=500, random_state=42)

In [307]:
# Shuffle the rows (frac=1 returns every row, in random order).
# Seeded so that re-running the notebook yields the same order.
qqp_train = qqp_train.sample(frac=1, random_state=42)
qqp_test = qqp_test.sample(frac=1, random_state=42)

In [308]:
# Remove exact duplicate rows, keeping the first occurrence of each.
qqp_train, qqp_test = (
    split.drop_duplicates(keep='first')
    for split in (qqp_train, qqp_test)
)

In [309]:
# Remove from the test split every row that also appears in the training split.
#
# A merge result carries a fresh 0..k-1 index, so its index cannot be used to
# drop rows from qqp_test: those labels are unrelated to the test rows and may
# not even exist there. Instead the test frame's own index labels are carried
# through the merge as a column and used for the drop.
# Assumes qqp_test index labels are unique — TODO confirm.
qqp_train_test_merge = (
    qqp_test
    .rename_axis('_test_index')
    .reset_index()
    .merge(qqp_train, how='inner', indicator=True)  # joins on the shared columns
)
qqp_test = qqp_test.drop(index=qqp_train_test_merge['_test_index'].unique())

In [310]:
# Store the QQP label as a string, matching the label columns of the other datasets.
for _qqp_split in (qqp_train, qqp_test):
    _qqp_split['similar'] = _qqp_split['similar'].astype(str)
del _qqp_split

## Visualize resulting datasets

In [311]:
# Preview the STS test split as it stands after the preprocessing cells above.
sts_test

Unnamed: 0,similarity,Sentence1,Sentence2,similar
0,2.5,A girl is styling her hair.,A girl is brushing her hair.,0
1,3.6,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,1
2,5.0,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,1
3,4.2,A man is cutting up a cucumber.,A man is slicing a cucumber.,1
4,1.5,A man is playing a harp.,A man is playing a keyboard.,0
...,...,...,...,...
1374,0.0,"Philippines, Canada pledge to further boost re...",Philippines saves 100 after ferry sinks,0
1375,1.0,Israel bars Palestinians from Jerusalem's Old ...,"Two-state solution between Palestinians, Israe...",0
1376,1.0,How much do you know about Secret Service?,Lawmakers from both sides express outrage at S...,0
1377,0.0,Obama Struggles to Soothe Saudi Fears As Iran ...,Myanmar Struggles to Finalize Voter Lists for ...,0


In [312]:
# Preview the MSR test split as it stands after the preprocessing cells above.
msr_test

Unnamed: 0,similar,Sentence1,Sentence2
0,1,"PCCW's chief operating officer, Mike Butcher, ...",Current Chief Operating Officer Mike Butcher a...
1,1,The world's two largest automakers said their ...,Domestic sales at both GM and No. 2 Ford Motor...
2,1,According to the federal Centers for Disease C...,The Centers for Disease Control and Prevention...
3,0,A tropical storm rapidly developed in the Gulf...,A tropical storm rapidly developed in the Gulf...
4,0,The company didn't detail the costs of the rep...,But company officials expect the costs of the ...
...,...,...,...
1690,0,"After Hughes refused to rehire Hernandez, he c...",Hernandez filed an Equal Employment Opportunit...
1691,0,There are 103 Democrats in the Assembly and 47...,Democrats dominate the Assembly while Republic...
1692,0,Bethany Hamilton remained in stable condition ...,"Bethany, who remained in stable condition afte..."
1693,1,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE..."


In [313]:
# Preview the QQP test split as it stands after the preprocessing cells above.
qqp_test

Unnamed: 0,Sentence1,Sentence2,similar
35241,How can I build a website after registering a ...,Do you think I can build something from a doma...,0
16740,What is meant by scale and intensity of turbul...,What modules should I gradually learn and prac...,0
8065,What would a US vs Russia war be like?,How would a war between Russia and the US look...,1
35766,What are some abiotic and biotic factors? What...,What are examples of biotic and abiotic compon...,1
19835,What are some good yoga techniques for weight ...,What are the yoga asanas for weight loss?,1
...,...,...,...
12812,What is the proudest moment?,What is your proudest moment?,1
10558,What are some good ways to get rid of belly fat?,I've gained belly through years of inactivity ...,1
17699,Is Hillary Clinton a habitual liar?,Why is Hillary Clinton a pathological liar?,1
19404,Does Greek mythology need to be capitalized? Why?,"Why is the word ""I"" capitalized?",0


# Generate training data

In [314]:
def gen_train(df):
    """Build one instruction-style training record per row of ``df``.

    Each record is a dict with a single ``"text"`` key. The text is a prompt
    embedding the two sentences, followed by the expected answer
    (``yes`` or ``no``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Sentence1``, ``Sentence2`` and ``similar``.
        A row is answered with ``yes`` when its ``similar`` value is
        numerically equal to 1 (for example ``'1'``, ``1``, ``1.0`` or
        ``'1.0'``). Any other value, including missing or non-numeric ones,
        is answered with ``no``.

    Returns
    -------
    list of dict
        One record per row, in the row order of ``df``; empty for an empty frame.
    """
    def _is_similar(label):
        # Accept the label whether it is stored as text or as a number, so a
        # frame whose labels were not cast to str is not silently labelled "no".
        try:
            return float(label) == 1.0
        except (TypeError, ValueError):
            return False

    records = []
    # Iterate the three needed columns directly rather than building a Series per row.
    for sentence1, sentence2, label in zip(df['Sentence1'], df['Sentence2'], df['similar']):
        answer = 'yes' if _is_similar(label) else 'no'
        prompt = (
            "[INST] <<SYS>>\n"
            "For the given two sentences, classify them as semantically similar with 'yes' or 'no'\n"
            "<</SYS>>\n\n"
            f"Sentence 1:{sentence1}\n"
            f"Sentence 2:{sentence2}\n"
            "Are they semantically similar?:\n"
            f"[/INST]Response:{answer}"
        )
        records.append({"text": prompt})
    return records

In [315]:
def to_jsonl(df, filename):
    """Write the training records for ``df`` to ``<filename>.jsonl``.

    The records come from ``gen_train(df)``. Each one is serialised as a JSON
    object on its own line. The target file is opened in write mode, so any
    existing content is replaced.
    """
    target_path = f'{filename}.jsonl'
    with open(target_path, 'w') as out_file:
        out_file.writelines(
            json.dumps(record) + '\n'
            for record in gen_train(df)
        )

In [316]:
# Export both STS splits; the files written are 'sts-train.jsonl' and 'sts-test.jsonl'.
to_jsonl(df=sts_train, filename='sts-train')
to_jsonl(df=sts_test, filename='sts-test')

In [317]:
# Export both MSR splits; the files written are 'msr-train.jsonl' and 'msr-test.jsonl'.
to_jsonl(df=msr_train, filename='msr-train')
to_jsonl(df=msr_test, filename='msr-test')

In [318]:
# Export both QQP splits; the files written are 'qqp-train.jsonl' and 'qqp-test.jsonl'.
to_jsonl(df=qqp_train, filename='qqp-train')
to_jsonl(df=qqp_test, filename='qqp-test')