# Load datasets

In [90]:
import json
import gzip
import os
import pandas as pd
import urllib.request

import copy
import torch
from torch.utils.data import Dataset, DataLoader, IterableDataset

In [91]:
# Display the installed PyTorch version.
torch.__version__

'1.8.0'

In [50]:
# Display the installed pandas version.
pd.__version__

'1.2.4'

In [51]:
# Read the dataset catalogue from the local TSV file. The first column becomes
# the index, and the listed columns get explicit dtypes; 'Size (#Pairs)' is
# kept as text here so it can be cleaned before the integer conversion below.
datasets_table = pd.read_csv(
    'datasets_list.tsv',
    index_col=0,
    sep='\t',
    dtype={
        'Description': str,
        'Size (#Pairs)': str,
        'Performance': float,
        'Download link': str,
        'Source': str,
    },
)

# Strip ',' characters from the size column, then cast to int.
# regex=False spells out that this is a literal replacement, so the result
# does not depend on the pandas version's default for `regex`.
datasets_table['Size (#Pairs)'] = (
    datasets_table['Size (#Pairs)']
    .str.replace(',', '', regex=False)
    .astype(int)
)

# `datasets` is only ever bound to the dict form used by the other cells:
# {index label: {column name: value}}. The DataFrame keeps its own name so
# the same identifier never refers to two different kinds of object.
datasets = datasets_table.to_dict(orient='index')

In [69]:
# Inspect the catalogue entry for a single dataset.
datasets['stackexchange_duplicate_questions_title_title']

{'Description': '(Title, Title) pairs of duplicate questions from StackExchange',
 'Size (#Pairs)': 304525,
 'Performance': 58.47,
 'Download link': 'https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/stackexchange_duplicate_questions_title_title.jsonl.gz',
 'Source': nan}

In [63]:
def download_dataset(url, file_name):
  """Download `url` and write it to the local path `file_name`.

  Args:
    url: Location to fetch; passed unchanged to `urllib.request.urlretrieve`.
    file_name: Destination path. Parent directories that do not exist yet
      are created first, so a fresh checkout without the target folder works.

  Returns:
    None.

  Raises:
    Whatever `os.makedirs` or `urllib.request.urlretrieve` raise (e.g.
    `OSError`, `urllib.error.URLError`); nothing is caught here.
  """
  parent_dir = os.path.dirname(file_name)
  # A bare file name has an empty dirname; os.makedirs('') would raise.
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)
  urllib.request.urlretrieve(url, file_name)

In [81]:
def load_dataset(file_name):
  """Parse a gzip-compressed JSON-lines file into a list.

  Args:
    file_name: Path of the gzip file; each non-blank line must be one JSON
      document.

  Returns:
    A list with one parsed JSON value per non-blank line, in file order.

  Raises:
    OSError: if the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  """
  with gzip.open(file_name, "rb") as f:
    # Iterate the decompressed stream line by line rather than calling
    # read().splitlines(), which holds the whole payload in memory alongside
    # the split copy. Whitespace-only lines (e.g. a trailing empty line) are
    # skipped instead of crashing json.loads.
    return [json.loads(jline) for jline in f if jline.strip()]

In [70]:
# Fetch one dataset from the catalogue into the local ./data folder.
download_dataset(
  url=datasets['stackexchange_duplicate_questions_title_title']['Download link'],
  file_name=os.path.join(
    os.path.abspath('.'),
    'data',
    'stackexchange_duplicate_questions_title_title.json.gz',
  ),
)

## Test

In [82]:
# Parse the downloaded archive completely into an in-memory list.
dataset_test = load_dataset(
  file_name=os.path.join(
    os.path.abspath('.'),
    'data',
    'stackexchange_duplicate_questions_title_title.json.gz',
  ),
)

In [83]:
# Peek at the first parsed record.
dataset_test[0]

['what is the advantage of using the GPU rendering options in Android?',
 'Can anyone explain all these Developer Options?']

In [85]:
# Number of records that were loaded.
len(dataset_test)

304525

In [95]:
# Open the archive as a binary line stream so records can be pulled one at a
# time with next(); the handle stays open until it is closed explicitly.
dataset_itr = gzip.open(
  filename=os.path.join(
    os.path.abspath('.'),
    'data',
    'stackexchange_duplicate_questions_title_title.json.gz',
  ),
  mode="rb",
)

In [108]:
# Take the next line from the stream, decode its two-element JSON array in the
# order this notebook uses (answer first, question second), and show both.
answer, question = json.loads(next(dataset_itr))
print(answer, question, sep='\n')

How can I prove the following sequence converges
Find value of the limit: $\lim_{n\to \infty}\sqrt[n]{1^2+2^2+\cdots+n^2}$


In [None]:
# Release the gzip file handle held by `dataset_itr`.
dataset_itr.close()

## Download all datasets

In [128]:
# Download every dataset in the catalogue into a single directory.
# NOTE(review): this target directory is machine-specific; change it here
# (one place) before running the notebook elsewhere.
ALL_DATASETS_DIR = os.path.abspath('/data/asimouli/sentence_pairs')

for d, dataset_info in datasets.items():
  print('Downloading dataset {}'.format(d))
  download_dataset(
    dataset_info['Download link'],
    # The extension needs its leading dot: without it files were saved as
    # e.g. 'gooaq_pairsjson.gz' instead of 'gooaq_pairs.json.gz'.
    os.path.join(ALL_DATASETS_DIR, d + '.json.gz'))

Downloading dataset stackexchange_title_body_small
Downloading dataset gooaq_pairs
Downloading dataset msmarco-query_passage_negative
Downloading dataset yahoo_answers_title_answer
Downloading dataset stackexchange_duplicate_questions_title_title
Downloading dataset msmarco-query_passage
Downloading dataset eli5_question_answer
Downloading dataset yahoo_answers_title_question
Downloading dataset squad_pairs
Downloading dataset yahoo_answers_question_answer
Downloading dataset NQ-train_pairs
Downloading dataset quora_duplicates
Downloading dataset WikiAnswers_pairs
Downloading dataset stackexchange_duplicate_questions_title-body_title-body
Downloading dataset S2ORC_citation_pairs
Downloading dataset stackexchange_duplicate_questions_body_body
Downloading dataset quora_duplicates_triplets
Downloading dataset AllNLI
Downloading dataset specter_train_triples
Downloading dataset SimpleWiki
Downloading dataset PAQ_pairs
Downloading dataset altlex
Downloading dataset CodeSearchNet
Downloading

## Iterable Dataset

In [125]:
class TextIterator:
    """Iterator that reads a JSON-lines stream in interleaved blocks.

    Each line of `text_iterator` must be a JSON array with two elements,
    unpacked as `(answer, question)`. After `batch_size` samples have been
    produced, the next `batch_size * (num_workers - 1)` lines are discarded
    before reading resumes (presumably those lines belong to the other
    DataLoader workers; the caller is responsible for positioning the stream
    at this worker's first block).

    Args:
        text_iterator: Iterator yielding JSON-encoded lines (str or bytes).
        batch_size: Number of consecutive samples to yield per block.
        num_workers: Total number of readers sharing the stream layout.
        transform: Optional callable applied to each sample dict; its return
            value is what `__next__` returns.
    """

    def __init__(self, text_iterator, batch_size, num_workers, transform=None):
        self.batch_size = batch_size
        self.iter_number = 0  # samples produced so far in the current block
        self.num_workers = num_workers
        self.text_iterator = text_iterator
        self.transform = transform

    def __iter__(self):
        # An iterator must return itself here. Returning the underlying
        # stream made `for sample in it` yield raw, unparsed lines and bypass
        # both the block skipping and the transform.
        return self

    def __next__(self):
        """Return the next sample dict, or raise StopIteration at end of stream."""
        if self.iter_number == self.batch_size:
            # Current block is finished: jump over the lines in between.
            self.iter_number = 0
            for _ in range(self.batch_size * (self.num_workers - 1)):
                next(self.text_iterator)
        self.iter_number += 1
        answer, question = json.loads(next(self.text_iterator))
        # The dict is built from freshly parsed values, so no defensive copy
        # is needed before handing it to the transform.
        sample = {'question': question, 'answer': answer}
        if self.transform:
            sample = self.transform(sample)
        return sample

    def __del__(self):
        # Best-effort cleanup: the attribute is missing if __init__ failed,
        # and plain iterators (lists, generators of lines) may have no close().
        close = getattr(getattr(self, 'text_iterator', None), 'close', None)
        if callable(close):
            close()


class TextSimpleIterator:
    """Iterator that turns every line of a JSON-lines stream into a sample.

    Each line of `text_iterator` must be a JSON array with two elements,
    unpacked as `(answer, question)` and returned as
    `{'question': ..., 'answer': ...}`.

    Args:
        text_iterator: Iterator yielding JSON-encoded lines (str or bytes).
        transform: Optional callable applied to each sample dict; its return
            value is what `__next__` returns.
    """

    def __init__(self, text_iterator, transform=None):
        self.text_iterator = text_iterator
        self.transform = transform

    def __iter__(self):
        # An iterator must return itself here. Returning the underlying
        # stream made `for sample in it` yield raw, unparsed lines and skip
        # the transform.
        return self

    def __next__(self):
        """Return the next sample, or raise StopIteration at end of stream."""
        answer, question = json.loads(next(self.text_iterator))
        sample = {'question': question, 'answer': answer}
        if self.transform:
            sample = self.transform(sample)
        return sample

    def __del__(self):
        # Best-effort cleanup: the attribute is missing if __init__ failed,
        # and plain iterators (lists, generators of lines) may have no close().
        close = getattr(getattr(self, 'text_iterator', None), 'close', None)
        if callable(close):
            close()

class IterableCorpusDataset(IterableDataset):
    """Iterable dataset over a gzip-compressed JSON-lines file.

    Args:
        file_path: Path to the gzip file; opened anew on every `__iter__`.
        batch_size: Block size used to interleave lines between workers.
        num_workers: Number of workers assumed by the interleaving.
            NOTE(review): this is the value passed here, not
            `worker_info.num_workers`; keep it equal to the DataLoader's
            `num_workers` or the blocks will not line up.
        start: Number of leading lines to skip before the first sample.
        transform: Optional callable applied to every sample dict.
    """

    def __init__(self, file_path, batch_size, num_workers, start=0, transform=None):
        self.file_path = file_path
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.start = start
        self.transform = transform

    def __iter__(self):
        """Open the file and return an iterator positioned at the first sample.

        Outside a DataLoader worker this is a `TextSimpleIterator` over every
        line after `start`. Inside worker `i` it is a `TextIterator` that
        additionally starts `batch_size * i` lines further in.
        """
        worker_info = torch.utils.data.get_worker_info()

        lines_to_skip = self.start
        if worker_info is not None:
            lines_to_skip += self.batch_size * worker_info.id

        # Open the file exactly once; the previous version opened it twice in
        # the no-worker case and never closed the first handle.
        dataset_itr = gzip.open(self.file_path, "rb")
        try:
            for _ in range(lines_to_skip):
                # readline() returns b'' at end of file; stop skipping there so
                # a too-large offset yields an empty iterator instead of a
                # StopIteration escaping from __iter__.
                if not dataset_itr.readline():
                    break
        except BaseException:
            # Do not leak the handle if the file turns out to be unreadable.
            dataset_itr.close()
            raise

        if worker_info is None:
            return TextSimpleIterator(dataset_itr, self.transform)
        return TextIterator(dataset_itr, self.batch_size, self.num_workers, self.transform)
          
      

In [126]:
# Build the dataset over the locally downloaded archive and obtain an iterator
# from it directly (no DataLoader involved in this cell).
corpus_dataset = IterableCorpusDataset(
  file_path=os.path.join(
    os.path.abspath('.'),
    'data',
    'stackexchange_duplicate_questions_title_title.json.gz',
  ),
  batch_size=2,
  num_workers=0,
  transform=None,
)
corpus_dataset_itr = iter(corpus_dataset)

In [127]:
# Pull one sample from the dataset iterator.
next(corpus_dataset_itr)

{'question': 'Can anyone explain all these Developer Options?',
 'answer': 'what is the advantage of using the GPU rendering options in Android?'}