In [2]:
!pip install -qqq torch pandas numpy

In [6]:
# Standard libraries
import os
import csv

# Standard data processing libraries
import pandas as pd
import numpy as np

# ML libraries
import torch
from torch.utils.data import Dataset

# Project code
from lib.dataset import SBERTDataset


In [2]:
# Notebook parameters; the trailing "@param" annotations are kept verbatim.
PROJECT_DIR = "./" # @param {type: "string"}
DATASET = "headlines" # @param {type: "string"}

# Directory of the selected dataset, located under data/sem_eval_2016/.
DATA_DIR = os.path.join(
    PROJECT_DIR,
    "data/sem_eval_2016/",
    DATASET,
)

In [3]:
# Integer class index assigned to each type label.
types_map = {
    "EQUI": 7,
    "OPPO": 6,
    "SPE1": 5,
    "SPE2": 4,
    "SIMI": 3,
    "REL": 2,
    "ALIC": 1,
    "NOALI": 0,
}


def types_to_int(types):
    """Translate type labels into their integer class indices.

    Args:
        types: Iterable of label strings; each must be a key of ``types_map``.

    Returns:
        A list holding the mapped integer of every label, in input order.

    Raises:
        KeyError: If a label is not present in ``types_map``.
    """
    return [types_map[label] for label in types]

In [7]:
class SBERTDataset(Dataset):
    """Map-style dataset of ``(x1, x2)`` pairs with a one-hot type label and a score.

    Rows are read from a tab-separated file that must provide the columns
    ``x1``, ``x2``, ``y_type`` and ``y_score``.

    NOTE(review): this notebook also imports a ``SBERTDataset`` from
    ``lib.dataset``; defining the class here rebinds that name.
    """

    def __init__(self, file_path: str):
        """Load the file at ``file_path`` and pre-compute the label tensors.

        Args:
            file_path: Path to a tab-separated file whose header names the
                columns ``x1``, ``x2``, ``y_type`` and ``y_score``.

        Raises:
            KeyError: If a required column is missing or ``y_type`` holds a
                label that is not a key of ``types_map``.
        """
        # keep_default_na=False keeps tokens such as "NA" as literal text, and
        # QUOTE_NONE stops quote characters from being treated as delimiters.
        self._data = pd.read_csv(
            file_path, sep="\t", keep_default_na=False, quoting=csv.QUOTE_NONE
        )

        self._x1 = self._data["x1"].tolist()
        self._x2 = self._data["x2"].tolist()

        self._types = self._get_encoded_types()
        # Convert through a plain list with an explicit dtype so the result
        # does not depend on how torch iterates a pandas Series and an empty
        # column still yields an empty float tensor.
        self._scores = torch.tensor(
            self._data["y_score"].tolist(), dtype=torch.float
        )

    def _get_encoded_types(self):
        """Return the ``y_type`` column as a float one-hot tensor.

        The result has one row per sample and ``len(types_map)`` columns.
        """
        types_as_int = types_to_int(self._data["y_type"].tolist())
        # Explicit int64 dtype: for an empty list torch.tensor would infer a
        # float tensor, which one_hot rejects.
        type_indices = torch.tensor(types_as_int, dtype=torch.long)
        return torch.nn.functional.one_hot(
            type_indices,
            num_classes=len(types_map)
        ).float()

    def __getitem__(self, index):
        """Return ``(x, (type_one_hot, score))`` for the sample at ``index``.

        ``x`` is the tuple ``(x1, x2)`` when either value is a string, because
        ``torch.tensor`` cannot hold strings. For numeric values ``x`` is a
        tensor of shape ``(1, 2)``.
        """
        a, b = self._x1[index], self._x2[index]
        if isinstance(a, str) or isinstance(b, str):
            x = (a, b)
        else:
            x = torch.tensor([
                [a, b]
            ])
        y = (self._types[index], self._scores[index])
        return x, y

    def __len__(self):
        """Return the number of samples (rows of the one-hot type tensor)."""
        return self._types.shape[0]
