In [38]:
from torch.utils.data import Dataset
import os, pickle, torch

class DocumentDataset(Dataset):
    """Map-style PyTorch dataset that reads pre-computed document representations from disk.

    Document ``i`` is stored in ``<ds_loc>/<i>.pickle``. Each file holds several objects
    pickled one after another: every object except the last is a sentence tensor, and the
    last object is the document's annotations.
    """

    def __init__(self, ds_loc):
        """ds_loc : folder containing the raw data (a trailing path separator is optional)."""
        self.loc = ds_loc

    def __len__(self):
        """Return the number of ``*.pickle`` files found in the data folder."""
        # Count only pickle files so stray entries (editor checkpoints, OS metadata, ...)
        # do not inflate the reported length.
        return sum(1 for name in os.listdir(self.loc) if name.endswith(".pickle"))

    def __getitem__(self, index):
        """Load document ``index`` and return ``(sentences, annotations)``.

        ``sentences`` is the stack of all sentence tensors (each with its leading
        dimension squeezed away when that dimension has size 1); ``annotations`` is the
        last object stored in the file, returned as-is.
        """
        # os.path.join works whether or not ``ds_loc`` ends with a separator.
        filename = os.path.join(self.loc, "{}.pickle".format(index))
        doc = []
        # NOTE(security): pickle.load can execute arbitrary code; only point this
        # dataset at files produced by a trusted pipeline.
        with open(filename, "rb") as file:
            # The file is a concatenation of pickles: keep reading until it is exhausted.
            while True:
                try:
                    doc.append(pickle.load(file))
                except EOFError:
                    break
        sentences = torch.stack([tensor.squeeze(0) for tensor in doc[:-1]])
        annotations = doc[-1]
        return sentences, annotations

In [39]:
# Dataset over the pickled document representations stored in this folder.
ds = DocumentDataset("data/camemBERT_representations/")

In [42]:
# Sanity check: shape of the stacked sentence tensor (first element of the returned pair)
# for document 3000.
ds[3000][0].shape

torch.Size([31, 64])

In [5]:
from src.dataset import FactsOrAnalysisDatasetRNN
from transformers import CamembertTokenizer, CamembertModel
import torch

# Pretrained encoder and its matching tokenizer, both loaded from the 'camembert-base' checkpoint.
camembert = CamembertModel.from_pretrained('camembert-base')
tokeniser = CamembertTokenizer.from_pretrained('camembert-base')
# Keep the file path and the loaded dataset under separate names so `dataset` always
# refers to the dataset object, even if this cell is edited or partially re-run.
dataset_path = "data/dataset_docs_facts_non_facts20200311.pickle"
dataset = FactsOrAnalysisDatasetRNN(dataset_path, tokeniser, n_read='all')

# Use the GPU when one is present; otherwise fall back to CPU so the notebook still runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Reduced(torch.nn.Module):
    """Encoder followed by a linear layer that projects its output to a smaller vector.

    Parameters
    ----------
    camembert : torch.nn.Module
        Encoder whose forward pass returns an indexable output; element ``[1]`` is fed
        to the projection (presumably the pooled sentence representation — confirm
        against the installed ``transformers`` version).
    out_features : int, optional
        Size of the projected representation. Defaults to 64.
    """

    def __init__(self, camembert, out_features=64):
        super().__init__()
        self.camembert = camembert
        # Take the projection's input width from the encoder's config when it exposes
        # one, so other checkpoint sizes work too; 768 remains the fallback.
        config = getattr(camembert, "config", None)
        in_features = getattr(config, "hidden_size", 768)
        self.reduce = torch.nn.Linear(in_features=in_features, out_features=out_features)

    def forward(self, x):
        """Encode ``x`` and return the linearly reduced representation.

        ``x`` is passed to the encoder unchanged; no attention mask is supplied.
        """
        x = self.camembert(x)[1]
        x = self.reduce(x)
        return x

# Wrap the pretrained encoder with the linear reduction layer.
# NOTE(review): the `reduce` layer is freshly initialised here and no seed, training or
# weight loading is visible in this notebook — confirm the projection is meant to be random.
model = Reduced(camembert)
# Switch to evaluation mode.
model.eval()
# Move the parameters to the selected device; as the cell's last expression this also
# displays the module's repr.
model.to(device)

Progress:   0%|          | 16/10360 [00:00<01:34, 109.97it/s]

Creating the dataset…


Progress: 100%|██████████| 10360/10360 [01:46<00:00, 97.69it/s] 


Reduced(
  (camembert): CamembertModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, e

In [12]:
# First element of item 1262 (presumably the document's tokenised sentences — confirm
# against FactsOrAnalysisDatasetRNN).
doc = dataset[1262][0]
# Take its first entry and move it to the model's device.
sent = doc[0].to(device)
# Add a leading batch dimension and run the forward pass.
# NOTE(review): the displayed result carries a grad_fn, so autograd is tracking this call;
# wrap it in torch.no_grad() if only inference is intended.
model(sent.unsqueeze(0))

tensor([[-0.0005,  0.0043,  0.0175, -0.1169, -0.0297,  0.0551,  0.0058, -0.0594,
          0.0135, -0.0500,  0.0473,  0.0563, -0.0551,  0.0020,  0.0351,  0.0240,
          0.0163,  0.1602, -0.0492, -0.0840, -0.0257,  0.0662, -0.0040,  0.0511,
          0.1231,  0.0223, -0.0282, -0.0311,  0.0625, -0.0564,  0.0213, -0.0012,
         -0.0786, -0.0252,  0.0932, -0.0345,  0.0447, -0.0844, -0.0652, -0.0240,
         -0.1026,  0.0322, -0.0067,  0.1577,  0.0460, -0.0038, -0.0213,  0.0124,
         -0.0231,  0.0776,  0.0298, -0.0416,  0.0581, -0.0165, -0.0609,  0.0870,
          0.0309, -0.0083, -0.0390, -0.0622,  0.0221, -0.0380,  0.0551, -0.0768]],
       device='cuda:0', grad_fn=<AddmmBackward>)