In [59]:
from flair.embeddings import TransformerDocumentEmbeddings
from transformers import AutoModel
import torch
from flair.embeddings import StackedEmbeddings
from flair.embeddings import TokenEmbeddings
from flair.data import Sentence
from flair.datasets import CSVClassificationCorpus
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from transformers import AutoTokenizer

In [69]:
# init embedding
#embedding = TransformerDocumentEmbeddings("allenai/scibert_scivocab_uncased")

Downloading vocab.txt: 100%|██████████| 223k/223k [00:00<00:00, 620kB/s] 


[Sentence: "The grass is green ."]

In [71]:
from flair.embeddings import SentenceTransformerDocumentEmbeddings
# source: https://huggingface.co/pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb
# init embedding
embedding = SentenceTransformerDocumentEmbeddings("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")

Downloading: 100%|██████████| 1.43k/1.43k [00:00<00:00, 989kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 142kB/s]
Downloading: 100%|██████████| 4.45k/4.45k [00:00<00:00, 3.09MB/s]
Downloading: 100%|██████████| 691/691 [00:00<00:00, 500kB/s]
Downloading: 100%|██████████| 124/124 [00:00<00:00, 92.1kB/s]
Downloading: 100%|██████████| 767/767 [00:00<00:00, 535kB/s]
Downloading: 100%|██████████| 433M/433M [00:06<00:00, 70.1MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 37.1kB/s]
Downloading: 100%|██████████| 300/300 [00:00<00:00, 217kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 83.4kB/s]
Downloading: 100%|██████████| 669k/669k [00:00<00:00, 1.43MB/s]
Downloading: 100%|██████████| 412/412 [00:00<00:00, 308kB/s]
Downloading: 100%|██████████| 213k/213k [00:00<00:00, 576kB/s] 
Downloading: 100%|██████████| 229/229 [00:00<00:00, 165kB/s]


In [7]:
# source: https://huggingface.co/gsarti/scibert-nli
sci_model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

In [66]:
class HuggingFaceEmbedding(torch.nn.Module):
    def __init__(self, model, name):
        super().__init__()
        self.model = model
        self.name = name
        self.embedding_type = "hugging_face_transformer"
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(name)
        self.embedding_length = model.config.hidden_size
        
    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        return outputs[0]
    
    def get_sentence_embedding(self, sentence):
        input_ids = torch.tensor(self.tokenizer.encode(sentence, add_special_tokens=True)).unsqueeze(0) 
        with torch.no_grad():
            outputs = self.model(input_ids)
        return outputs[0][0, 0, :]

bio_model = AutoModel.from_pretrained("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
hf_embedding = HuggingFaceEmbedding(model = bio_model, name = "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
bio_embedding = StackedEmbeddings([hf_embedding])

In [65]:
# Get hugging face models and convert into flair models
class HuggingFaceEmbedding(torch.nn.Module):
    def __init__(self, model, name):
        super().__init__()
        self.model = model
        self.name = name
        self.embedding_type = "hugging_face_transformer"
        self.model.eval()
        self.embedding_length = model.config.hidden_size
        self.tokenizer = AutoTokenizer.from_pretrained(name)
        
    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        return outputs[0]
    
    def flair_embedding(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        return outputs[2][:, 0, :]

bio_model = AutoModel.from_pretrained("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
hf_embedding = HuggingFaceEmbedding(model = bio_model, name = "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
bio_embedding = StackedEmbeddings([hf_embedding])
#sci_model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")
#hf1_embedding = HuggingFaceEmbedding(model = sci_model, name = "SciBertEmbedding")
#sci_embedding = StackedEmbeddings([hf1_embedding])

In [54]:
# combine the two sentence embeddings
combined = [bio_embedding, sci_embedding]
stack = StackedEmbeddings(combined)

In [55]:
# Load tagged corpus
# this is the folder in which train, test and dev files reside
data_folder = r'/students/2022-2023/master/spacey/chiara/Meta-knowledge_GENIA_corpus/corpus/'
#print(data_folder)
column_name_map = {0: "text", 1:"label"}  
label_type = "label" 

corpus = CSVClassificationCorpus(data_folder, 
                                        column_name_map, 
                                        skip_header=True, 
                                        delimiter="|",
                                        train_file= 'train.csv',
                                        dev_file= 'dev.csv',
                                        test_file= 'test.csv', 
                                        label_type=label_type)

print(corpus)

2023-02-04 19:10:37,680 Reading data from /students/2022-2023/master/spacey/chiara/Meta-knowledge_GENIA_corpus/corpus
2023-02-04 19:10:37,681 Train: /students/2022-2023/master/spacey/chiara/Meta-knowledge_GENIA_corpus/corpus/train.csv
2023-02-04 19:10:37,682 Dev: /students/2022-2023/master/spacey/chiara/Meta-knowledge_GENIA_corpus/corpus/dev.csv
2023-02-04 19:10:37,682 Test: /students/2022-2023/master/spacey/chiara/Meta-knowledge_GENIA_corpus/corpus/test.csv
Corpus: 2374 train + 607 dev + 607 test sentences


In [56]:
label_dict = corpus.make_label_dictionary(label_type=label_type)

2023-02-04 19:10:41,504 Computing label dictionary. Progress:


2374it [00:01, 2057.71it/s]

2023-02-04 19:10:42,662 Dictionary created for label 'label' with 4 values: L2 (seen 1068 times), L3 (seen 1000 times), L1 (seen 306 times)





In [67]:
classifier = TextClassifier(bio_embedding, label_dictionary = label_dict, label_type = label_type)

In [68]:
trainer = ModelTrainer(classifier, corpus)

trainer.train('classifier', learning_rate=0.1, mini_batch_size = 32, max_epochs = 4)

2023-02-04 19:24:03,708 ----------------------------------------------------------------------------------------------------
2023-02-04 19:24:03,711 Model: "TextClassifier(
  (decoder): Linear(in_features=768, out_features=4, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
  (locked_dropout): LockedDropout(p=0.0)
  (word_dropout): WordDropout(p=0.0)
  (loss_function): CrossEntropyLoss()
  (document_embeddings): StackedEmbeddings(
    (list_embedding_0): HuggingFaceEmbedding(
      (model): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(28996, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0): BertLayer(
              (attention): BertAttention(
       

AttributeError: 'HuggingFaceEmbedding' object has no attribute 'embed'