In [32]:
%load_ext autoreload
%autoreload 2

import json
import pandas as pd
import torch

from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import pytorch_cos_sim

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
filepath = "../../third_party_data/train.tsv"
df = pd.read_csv(filepath, sep="\t")

df_trunc = df.iloc[0:10, :]
all_stt = []
for fname in df.path.values:
    with open(f"../../third_party_data/cv_nl_stt/{fname}.json", "r+") as f:
        stt_out = json.load(f)
        all_stt.append(stt_out['results']['channels'][0]['alternatives'][0]['transcript'])
df_trunc['stt_out'] = pd.Series(all_stt)
df_trunc.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trunc['stt_out'] = pd.Series(all_stt)


Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,locale,segment,stt_out
0,da4b6d09a23e8a83f83fec4e302a82c500d2821c4bb4d4...,common_voice_nl_30382934.mp3,"Een daadwerkelijke keuzevrijheid voor ouderen,...",2,0,,,Nederlands Nederlands,nl,,een daadwerkelijke keuzevrijheid voor ouderen ...
1,da4b6d09a23e8a83f83fec4e302a82c500d2821c4bb4d4...,common_voice_nl_30382935.mp3,Elke kandidaat-lidstaat moet op zijn eigen mer...,2,0,,,Nederlands Nederlands,nl,,elke kandidaat dit staat moet op zijn eigen wo...
2,da4b6d09a23e8a83f83fec4e302a82c500d2821c4bb4d4...,common_voice_nl_30382936.mp3,Het verslag legt sterke nadruk op het nauwe ve...,2,0,,,Nederlands Nederlands,nl,,het verslag legt sterke nadruk op het nauwe ve...
3,da4b6d09a23e8a83f83fec4e302a82c500d2821c4bb4d4...,common_voice_nl_30382937.mp3,Wij openen nu het algemeen debat.,4,0,,,Nederlands Nederlands,nl,,we openen nu het algemeen debat
4,da4b6d09a23e8a83f83fec4e302a82c500d2821c4bb4d4...,common_voice_nl_30382938.mp3,Die fase is gebaseerd op de testcyclus van per...,4,0,,,Nederlands Nederlands,nl,,die fase is gebaseerd op de test van personena...


## Encode sentences using sentence transformers

In [10]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [25]:
# Using a multi-linual model compatible with sentence transformers
model1 = SentenceTransformer("../models/paraphrase-MiniLM-L6-v2", device=device)

In [26]:
for i, sent in enumerate(df_trunc.sentence):
    embed_sent = model1.encode(sent)
    embed_stt = model1.encode(df_trunc.stt_out.iloc[i])
    print(pytorch_cos_sim(embed_sent, embed_stt)[0].cpu())
    break

tensor([0.9802])


## Encode sentences using Transformers

In [19]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


def get_embeddings(sent, tokenizer, model, device='cpu'):
    #Tokenize sentence
    encoded_input = tokenizer(sent, padding=True, truncation=True, max_length=512, return_tensors='pt').to(device)

    #Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    
    #Perform pooling. In this case, mean pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    return sentence_embeddings

In [20]:
#Load AutoModel from huggingface model repository OR locally cached
tokenizer = AutoTokenizer.from_pretrained("../models/robbert")
model2 = AutoModelForMaskedLM.from_pretrained("../models/robbert")
model2 = model2.to(device)

In [21]:
for i, sent in enumerate(df_trunc.sentence):
    embed_rob_sent = get_embeddings(sent, tokenizer, model2, device)
    embed_rob_stt = get_embeddings(sent, tokenizer, model2, device)
    print(pytorch_cos_sim(embed_rob_sent, embed_rob_stt)[0].cpu())
    break


tensor([1.0000])


## Text Generation / Seq2Seq Inferencer
 
Generating text with a prompt. The same setup can be extended to a seq2seq setup for training. Some relevant links are as follows:\
- [MBart](https://huggingface.co/docs/transformers/main/en/multilingual#mbart)
- [Seq2Seq primer: Hugging Face](https://huggingface.co/docs/transformers/main/en/glossary#general-terms)
- [Text Generation](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)

In [61]:
# modelname = "../models/robbert/"
# modelname = "../models/paraphrase-MiniLM-L6-v2/"
modelname = "../models/bert-base-multilingual-uncased/"

In [62]:
gen_tokenizer = AutoTokenizer.from_pretrained(f"{modelname}")
gen_model = AutoModelForMaskedLM.from_pretrained(f"{modelname}", is_decoder=True)
gen_model = gen_model.to(device)

If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.
Some weights of the model checkpoint at ../models/bert-base-multilingual-uncased/ were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [65]:
prompt = "Het verslag legt nadruk op het"
input_ids = gen_tokenizer(prompt, return_tensors="pt").input_ids.to(device)

In [67]:
torch.manual_seed(0)
outputs = gen_model.generate(input_ids, do_sample=True, max_length=30)
gen_tokenizer.batch_decode(outputs, skip_special_tokens=True)

['het verslag legt nadruk op het. toespraak was voor, en niet niet de grond opspraakje, die niet bij de tweede']

In [None]:
'''
The text generation isn't that great but the idea is that given a pretrained LM we can use it out of the box for fine tuning on our desired use case. I foresee following approach:

1. Seq2Seq Approach:
Using a pre-trained encoder (such as any Roberta Based moodel for Dutch) and a pre-trained decoder (any GPT 2 based model for Dutch) set up the problem statement of generating tgt seq (manual annotations) given src seq (stt output)
    1.1 Maybe the tutorial here (https://huggingface.co/docs/transformers/tasks/translation) helps.
'''