Bug Fix and Test extensions
FBerding committed Mar 2, 2024
1 parent 2a0e6e7 commit 628be2d
Showing 7 changed files with 136 additions and 41 deletions.
78 changes: 63 additions & 15 deletions R/aif_gui.R
@@ -2257,23 +2257,39 @@ start_aifeducation_studio<-function(){
model<-transformers$TFAutoModel$from_pretrained(model_path)
model_architecture<-model$config$architectures
max_position_embeddings=model$config$max_position_embeddings
if(model_architecture=="FunnelForMaskedLM"){
max_layer=sum(model$config$block_repeats*model$config$block_sizes)
} else {
max_layer=model$config$num_hidden_layers
}
} else if(file.exists(paste0(model_path,
"/",
"pytorch_model.bin"))){
model<-transformers$AutoModel$from_pretrained(model_path)
model_architecture<-model$config$architectures
max_position_embeddings=model$config$max_position_embeddings
if(model_architecture=="FunnelForMaskedLM"){
max_layer=sum(model$config$block_repeats*model$config$block_sizes)
} else {
max_layer=model$config$num_hidden_layers
}
} else if(file.exists(paste0(model_path,
"/",
"model.safetensors"))){
model<-transformers$AutoModel$from_pretrained(model_path)
model_architecture<-model$config$architectures
max_position_embeddings=model$config$max_position_embeddings
if(model_architecture=="FunnelForMaskedLM"){
max_layer=sum(model$config$block_repeats*model$config$block_sizes)
} else {
max_layer=model$config$num_hidden_layers
}
} else {
model_architecture=NULL
max_position_embeddings=NULL
max_layer=NULL
}
return(list(model_architecture,max_position_embeddings))
return(list(model_architecture,max_position_embeddings,max_layer))
})

shiny::observe({
@@ -2290,6 +2306,16 @@ start_aifeducation_studio<-function(){

output$lm_interface_setting<-shiny::renderUI({
if(length(interface_architecture()[[2]])>0){

max_layer_transformer=interface_architecture()[[3]]
print(interface_architecture()[[1]])
if(interface_architecture()[[1]]=="FunnelForMaskedLM"|
interface_architecture()[[1]]=="FunnelModel"){
pool_type_choices=c("cls")
} else {
pool_type_choices=c("average","cls")
}

ui<-shinydashboard::box(title = "Interface Setting",
width = 12,
solidHeader = TRUE,
@@ -2324,13 +2350,19 @@ start_aifeducation_studio<-function(){
min = 0,
max= interface_architecture()[[2]],
step = 1),
shiny::selectInput(inputId = "lm_aggregation",
label = "Aggregation Hidden States",
choices = c("last",
"second_to_last",
"fourth_to_last",
"all",
"last_four"))
shiny::sliderInput(inputId = "lm_emb_layers",
label = "Layers for Embeddings",
value=c(
max(1,floor(0.5*max_layer_transformer)),
max(1,floor(2/3*max_layer_transformer))),
min=1,
max=max_layer_transformer,
step=1),
shiny::selectInput(inputId = "lm_emb_pool_type",
label=paste("Pooling Type"),
choices=pool_type_choices,
multiple=FALSE
),
)
)
)
@@ -2374,15 +2406,20 @@ start_aifeducation_studio<-function(){
shiny::observeEvent(input$lm_save_interface,{
model_architecture=interface_architecture()[1]
print(model_architecture)
if(model_architecture=="BertForMaskedLM"){
if(model_architecture=="BertForMaskedLM"|
model_architecture=="BertModel"){
method="bert"
} else if(model_architecture=="FunnelForMaskedLM"){
} else if(model_architecture=="FunnelForMaskedLM"|
model_architecture=="FunnelModel"){
method="funnel"
} else if(model_architecture=="LongformerForMaskedLM"){
} else if(model_architecture=="LongformerForMaskedLM"|
model_architecture=="LongformerModel"){
method="longformer"
} else if(model_architecture=="RobertaForMaskedLM"){
} else if(model_architecture=="RobertaForMaskedLM"|
model_architecture=="RobertaModel"){
method="roberta"
} else if(model_architecture=="DebertaV2ForMaskedLM"){
} else if(model_architecture=="DebertaV2ForMaskedLM"|
model_architecture=="DebertaV2Model"){
method="deberta_v2"
}

@@ -2407,7 +2444,9 @@ start_aifeducation_studio<-function(){
max_length = input$lm_max_length,
overlap = input$lm_overlap,
chunks = input$lm_chunks,
aggregation = input$lm_aggregation,
emb_layer_min=input$lm_emb_layers[1],
emb_layer_max=input$lm_emb_layers[2],
emb_pool_type=input$lm_emb_pool_type,
ml_framework = input$config_ml_framework,
model_dir = model_path_interface_LM(),
method = method)
@@ -2764,7 +2803,16 @@ start_aifeducation_studio<-function(){
shiny::tags$p("Token Overlap: ",model$get_transformer_components()$overlap),
shiny::tags$p("Max Tokens: ",(model$get_model_info()$model_max_size-model$get_transformer_components()$overlap)
*model$get_transformer_components()$chunks+model$get_model_info()$model_max_size),
shiny::tags$p("Hidden States Aggregation: ",model$get_transformer_components()$aggregation),
if(!is.null(model$get_transformer_components()$aggregation)){
shiny::tags$p("Hidden States Aggregation: ",model$get_transformer_components()$aggregation)
},
if(!is.null(model$get_transformer_components()$emb_pool_type)){
shiny::tags$div(
shiny::tags$p("Pool Type: ",model$get_transformer_components()$emb_pool_type),
shiny::tags$p("Embedding Layers - Min: ",model$get_transformer_components()$emb_layer_min),
shiny::tags$p("Embedding Layers - Max: ",model$get_transformer_components()$emb_layer_max)
)
},
shiny::tags$h3("Sustainability"),
if(methods::isClass(Class="data.frame",where = model$get_sustainability_data())){
if(is.na(model$get_sustainability_data()[1,1])==FALSE){
25 changes: 17 additions & 8 deletions R/text_embedding_model.R
@@ -221,6 +221,9 @@ TextEmbeddingModel<-R6::R6Class(
stop("ml_framework must be 'tensorflow' or 'pytorch'.")
}
}
if(method=="funnel" & emb_pool_type!="cls"){
stop("Funnel currently supports only cls as pooling type.")
}

#------------------------------------------------------------------------
private$r_package_versions$aifeducation<-packageVersion("aifeducation")
@@ -1218,6 +1221,10 @@ TextEmbeddingModel<-R6::R6Class(
private$transformer_components$chunks,
n_layer_size))

#Selecting the relevant layers
selected_layer=private$transformer_components$emb_layer_min:private$transformer_components$emb_layer_max
tmp_selected_layer=1+selected_layer

if(private$transformer_components$ml_framework=="tensorflow"){
#Clear session to ensure enough memory
tf$keras$backend$clear_session()
@@ -1232,7 +1239,7 @@
output_hidden_states=TRUE)$hidden_states
if(private$transformer_components$emb_pool_type=="average"){
#Average Pooling over all tokens
for(i in 1:length(tensor_embeddings)){
for(i in tmp_selected_layer){
tensor_embeddings[i]=list(pooling(
inputs=tensor_embeddings[[as.integer(i)]],
mask=tokens$encodings["attention_mask"]))
@@ -1256,10 +1263,10 @@
token_type_ids=tokens$encodings["token_type_ids"]$to(pytorch_device),
output_hidden_states=TRUE)$hidden_states
)
print(tensor_embeddings)
#print(tensor_embeddings)
if(private$transformer_components$emb_pool_type=="average"){
#Average Pooling over all tokens
for(i in 1:length(tensor_embeddings)){
for(i in tmp_selected_layer){
tensor_embeddings[i]=list(pooling(tensor_embeddings[[as.integer(i)]]))
}
}
@@ -1270,8 +1277,7 @@
}


#Selecting the relevant layers
selected_layer=private$transformer_components$emb_layer_min:private$transformer_components$emb_layer_max


#if(private$transformer_components$aggregation=="last"){
# selected_layer=private$transformer_components$model$config$num_hidden_layers
@@ -1288,7 +1294,6 @@
#Sorting the hidden states to the corresponding cases and times
#If more than one layer is selected the mean is calculated
index=0
tmp_selected_layer=1+selected_layer
for(i in 1:length(batch)){
for(j in 1:tokens$chunks[i]){
for(layer in tmp_selected_layer){
@@ -1694,10 +1699,14 @@ TextEmbeddingModel<-R6::R6Class(
get_transformer_components=function(){
return(
list(
aggregation=private$transformer_components$aggregation,
chunks=private$transformer_components$chunks,
overlap=private$transformer_components$overlap,
ml_framework=private$transformer_components$ml_framework
ml_framework=private$transformer_components$ml_framework,

emb_layer_min=private$transformer_components$emb_layer_min,
emb_layer_max=private$transformer_components$emb_layer_max,
emb_pool_type=private$transformer_components$emb_pool_type

)
)
},
2 changes: 1 addition & 1 deletion inst/python/pytorch_te_classifier.py
@@ -234,7 +234,7 @@ def __init__(self,features, times, hidden, rec, intermediate_size,
layer_list.update({"rec_dropout_"+str(i+1):torch.nn.Dropout(p=rec_dropout)})

if n_rec>0 or repeat_encoder>0:
layer_list.update({"global_average_pooling":GlobalAveragePooling1D_PT(sequence_length=times)})
layer_list.update({"global_average_pooling":GlobalAveragePooling1D_PT()})

if(n_rec>0):
current_size_2=2*rec[len(rec)-1]
2 changes: 2 additions & 0 deletions tests/testthat/test-02_basic_text_rep.R
@@ -1,4 +1,6 @@

testthat::skip()

path="test_data/gvc_lda/vocab_draft_movie_review.rda"
testthat::skip_if_not(condition=file.exists(testthat::test_path(path)),
message = "Necessary dataset not available")
39 changes: 29 additions & 10 deletions tests/testthat/test-04_transformer_models.R
@@ -12,7 +12,7 @@ transformers$utils$logging$set_verbosity_error()
os$environ$setdefault("TOKENIZERS_PARALLELISM","false")
set_config_tf_logger("ERROR")
set_config_os_environ_logger("ERROR")
transformers$utils$logging$disable_progress_bar()
transformers$logging$disable_progress_bar()

if(dir.exists(testthat::test_path("test_artefacts"))==FALSE){
dir.create(testthat::test_path("test_artefacts"))
@@ -374,7 +374,7 @@ for(ai_method in ai_methods){
sustain_interval = 15,
trace=FALSE,
keras_trace = 0))

Sys.sleep(2)
expect_no_error(
train_tune_bert_model(ml_framework = framework,
output_dir=testthat::test_path(paste0(path_01,"/",framework)),
@@ -461,6 +461,7 @@ for(ai_method in ai_methods){
sustain_interval = 15,
trace=FALSE,
keras_trace = 0))
Sys.sleep(2)
expect_no_error(
train_tune_funnel_model(ml_framework = framework,
output_dir=testthat::test_path(paste0(path_01,"/",framework)),
@@ -506,6 +507,7 @@ for(ai_method in ai_methods){
keras_trace = 0,
trace=FALSE))

Sys.sleep(2)
expect_no_error(
train_tune_deberta_v2_model(
ml_framework = framework,
@@ -532,11 +534,16 @@

#Embedding of the Model-------------------------------------------------------

pooling_types=c("cls","average")
if(ai_method=="funnel"){
pooling_types=c("cls")
} else {
pooling_types=c("cls","average")
}
max_layers=1:2

for(pooling_type in pooling_types){
for(max_layer in max_layers){
for(min_layer in 1:max_layer)
bert_modeling<-TextEmbeddingModel$new(
model_name=paste0(ai_method,"_embedding"),
model_label=paste0("Text Embedding via",ai_method),
@@ -547,17 +554,32 @@
max_length = 20,
chunks=4,
overlap=10,
emb_layer_min = 1,
emb_layer_min = min_layer,
emb_layer_max = max_layer,
emb_pool_type = pooling_type,
model_dir=testthat::test_path(paste0(path_01,"/",framework))
)

test_that(paste0(ai_method,"embedding",framework,"get_transformer_components"),{
expect_equal(bert_modeling$get_transformer_components()$emb_layer_min,min_layer)
expect_equal(bert_modeling$get_transformer_components()$emb_layer_max,max_layer)
expect_equal(bert_modeling$get_transformer_components()$emb_pool_type,pooling_type)
})

test_that(paste0(ai_method,"embedding",framework,"for loading"), {
embeddings<-bert_modeling$embed(raw_text = example_data$text[1:10],
doc_id = example_data$id[1:10])
expect_s3_class(embeddings, class="EmbeddedText")


perm=sample(x=1:10,size = 10,replace = FALSE)
embeddings_perm<-bert_modeling$embed(raw_text = example_data$text[perm],
doc_id = example_data$id[perm])
for(i in 1:10){
expect_equal(embeddings$embeddings[i,,],embeddings_perm$embeddings[which(perm==i),,])
#diff=sum(embeddings[i,,])-sum()
}

embeddings<-NULL
embeddings<-bert_modeling$embed(raw_text = example_data$text[1:1],
doc_id = example_data$id[1:1])
@@ -567,10 +589,6 @@
}
}


embeddings<-bert_modeling$embed(raw_text = "This is a test",
doc_id = "test_01")

model_name=bert_modeling$get_model_info()$model_name
model_name_root=bert_modeling$get_model_info()$model_name_root

@@ -605,6 +623,8 @@

})



test_that(paste0(ai_method,"encoding",framework), {
encodings<-bert_modeling$encode(raw_text = example_data$text[1:10],
token_encodings_only = TRUE,
@@ -917,8 +937,7 @@ for(ai_method in ai_methods){
)
)

tmp<-test$get_transformer_components()[[4]]

tmp<-test$get_transformer_components()[["ml_framework"]]
expect_equal(tmp,framework)

}
15 changes: 12 additions & 3 deletions vignettes/classification_tasks.Rmd
@@ -751,7 +751,9 @@ bert_modeling<-TextEmbeddingModel$new(
max_length = 512,
chunks=4,
overlap=30,
aggregation="last",
emb_layer_min="middle",
emb_layer_max="2_3_layer",
emb_pool_type="average",
model_dir="my_own_transformer_trained"
)
```
@@ -793,9 +795,16 @@ this example model can analyse a maximum of 512+(4-1)*(512-30)=1958 tokens of a
text.
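
As a quick check of the chunking arithmetic above (the maximum is `max_length + (chunks - 1) * (max_length - overlap)`), the following snippet plugs in the example values; it is an illustration only, not part of the package:

```r
# Illustrative check of the chunking formula using the example values above.
max_length <- 512
chunks <- 4
overlap <- 30
max_tokens <- max_length + (chunks - 1) * (max_length - overlap)
max_tokens
#> [1] 1958
```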

Finally, you have to decide from which hidden layer or layers the embeddings
should be drawn (`aggregation="last"`). In their initial work, Devlin et al. (2019) used
should be drawn. With `emb_layer_min` and `emb_layer_max` you can decide over which layers
the average value for every token should be calculated. Please note that the calculation
considers all layers between `emb_layer_min` and `emb_layer_max`.
In their initial work, Devlin et al. (2019) used
the hidden states of different layers for classification.

With `emb_pool_type` you decide which tokens are used for pooling within every layer.
In the case of `emb_pool_type="cls"` only the cls token is used. In the case of
`emb_pool_type="average"` all tokens within a layer are averaged except padding tokens.

After deciding about the configuration, you can use your model.

> **Note:** With version 0.3.1 of aifeducation every transformer can be used with both
@@ -1011,7 +1020,7 @@ layering in order to take the context of all chunks into account. To add self-at
you have two choices:
- You can use the attention mechanism used in classic transformer models as
multihead attention (Vaswani et al. 2017). For this variant you have to set
`attention_type="multihead`, `repeat_encoder` to a value of at least 1, and
`attention_type="multihead"`, `repeat_encoder` to a value of at least 1, and
`self_attention_heads` to a value of at least 1.
- Furthermore, you can use the attention mechanism described in Lee-Thorp et al. (2021)
of the FNet model, which allows much faster computations at low accuracy cost. To use