Bug Fix and Test extensions
FBerding committed Mar 2, 2024
1 parent 2a0e6e7 commit 628be2d
Showing 7 changed files with 136 additions and 41 deletions.
78 changes: 63 additions & 15 deletions R/aif_gui.R
@@ -2257,23 +2257,39 @@ start_aifeducation_studio<-function(){
model<-transformers$TFAutoModel$from_pretrained(model_path)
model_architecture<-model$config$architectures
max_position_embeddings=model$config$max_position_embeddings
if(model_architecture=="FunnelForMaskedLM"){
max_layer=sum(model$config$block_repeats*model$config$block_sizes)
} else {
max_layer=model$config$num_hidden_layers
}
} else if(file.exists(paste0(model_path,
"/",
"pytorch_model.bin"))){
model<-transformers$AutoModel$from_pretrained(model_path)
model_architecture<-model$config$architectures
max_position_embeddings=model$config$max_position_embeddings
if(model_architecture=="FunnelForMaskedLM"){
max_layer=sum(model$config$block_repeats*model$config$block_sizes)
} else {
max_layer=model$config$num_hidden_layers
}
} else if(file.exists(paste0(model_path,
"/",
"model.safetensors"))){
model<-transformers$AutoModel$from_pretrained(model_path)
model_architecture<-model$config$architectures
max_position_embeddings=model$config$max_position_embeddings
if(model_architecture=="FunnelForMaskedLM"){
max_layer=sum(model$config$block_repeats*model$config$block_sizes)
} else {
max_layer=model$config$num_hidden_layers
}
} else {
model_architecture=NULL
max_position_embeddings=NULL
max_layer=NULL
}
return(list(model_architecture,max_position_embeddings))
return(list(model_architecture,max_position_embeddings,max_layer))
})

shiny::observe({
@@ -2290,6 +2306,16 @@ start_aifeducation_studio<-function(){

output$lm_interface_setting<-shiny::renderUI({
if(length(interface_architecture()[[2]])>0){

max_layer_transformer=interface_architecture()[[3]]
print(interface_architecture()[[1]])
if(interface_architecture()[[1]]=="FunnelForMaskedLM"|
interface_architecture()[[1]]=="FunnelModel"){
pool_type_choices=c("cls")
} else {
pool_type_choices=c("average","cls")
}

ui<-shinydashboard::box(title = "Interface Setting",
width = 12,
solidHeader = TRUE,
@@ -2324,13 +2350,19 @@ start_aifeducation_studio<-function(){
min = 0,
max= interface_architecture()[[2]],
step = 1),
shiny::selectInput(inputId = "lm_aggregation",
label = "Aggregation Hidden States",
choices = c("last",
"second_to_last",
"fourth_to_last",
"all",
"last_four"))
shiny::sliderInput(inputId = "lm_emb_layers",
label = "Layers for Embeddings",
value=c(
max(1,floor(0.5*max_layer_transformer)),
max(1,floor(2/3*max_layer_transformer))),
min=1,
max=max_layer_transformer,
step=1),
shiny::selectInput(inputId = "lm_emb_pool_type",
label=paste("Pooling Type"),
choices=pool_type_choices,
multiple=FALSE
),
)
)
)
@@ -2374,15 +2406,20 @@ start_aifeducation_studio<-function(){
shiny::observeEvent(input$lm_save_interface,{
model_architecture=interface_architecture()[1]
print(model_architecture)
if(model_architecture=="BertForMaskedLM"){
if(model_architecture=="BertForMaskedLM"|
model_architecture=="BertModel"){
method="bert"
} else if(model_architecture=="FunnelForMaskedLM"){
} else if(model_architecture=="FunnelForMaskedLM"|
model_architecture=="FunnelModel"){
method="funnel"
} else if(model_architecture=="LongformerForMaskedLM"){
} else if(model_architecture=="LongformerForMaskedLM"|
model_architecture=="LongformerModel"){
method="longformer"
} else if(model_architecture=="RobertaForMaskedLM"){
} else if(model_architecture=="RobertaForMaskedLM"|
model_architecture=="RobertaModel"){
method="roberta"
} else if(model_architecture=="DebertaV2ForMaskedLM"){
} else if(model_architecture=="DebertaV2ForMaskedLM"|
model_architecture=="DebertaV2Model"){
method="deberta_v2"
}

@@ -2407,7 +2444,9 @@ start_aifeducation_studio<-function(){
max_length = input$lm_max_length,
overlap = input$lm_overlap,
chunks = input$lm_chunks,
aggregation = input$lm_aggregation,
emb_layer_min=input$lm_emb_layers[1],
emb_layer_max=input$lm_emb_layers[2],
emb_pool_type=input$lm_emb_pool_type,
ml_framework = input$config_ml_framework,
model_dir = model_path_interface_LM(),
method = method)
@@ -2764,7 +2803,16 @@ start_aifeducation_studio<-function(){
shiny::tags$p("Token Overlap: ",model$get_transformer_components()$overlap),
shiny::tags$p("Max Tokens: ",(model$get_model_info()$model_max_size-model$get_transformer_components()$overlap)
*model$get_transformer_components()$chunks+model$get_model_info()$model_max_size),
shiny::tags$p("Hidden States Aggregation: ",model$get_transformer_components()$aggregation),
if(!is.null(model$get_transformer_components()$aggregation)){
shiny::tags$p("Hidden States Aggregation: ",model$get_transformer_components()$aggregation)
},
if(!is.null(model$get_transformer_components()$emb_pool_type)){
shiny::tags$div(
shiny::tags$p("Pool Type: ",model$get_transformer_components()$emb_pool_type),
shiny::tags$p("Embedding Layers - Min: ",model$get_transformer_components()$emb_layer_min),
shiny::tags$p("Embedding Layers - Max: ",model$get_transformer_components()$emb_layer_max)
)
},
shiny::tags$h3("Sustainability"),
if(methods::isClass(Class="data.frame",where = model$get_sustainability_data())){
if(is.na(model$get_sustainability_data()[1,1])==FALSE){
25 changes: 17 additions & 8 deletions R/text_embedding_model.R
@@ -221,6 +221,9 @@ TextEmbeddingModel<-R6::R6Class(
stop("ml_framework must be 'tensorflow' or 'pytorch'.")
}
}
if(method=="funnel" & emb_pool_type!="cls"){
stop("Funnel currently supports only cls as pooling type.")
}

#------------------------------------------------------------------------
private$r_package_versions$aifeducation<-packageVersion("aifeducation")
@@ -1218,6 +1221,10 @@ TextEmbeddingModel<-R6::R6Class(
private$transformer_components$chunks,
n_layer_size))

#Selecting the relevant layers
selected_layer=private$transformer_components$emb_layer_min:private$transformer_components$emb_layer_max
tmp_selected_layer=1+selected_layer

if(private$transformer_components$ml_framework=="tensorflow"){
#Clear session to ensure enough memory
tf$keras$backend$clear_session()
@@ -1232,7 +1239,7 @@
output_hidden_states=TRUE)$hidden_states
if(private$transformer_components$emb_pool_type=="average"){
#Average Pooling over all tokens
for(i in 1:length(tensor_embeddings)){
for(i in tmp_selected_layer){
tensor_embeddings[i]=list(pooling(
inputs=tensor_embeddings[[as.integer(i)]],
mask=tokens$encodings["attention_mask"]))
@@ -1256,10 +1263,10 @@
token_type_ids=tokens$encodings["token_type_ids"]$to(pytorch_device),
output_hidden_states=TRUE)$hidden_states
)
print(tensor_embeddings)
#print(tensor_embeddings)
if(private$transformer_components$emb_pool_type=="average"){
#Average Pooling over all tokens
for(i in 1:length(tensor_embeddings)){
for(i in tmp_selected_layer){
tensor_embeddings[i]=list(pooling(tensor_embeddings[[as.integer(i)]]))
}
}
@@ -1270,8 +1277,7 @@
}


#Selecting the relevant layers
selected_layer=private$transformer_components$emb_layer_min:private$transformer_components$emb_layer_max


#if(private$transformer_components$aggregation=="last"){
# selected_layer=private$transformer_components$model$config$num_hidden_layers
@@ -1288,7 +1294,6 @@
#Sorting the hidden states to the corresponding cases and times
#If more than one layer is selected the mean is calculated
index=0
tmp_selected_layer=1+selected_layer
for(i in 1:length(batch)){
for(j in 1:tokens$chunks[i]){
for(layer in tmp_selected_layer){
@@ -1694,10 +1699,14 @@ TextEmbeddingModel<-R6::R6Class(
get_transformer_components=function(){
return(
list(
aggregation=private$transformer_components$aggregation,
chunks=private$transformer_components$chunks,
overlap=private$transformer_components$overlap,
ml_framework=private$transformer_components$ml_framework
ml_framework=private$transformer_components$ml_framework,

emb_layer_min=private$transformer_components$emb_layer_min,
emb_layer_max=private$transformer_components$emb_layer_max,
emb_pool_type=private$transformer_components$emb_pool_type

)
)
},
2 changes: 1 addition & 1 deletion inst/python/pytorch_te_classifier.py
@@ -234,7 +234,7 @@ def __init__(self,features, times, hidden, rec, intermediate_size,
layer_list.update({"rec_dropout_"+str(i+1):torch.nn.Dropout(p=rec_dropout)})

if n_rec>0 or repeat_encoder>0:
layer_list.update({"global_average_pooling":GlobalAveragePooling1D_PT(sequence_length=times)})
layer_list.update({"global_average_pooling":GlobalAveragePooling1D_PT()})

if(n_rec>0):
current_size_2=2*rec[len(rec)-1]
2 changes: 2 additions & 0 deletions tests/testthat/test-02_basic_text_rep.R
@@ -1,4 +1,6 @@

testthat::skip()

path="test_data/gvc_lda/vocab_draft_movie_review.rda"
testthat::skip_if_not(condition=file.exists(testthat::test_path(path)),
message = "Necessary dataset not available")
39 changes: 29 additions & 10 deletions tests/testthat/test-04_transformer_models.R
@@ -12,7 +12,7 @@ transformers$utils$logging$set_verbosity_error()
os$environ$setdefault("TOKENIZERS_PARALLELISM","false")
set_config_tf_logger("ERROR")
set_config_os_environ_logger("ERROR")
transformers$utils$logging$disable_progress_bar()
transformers$logging$disable_progress_bar()

if(dir.exists(testthat::test_path("test_artefacts"))==FALSE){
dir.create(testthat::test_path("test_artefacts"))
@@ -374,7 +374,7 @@ for(ai_method in ai_methods){
sustain_interval = 15,
trace=FALSE,
keras_trace = 0))

Sys.sleep(2)
expect_no_error(
train_tune_bert_model(ml_framework = framework,
output_dir=testthat::test_path(paste0(path_01,"/",framework)),
@@ -461,6 +461,7 @@ for(ai_method in ai_methods){
sustain_interval = 15,
trace=FALSE,
keras_trace = 0))
Sys.sleep(2)
expect_no_error(
train_tune_funnel_model(ml_framework = framework,
output_dir=testthat::test_path(paste0(path_01,"/",framework)),
@@ -506,6 +507,7 @@ for(ai_method in ai_methods){
keras_trace = 0,
trace=FALSE))

Sys.sleep(2)
expect_no_error(
train_tune_deberta_v2_model(
ml_framework = framework,
@@ -532,11 +534,16 @@

#Embedding of the Model-------------------------------------------------------

pooling_types=c("cls","average")
if(ai_method=="funnel"){
pooling_types=c("cls")
} else {
pooling_types=c("cls","average")
}
max_layers=1:2

for(pooling_type in pooling_types){
for(max_layer in max_layers){
for(min_layer in 1:max_layer)
bert_modeling<-TextEmbeddingModel$new(
model_name=paste0(ai_method,"_embedding"),
model_label=paste0("Text Embedding via",ai_method),
@@ -547,17 +554,32 @@
max_length = 20,
chunks=4,
overlap=10,
emb_layer_min = 1,
emb_layer_min = min_layer,
emb_layer_max = max_layer,
emb_pool_type = pooling_type,
model_dir=testthat::test_path(paste0(path_01,"/",framework))
)

test_that(paste0(ai_method,"embedding",framework,"get_transformer_components"),{
expect_equal(bert_modeling$get_transformer_components()$emb_layer_min,min_layer)
expect_equal(bert_modeling$get_transformer_components()$emb_layer_max,max_layer)
expect_equal(bert_modeling$get_transformer_components()$emb_pool_type,pooling_type)
})

test_that(paste0(ai_method,"embedding",framework,"for loading"), {
embeddings<-bert_modeling$embed(raw_text = example_data$text[1:10],
doc_id = example_data$id[1:10])
expect_s3_class(embeddings, class="EmbeddedText")


perm=sample(x=1:10,size = 10,replace = FALSE)
embeddings_perm<-bert_modeling$embed(raw_text = example_data$text[perm],
doc_id = example_data$id[perm])
for(i in 1:10){
expect_equal(embeddings$embeddings[i,,],embeddings_perm$embeddings[which(perm==i),,])
#diff=sum(embeddings[i,,])-sum()
}

embeddings<-NULL
embeddings<-bert_modeling$embed(raw_text = example_data$text[1:1],
doc_id = example_data$id[1:1])
@@ -567,10 +589,6 @@
}
}


embeddings<-bert_modeling$embed(raw_text = "This is a test",
doc_id = "test_01")

model_name=bert_modeling$get_model_info()$model_name
model_name_root=bert_modeling$get_model_info()$model_name_root

@@ -605,6 +623,8 @@

})



test_that(paste0(ai_method,"encoding",framework), {
encodings<-bert_modeling$encode(raw_text = example_data$text[1:10],
token_encodings_only = TRUE,
@@ -917,8 +937,7 @@ for(ai_method in ai_methods){
)
)

tmp<-test$get_transformer_components()[[4]]

tmp<-test$get_transformer_components()[["ml_framework"]]
expect_equal(tmp,framework)

}
15 changes: 12 additions & 3 deletions vignettes/classification_tasks.Rmd
@@ -751,7 +751,9 @@ bert_modeling<-TextEmbeddingModel$new(
max_length = 512,
chunks=4,
overlap=30,
aggregation="last",
emb_layer_min="middle",
emb_layer_max="2_3_layer",
emb_pool_type="average",
model_dir="my_own_transformer_trained"
)
```
@@ -793,9 +795,16 @@ this example model can analyse a maximum of 512+(4-1)*(512-30)=1958 tokens of a
text.
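
As a quick check of the chunking arithmetic above (the maximum is `max_length + (chunks - 1) * (max_length - overlap)`), the following snippet plugs in the example values; it is an illustration only, not part of the package:

```r
# Illustrative check of the chunking formula using the example values above.
max_length <- 512
chunks <- 4
overlap <- 30
max_tokens <- max_length + (chunks - 1) * (max_length - overlap)
max_tokens
#> [1] 1958
```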

Finally, you have to decide from which hidden layer or layers the embeddings
should be drawn (`aggregation="last"`). In their initial work, Devlin et al. (2019) used
should be drawn. With `emb_layer_min` and `emb_layer_max` you can decide over which layers
the average value for every token should be calculated. Please note that the calculation
considers all layers between `emb_layer_min` and `emb_layer_max`.
In their initial work, Devlin et al. (2019) used
the hidden states of different layers for classification.

With `emb_pool_type` you decide which tokens are used for pooling within every layer.
In the case of `emb_pool_type="cls"` only the cls token is used. In the case of
`emb_pool_type="average"` all tokens within a layer are averaged except padding tokens.

After deciding about the configuration, you can use your model.

> **Note:** With version 0.3.1 of aifeducation every transformer can be used with both
@@ -1011,7 +1020,7 @@ layering in order to take the context of all chunks into account. To add self-at
you have two choices:
- You can use the attention mechanism used in classic transformer models as
multihead attention (Vaswani et al. 2017). For this variant you have to set
`attention_type="multihead`, `repeat_encoder` to a value of at least 1, and
`attention_type="multihead"`, `repeat_encoder` to a value of at least 1, and
`self_attention_heads` to a value of at least 1.
- Furthermore, you can use the attention mechanism described in Lee-Thorp et al. (2021)
of the FNet model, which allows much faster computations at low accuracy cost. To use