
Merge pull request #4 from FBerding/version_0_3_2_dev
Version 0.3.2
FBerding committed Mar 17, 2024
2 parents 60e55d3 + f81e8d3 commit c749c48
Showing 129 changed files with 1,480 additions and 299 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -28,5 +28,6 @@ cran-comments.md
^doc$
^Meta$
gui_aife_studio.Rmd
classification_tasks.Rmd
intel_power_gadget_log.csv
reference
2 changes: 1 addition & 1 deletion .Rhistory
@@ -1,4 +1,3 @@
#Callback-------------------------------------------------------------------
if use_callback==True:
if bacc_val>best_bacc:
if trace>=1:
@@ -510,3 +509,4 @@ install.packages(c("BH", "brew", "brio", "cli", "coda", "commonmark", "curl", "d
install.packages(c("BH", "brew", "brio", "cli", "coda", "commonmark", "curl", "data.table", "DBI", "desc", "digest", "DT", "e1071", "fansi", "float", "FNN", "glue", "htmlwidgets", "httpuv", "igraph", "ISOcodes", "later", "LiblineaR", "listenv", "lpSolve", "markdown", "Matrix", "MatrixExtra", "mgcv", "MplusAutomation", "pak", "parallelly", "pkgbuild", "pkgload", "processx", "progress", "ps", "psych", "ragg", "Rcpp", "RcppArmadillo", "RCurl", "readODS", "readr", "reticulate", "rJava", "roxygen2", "s2", "sass", "sf", "shinyWidgets", "stringi", "survival", "tensorflow", "tfruns", "tidyr", "topicmodels", "vroom", "withr", "xfun", "yaml", "zip"))
install.packages("brew")
install.packages(c("BH", "brio", "cli", "coda", "commonmark", "curl", "data.table", "DBI", "desc", "digest", "DT", "e1071", "fansi", "float", "FNN", "glue", "htmlwidgets", "httpuv", "igraph", "ISOcodes", "later", "LiblineaR", "listenv", "lpSolve", "markdown", "Matrix", "MatrixExtra", "mgcv", "MplusAutomation", "pak", "parallelly", "pkgbuild", "pkgload", "processx", "progress", "ps", "psych", "ragg", "Rcpp", "RcppArmadillo", "RCurl", "readODS", "readr", "reticulate", "rJava", "roxygen2", "s2", "sass", "sf", "shinyWidgets", "stringi", "survival", "tensorflow", "tfruns", "tidyr", "topicmodels", "vroom", "withr", "xfun", "yaml", "zip"))
devtools::test()
7 changes: 3 additions & 4 deletions .github/workflows/R-CMD-check.yaml
@@ -2,9 +2,9 @@
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master, version_0_3_1_dev]
branches: [main, master, version_0_3_2_dev]
pull_request:
branches: [main, master, version_0_3_1_dev]
branches: [main, master, version_0_3_2_dev]

name: R-CMD-check

@@ -66,11 +66,10 @@ jobs:
conda_install(
packages = c(
"tensorflow-cpu",
"tensorflow-cpu<=2.15",
"torch",
"torcheval",
"safetensors",
"keras",
"accelerate"),
envname = envname,
conda = "auto",
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Type: Package
Package: aifeducation
Title: Artificial Intelligence for Education
Version: 0.3.1
Version: 0.3.2
Authors@R: c(
person("Berding", "Florian", , "florian.berding@uni-hamburg.de", role = c("aut", "cre"),
comment = c(ORCID = "0000-0002-3593-1695")),
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -9,6 +9,7 @@ export(bow_pp_create_basic_text_rep)
export(bow_pp_create_vocab_draft)
export(calc_standard_classification_measures)
export(check_aif_py_modules)
export(clean_pytorch_log_transformers)
export(combine_embeddings)
export(create_bert_model)
export(create_deberta_v2_model)
@@ -20,6 +21,7 @@ export(get_coder_metrics)
export(get_n_chunks)
export(get_synthetic_cases)
export(install_py_modules)
export(is.null_or_na)
export(load_ai_model)
export(matrix_to_array_c)
export(save_ai_model)
42 changes: 41 additions & 1 deletion NEWS.md
@@ -4,6 +4,46 @@ editor_options:
wrap: 72
---

# aifeducation 0.3.2

**TextEmbeddingClassifiers**

- Fixed a bug in GlobalAveragePooling1D_PT. The layer now pools correctly.
**This change affects PyTorch models trained with version 0.3.1.**

**TextEmbeddingModel**

- Replaced the parameter 'aggregation' with three new parameters that allow
explicitly choosing the start and end layers included in the creation of
embeddings. In addition, two pooling options within each layer are now
available ("cls" and "average"); see the sketch after this list.
- Added support for reporting the training and validation loss while training
the corresponding base model.
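
A minimal sketch of how the new parameters could be used when creating a model
interface. Only `emb_layer_min`, `emb_layer_max`, `emb_pool_type`, and the two
pooling choices are taken from this commit; the `TextEmbeddingModel$new()` call
and all other argument names and values are assumptions about a typical
configuration, not the package's confirmed signature.

```r
library(aifeducation)

# Hypothetical configuration; values are illustrative only.
embedding_model <- TextEmbeddingModel$new(
  model_name     = "bert_embedding",   # assumed identifier
  model_label    = "BERT embedding",   # assumed label
  model_language = "english",          # assumed
  method         = "bert",
  ml_framework   = "pytorch",
  max_length     = 512,
  chunks         = 4,
  overlap        = 30,
  model_dir      = "models/my_bert",
  emb_layer_min  = 6,          # first hidden layer included in the embeddings
  emb_layer_max  = 8,          # last hidden layer included in the embeddings
  emb_pool_type  = "average"   # pooling within each layer: "average" or "cls"
)
```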

**Transformer Models**

- Fixed a bug in the creation of all transformer models except funnel. Choosing
the number of layers now works as intended.
- A file 'history.log' is now saved in the model's folder, reporting the
training loss and validation loss for each epoch; see the sketch below.
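
A sketch of inspecting the new log, assuming 'history.log' is a plain-text table
with the same fields the Studio training plot uses (epoch, loss, val_loss); the
exact file format and location are assumptions, as they are not shown here.

```r
# Assumed path and comma-separated format; adjust read.csv() if the log uses
# another separator.
history <- utils::read.csv(file.path("models/my_bert", "history.log"))

plot(history$epoch, history$loss, type = "l", col = "red",
     xlab = "epoch", ylab = "loss")
lines(history$epoch, history$val_loss, col = "blue")
legend("topright", legend = c("train", "validation"),
       col = c("red", "blue"), lty = 1)
```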

**EmbeddedText**

- Changed the process for validating whether EmbeddedTexts are compatible. Only
the model's unique name is now used for validation.
- Added new fields and updated methods to account for the new options for
creating embeddings (layer selection and pooling type).

**Graphical User Interface Aifeducation Studio**

- Adapted the interface according to the changes made in this version.
- Improved the reading of raw texts. Reading now collapses runs of whitespace
characters into a single space and removes hyphenation; see the sketch below.
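
A minimal sketch of the cleaning now applied when raw texts are read, mirroring
the two stringr calls added to R/aif_gui.R further down in this commit; the
helper function name is hypothetical.

```r
library(stringr)

clean_raw_text <- function(text) {
  # Collapse runs of whitespace (including line breaks) into a single space.
  text <- str_replace_all(text, pattern = "[:space:]{1,}", replacement = " ")
  # Remove hyphens that are directly followed by whitespace (de-hyphenation).
  str_replace_all(text, pattern = "-(?=[:space:])", replacement = "")
}

clean_raw_text("A hyphen-  ated   word\nacross  lines")
#> [1] "A hyphen ated word across lines"
```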

**Python Installation**

- Updated the installation routine to account for the new version of keras; see
the sketch below.
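
A sketch of the updated Python environment setup, mirroring the conda_install()
call changed in .github/workflows/R-CMD-check.yaml in this commit; the package
list and the tensorflow-cpu pin come from that workflow, while the environment
name "aifeducation" is an assumption.

```r
reticulate::conda_install(
  packages = c(
    "tensorflow-cpu<=2.15",  # pin taken from the CI workflow in this commit
    "torch",
    "torcheval",
    "safetensors",
    "keras",
    "accelerate"),
  envname = "aifeducation",  # assumed environment name
  conda = "auto"
)
```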


# aifeducation 0.3.1

**Graphical User Interface Aifeducation Studio**
@@ -63,7 +103,7 @@ editor_options:
- Added an argument to 'install_py_modules',
allowing users to choose which machine learning framework should be
installed.
- Updated 'check_aif_py_modules'.
- Updated 'check_aif_py_modules'.

**Further Changes**

159 changes: 138 additions & 21 deletions R/aif_gui.R
@@ -1186,6 +1186,8 @@ start_aifeducation_studio<-function(){
total=n_files,
title = as.character(all_paths[i]))
tmp_document=readtext::readtext(file=all_paths[i])
tmp_document$text=stringr::str_replace_all(tmp_document$text,pattern = "[:space:]{1,}",replacement = " ")
tmp_document$text=stringr::str_replace_all(tmp_document$text,pattern = "-(?=[:space:])",replacement = "")
#File name without extension
#text_corpus[counter,"id"]=stringi::stri_split_fixed(tmp_document$doc_id,pattern=".")[[1]][1]
tmp_string=stringr::str_split_fixed(tmp_document$doc_id,pattern="\\.",n=Inf)
@@ -1751,12 +1753,16 @@
})

train_tune_model_architecture<-shiny::eventReactive(model_path_train_LM(),{
shinyWidgets::show_alert(title="Loading",
text = "Please wait",
type="info",
closeOnClickOutside = FALSE,
showCloseButton = FALSE)
model_path<-model_path_train_LM()
print(model_path)
if(!is.null(model_path) &
!identical(model_path,character(0))){
shinyWidgets::show_alert(title="Loading",
text = "Please wait",
type="info",
closeOnClickOutside = FALSE,
showCloseButton = FALSE)
}
if(!is.null(model_path)){
if(file.exists(paste0(model_path,
"/",
@@ -2257,23 +2263,39 @@ start_aifeducation_studio<-function(){
model<-transformers$TFAutoModel$from_pretrained(model_path)
model_architecture<-model$config$architectures
max_position_embeddings=model$config$max_position_embeddings
if(model_architecture=="FunnelForMaskedLM"){
max_layer=sum(model$config$block_repeats*model$config$block_sizes)
} else {
max_layer=model$config$num_hidden_layers
}
} else if(file.exists(paste0(model_path,
"/",
"pytorch_model.bin"))){
model<-transformers$AutoModel$from_pretrained(model_path)
model_architecture<-model$config$architectures
max_position_embeddings=model$config$max_position_embeddings
if(model_architecture=="FunnelForMaskedLM"){
max_layer=sum(model$config$block_repeats*model$config$block_sizes)
} else {
max_layer=model$config$num_hidden_layers
}
} else if(file.exists(paste0(model_path,
"/",
"model.safetensors"))){
model<-transformers$AutoModel$from_pretrained(model_path)
model_architecture<-model$config$architectures
max_position_embeddings=model$config$max_position_embeddings
if(model_architecture=="FunnelForMaskedLM"){
max_layer=sum(model$config$block_repeats*model$config$block_sizes)
} else {
max_layer=model$config$num_hidden_layers
}
} else {
model_architecture=NULL
max_position_embeddings=NULL
max_layer=NULL
}
return(list(model_architecture,max_position_embeddings))
return(list(model_architecture,max_position_embeddings,max_layer))
})

shiny::observe({
@@ -2290,6 +2312,16 @@ start_aifeducation_studio<-function(){

output$lm_interface_setting<-shiny::renderUI({
if(length(interface_architecture()[[2]])>0){

max_layer_transformer=interface_architecture()[[3]]

if(interface_architecture()[[1]]=="FunnelForMaskedLM"|
interface_architecture()[[1]]=="FunnelModel"){
pool_type_choices=c("cls")
} else {
pool_type_choices=c("average","cls")
}

ui<-shinydashboard::box(title = "Interface Setting",
width = 12,
solidHeader = TRUE,
@@ -2324,13 +2356,19 @@
min = 0,
max= interface_architecture()[[2]],
step = 1),
shiny::selectInput(inputId = "lm_aggregation",
label = "Aggregation Hidden States",
choices = c("last",
"second_to_last",
"fourth_to_last",
"all",
"last_four"))
shiny::sliderInput(inputId = "lm_emb_layers",
label = "Layers for Embeddings",
value=c(
max(1,floor(0.5*max_layer_transformer)),
max(1,floor(2/3*max_layer_transformer))),
min=1,
max=max_layer_transformer,
step=1),
shiny::selectInput(inputId = "lm_emb_pool_type",
label=paste("Pooling Type"),
choices=pool_type_choices,
multiple=FALSE
),
)
)
)
@@ -2372,17 +2410,22 @@

#Create the interface
shiny::observeEvent(input$lm_save_interface,{
model_architecture=interface_architecture()[1]
model_architecture=interface_architecture()[[1]]
print(model_architecture)
if(model_architecture=="BertForMaskedLM"){
if(model_architecture=="BertForMaskedLM"|
model_architecture=="BertModel"){
method="bert"
} else if(model_architecture=="FunnelForMaskedLM"){
} else if(model_architecture=="FunnelForMaskedLM"|
model_architecture=="FunnelModel"){
method="funnel"
} else if(model_architecture=="LongformerForMaskedLM"){
} else if(model_architecture=="LongformerForMaskedLM"|
model_architecture=="LongformerModel"){
method="longformer"
} else if(model_architecture=="RobertaForMaskedLM"){
} else if(model_architecture=="RobertaForMaskedLM"|
model_architecture=="RobertaModel"){
method="roberta"
} else if(model_architecture=="DebertaV2ForMaskedLM"){
} else if(model_architecture=="DebertaV2ForMaskedLM"|
model_architecture=="DebertaV2Model"){
method="deberta_v2"
}

@@ -2407,7 +2450,9 @@
max_length = input$lm_max_length,
overlap = input$lm_overlap,
chunks = input$lm_chunks,
aggregation = input$lm_aggregation,
emb_layer_min=input$lm_emb_layers[1],
emb_layer_max=input$lm_emb_layers[2],
emb_pool_type=input$lm_emb_pool_type,
ml_framework = input$config_ml_framework,
model_dir = model_path_interface_LM(),
method = method)
@@ -2764,7 +2809,16 @@
shiny::tags$p("Token Overlap: ",model$get_transformer_components()$overlap),
shiny::tags$p("Max Tokens: ",(model$get_model_info()$model_max_size-model$get_transformer_components()$overlap)
*model$get_transformer_components()$chunks+model$get_model_info()$model_max_size),
shiny::tags$p("Hidden States Aggregation: ",model$get_transformer_components()$aggregation),
if(!is.null(model$get_transformer_components()$aggregation)){
shiny::tags$p("Hidden States Aggregation: ",model$get_transformer_components()$aggregation)
},
if(!is.null(model$get_transformer_components()$emb_pool_type)){
shiny::tags$div(
shiny::tags$p("Pool Type: ",model$get_transformer_components()$emb_pool_type),
shiny::tags$p("Embedding Layers - Min: ",model$get_transformer_components()$emb_layer_min),
shiny::tags$p("Embedding Layers - Max: ",model$get_transformer_components()$emb_layer_max)
)
},
shiny::tags$h3("Sustainability"),
if(methods::isClass(Class="data.frame",where = model$get_sustainability_data())){
if(is.na(model$get_sustainability_data()[1,1])==FALSE){
@@ -2787,6 +2841,37 @@
)
)

),
#Language Model Training------------------------------------------
shiny::tabPanel("Training",
shiny::fluidRow(
shinydashboard::box(title = "Training",
solidHeader = TRUE,
status = "primary",
width = 12,
shiny::sidebarLayout(
position="right",
sidebarPanel=shiny::sidebarPanel(
shiny::sliderInput(inputId = "lm_performance_text_size",
label = "Text Size",
min = 1,
max = 20,
step = 0.5,
value = 12),
shiny::numericInput(inputId = "lm_performance_y_min",
label = "Y Min",
value = 0),
shiny::numericInput(inputId = "lm_performance_y_max",
label = "Y Max",
value = 20),
),
mainPanel =shiny::mainPanel(
shiny::plotOutput(outputId = "lm_performance_training_loss")
)
)
)
)

),
#Create Text Embeddings---------------------------------------------
shiny::tabPanel("Create Text Embeddings",
@@ -2940,6 +3025,38 @@
}
})

output$lm_performance_training_loss<-shiny::renderPlot({
plot_data=LanguageModel_for_Use()$last_training$history

if(!is.null(plot_data)){
y_min=input$lm_performance_y_min
y_max=input$lm_performance_y_max

val_loss_min=min(plot_data$val_loss)
best_model_epoch=which(x=(plot_data$val_loss)==val_loss_min)

plot<-ggplot2::ggplot(data=plot_data)+
ggplot2::geom_line(ggplot2::aes(x=.data$epoch,y=.data$loss,color="train"))+
ggplot2::geom_line(ggplot2::aes(x=.data$epoch,y=.data$val_loss,color="validation"))+
ggplot2::geom_vline(xintercept = best_model_epoch,
linetype="dashed")

plot=plot+ggplot2::theme_classic()+
ggplot2::ylab("value")+
ggplot2::coord_cartesian(ylim=c(y_min,y_max))+
ggplot2::xlab("epoch")+
ggplot2::scale_color_manual(values = c("train"="red",
"validation"="blue",
"test"="darkgreen"))+
ggplot2::theme(text = ggplot2::element_text(size = input$lm_performance_text_size),
legend.position="bottom")
return(plot)
} else {
return(NULL)
}
},res = 72*2)


#Document Page--------------------------------------------------------------
shinyFiles::shinyDirChoose(input=input,
id="lm_db_select_model_for_documentation",
