Fix Transformer Trace

Fix Classifier Checking
FBerding · Apr 9, 2024 · f1e5883 · f1e5883
1 parent 7be7686
commit f1e5883
Show file tree

Hide file tree

Showing 24 changed files with 631 additions and 543 deletions.
diff --git a/.Rhistory b/.Rhistory
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -2,9 +2,9 @@
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
-    branches: [main, master, version_0_3_2_dev]
+    branches: [main, master, version_0_3_3_dev]
   pull_request:
-    branches: [main, master, version_0_3_2_dev]
+    branches: [main, master, version_0_3_3_dev]
 
 name: R-CMD-check
 

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: aifeducation
 Title: Artificial Intelligence for Education
-Version: 0.3.2
+Version: 0.3.3
 Authors@R: c(
     person("Berding", "Florian", , "florian.berding@uni-hamburg.de", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-3593-1695")),
@@ -61,7 +61,6 @@ Suggests:
     topicmodels,
     udpipe,
     quanteda,
-    quanteda.textmodels,
     knitr,
     rmarkdown,
     testthat (>= 3.0.0),

diff --git a/NEWS.md b/NEWS.md
@@ -3,6 +3,17 @@ editor_options:
   markdown: 
     wrap: 72
 ---
+# aifeducation 0.3.3
+
+**Graphical User Interface Aifeducation Studio**
+
+- Fixed a bug concerning the ids of .pdf and .csv files. Now the ids are correctly
+  saved within a text collection file.
+
+**Further Changes**
+
+- Removed quanteda.textmodels as a necessary library for testing the package.
+- Added a dataset for testing the package based on Maas et al. (2011).
 
 # aifeducation 0.3.2
 

diff --git a/R/aif_gui.R b/R/aif_gui.R
@@ -93,6 +93,7 @@ start_aifeducation_studio<-function(){
   set_transformers_logger(level="ERROR")
   #Disable tqdm progressbar
   transformers$logging$disable_progress_bar()
+  datasets$disable_progress_bars()
 
   #Start GUI--------------------------------------------------------------------
   options(shiny.reactlog=TRUE)
@@ -1091,7 +1092,7 @@ start_aifeducation_studio<-function(){
       }
       if(input$dp_include_csv==FALSE &
          input$dp_include_pdf==FALSE &
-         input$dp_include_csv==FALSE){
+         input$dp_include_xlsx==FALSE){
         error_list[length(error_list)+1]="No file types selected. Please select
       at least one file type."
       }
@@ -1191,7 +1192,7 @@ start_aifeducation_studio<-function(){
             #File name without extension
             #text_corpus[counter,"id"]=stringi::stri_split_fixed(tmp_document$doc_id,pattern=".")[[1]][1]
             tmp_string=stringr::str_split_fixed(tmp_document$doc_id,pattern="\\.",n=Inf)
-            text_corpus[counter,"id"]=tmp_string[1,ncol(tmp_string)]
+            text_corpus[counter,"id"]=paste0(tmp_string[1,1:(ncol(tmp_string)-1)],collapse = ".")
             text_corpus[counter,"text"]=tmp_document$text
             counter=counter+1
 

diff --git a/R/data.R b/R/data.R
@@ -17,3 +17,25 @@
 #' @format list
 #' @keywords internal
 "test_classifier_sustainability"
+
+#' Stanford Movie Review Dataset
+#'
+#' A \code{data.frame} consisting of a subset of 100 negative and 200 positive
+#' movie reviews from the dataset provided by Maas et al. (2011).
+#' The \code{data.frame} consists of three columns. The first column 'text' stores
+#' the movie review. The second stores the labels (0 = negative, 1 = positive). The
+#' last column stores the id.
+#' The purpose of the data is for illustration in vignettes.
+#'
+#' @docType data
+#' @format data.frame
+#' @keywords internal
+#' @references Maas, A. L., Daly, R. E., Pham, P. T., Huang, D.,
+#' Ng, A. Y., & Potts, C. (2011). Learning Word Vectors for Sentiment
+#' Analysis. In D. Lin, Y. Matsumoto, & R. Mihalcea (Eds.),
+#' Proceedings of the 49th Annual Meeting of the Association for
+#' Computational Linguistics: Human Language Technologies (pp. 142–150).
+#' Association for Computational Linguistics.
+#' https://aclanthology.org/P11-1015
+#'
+"imdb_movie_reviews"
diff --git a/R/te_classifier_neuralnet_model.R b/R/te_classifier_neuralnet_model.R
@@ -1924,7 +1924,16 @@ TextEmbeddingClassifierNeuralNet<-R6::R6Class(
 
       embedding_model_config<-text_embeddings$get_model_info()
       for(check in names(embedding_model_config)){
-        if(embedding_model_config[[check]]!=private$text_embedding_model$model[[check]]){
+        if(!is.null_or_na(embedding_model_config[[check]]) &
+           !is.null_or_na(private$text_embedding_model$model[[check]])){
+          if(embedding_model_config[[check]]!=private$text_embedding_model$model[[check]]){
+            return(FALSE)
+          }
+        } else if (!is.null_or_na(embedding_model_config[[check]]) &
+                   is.null_or_na(private$text_embedding_model$model[[check]])){
+          return(FALSE)
+        } else if (is.null_or_na(embedding_model_config[[check]]) &
+                   !is.null_or_na(private$text_embedding_model$model[[check]])){
           return(FALSE)
         }
       }

diff --git a/R/transformer_bert.R b/R/transformer_bert.R
@@ -836,6 +836,10 @@ train_tune_bert_model=function(ml_framework=aifeducation_config$get_framework(),
       data_collator = data_collator,
       tokenizer = tokenizer)
     trainer$remove_callback(transformers$integrations$CodeCarbonCallback)
+    if(as.logical(pytorch_trace)==FALSE){
+      trainer$remove_callback(transformers$PrinterCallback)
+      trainer$remove_callback(transformers$ProgressCallback)
+    }
 
     #Load Custom Callbacks
 

diff --git a/R/transformer_deberta_v2.R b/R/transformer_deberta_v2.R
@@ -874,6 +874,10 @@ train_tune_deberta_v2_model=function(ml_framework=aifeducation_config$get_framew
       tokenizer = tokenizer
     )
     trainer$remove_callback(transformers$integrations$CodeCarbonCallback)
+    if(as.logical(pytorch_trace)==FALSE){
+      trainer$remove_callback(transformers$PrinterCallback)
+      trainer$remove_callback(transformers$ProgressCallback)
+    }
 
     #Add Callback if Shiny App is running
     if(requireNamespace("shiny") & requireNamespace("shinyWidgets")){

diff --git a/R/transformer_funnel.R b/R/transformer_funnel.R
@@ -850,6 +850,10 @@ train_tune_funnel_model=function(ml_framework=aifeducation_config$get_framework(
       tokenizer = tokenizer
     )
     trainer$remove_callback(transformers$integrations$CodeCarbonCallback)
+    if(as.logical(pytorch_trace)==FALSE){
+      trainer$remove_callback(transformers$PrinterCallback)
+      trainer$remove_callback(transformers$ProgressCallback)
+    }
 
     #Add Callback if Shiny App is running
     if(requireNamespace("shiny") & requireNamespace("shinyWidgets")){

diff --git a/R/transformer_longformer.R b/R/transformer_longformer.R
@@ -784,6 +784,10 @@ train_tune_longformer_model=function(ml_framework=aifeducation_config$get_framew
       tokenizer = tokenizer
     )
     trainer$remove_callback(transformers$integrations$CodeCarbonCallback)
+    if(as.logical(pytorch_trace)==FALSE){
+      trainer$remove_callback(transformers$PrinterCallback)
+      trainer$remove_callback(transformers$ProgressCallback)
+    }
 
     #Add Callback if Shiny App is running
     if(requireNamespace("shiny") & requireNamespace("shinyWidgets")){

diff --git a/R/transformer_roberta.R b/R/transformer_roberta.R
@@ -793,6 +793,10 @@ train_tune_roberta_model=function(ml_framework=aifeducation_config$get_framework
       tokenizer = tokenizer
     )
     trainer$remove_callback(transformers$integrations$CodeCarbonCallback)
+    if(as.logical(pytorch_trace)==FALSE){
+      trainer$remove_callback(transformers$PrinterCallback)
+      trainer$remove_callback(transformers$ProgressCallback)
+    }
 
     #Add Callback if Shiny App is running
     if(requireNamespace("shiny") & requireNamespace("shinyWidgets")){

diff --git a/README.Rmd b/README.Rmd
@@ -21,7 +21,7 @@ knitr::opts_chunk$set(
 <!-- badges: start -->
 
 **GitHub**
-[![](https://img.shields.io/badge/devel%20version-0.3.1-green.svg)](https://github.com/fberding/iotarelr)
+[![](https://img.shields.io/badge/devel%20version-0.3.3-green.svg)](https://github.com/fberding/iotarelr)
 [![R-CMD-check](https://github.com/FBerding/aifeducation/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/FBerding/aifeducation/actions/workflows/R-CMD-check.yaml)
 **CRAN** [![CRAN
 status](https://www.r-pkg.org/badges/version/aifeducation)](https://CRAN.R-project.org/package=aifeducation)

diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@
 <!-- badges: start -->
 
 **GitHub**
-[![](https://img.shields.io/badge/devel%20version-0.3.1-green.svg)](https://github.com/fberding/iotarelr)
+[![](https://img.shields.io/badge/devel%20version-0.3.3-green.svg)](https://github.com/fberding/iotarelr)
 [![R-CMD-check](https://github.com/FBerding/aifeducation/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/FBerding/aifeducation/actions/workflows/R-CMD-check.yaml)
 **CRAN** [![CRAN
 status](https://www.r-pkg.org/badges/version/aifeducation)](https://CRAN.R-project.org/package=aifeducation)
@@ -142,6 +142,8 @@ following table provides more details:
 | Longformer                | Yes     | Yes        | Yes            |
 | Text Embedding Classifier | Yes     | Yes        | No             |
 
+Please note that tensorflow is currently supported up to version 2.15.
+
 ## Classification Tasks
 
 ### Transforming Texts into Numbers

diff --git a/data/imdb_movie_reviews.rda b/data/imdb_movie_reviews.rda
diff --git a/man/get_n_chunks.Rd b/man/get_n_chunks.Rd
diff --git a/man/get_train_test_split.Rd b/man/get_train_test_split.Rd
diff --git a/man/imdb_movie_reviews.Rd b/man/imdb_movie_reviews.Rd
diff --git a/tests/testthat/test-01_vocab_draft.R b/tests/testthat/test-01_vocab_draft.R
@@ -1,15 +1,14 @@
 testthat::skip_on_cran()
+testthat::skip_if_not_installed(pkg="quanteda")
+testthat::skip_if_not_installed(pkg="udpipe")
 
 tmp_path="test_data/language_models/udpipe_models/english-ewt-ud-2.5-191206.udpipe"
 tmp_condition=file.exists(testthat::test_path(tmp_path))
 testthat::skip_if_not(condition=tmp_condition,
                   message = "udpipe language model not available")
 
 test_that("bow_pp_create_vocab_draft", {
-  example_data<-data.frame(
-    id=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$id1,
-    label=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$sentiment)
-  example_data$text<-as.character(quanteda.textmodels::data_corpus_moviereviews)
+  example_data<-imdb_movie_reviews
 
   res<-bow_pp_create_vocab_draft(
     path_language_model=testthat::test_path(tmp_path),

diff --git a/tests/testthat/test-02_basic_text_rep.R b/tests/testthat/test-02_basic_text_rep.R
@@ -1,5 +1,9 @@
 
 testthat::skip_on_os("windows")
+testthat::skip_if_not_installed(pkg="quanteda")
+testthat::skip_if_not_installed(pkg="topicmodels")
+testthat::skip_if_not_installed(pkg="text2vec")
+testthat::skip_if_not_installed(pkg="tidytext")
 
 path="test_data/gvc_lda/vocab_draft_movie_review.rda"
 testthat::skip_if_not(condition=file.exists(testthat::test_path(path)),
@@ -10,10 +14,7 @@ load(testthat::test_path(path))
 #------------------------------------------------------------------------------
 test_that("bow_pp_create_vocab_draft", {
 
-  example_data<-data.frame(
-    id=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$id1,
-    label=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$sentiment)
-  example_data$text<-as.character(quanteda.textmodels::data_corpus_moviereviews)
+  example_data<-imdb_movie_reviews
 
   res<-bow_pp_create_basic_text_rep(
      data=example_data$text[1:100],

diff --git a/tests/testthat/test-03_text_embedding_model_gvc_lda.R b/tests/testthat/test-03_text_embedding_model_gvc_lda.R
@@ -1,3 +1,4 @@
+testthat::skip_on_os("windows")
 path="test_data/gvc_lda/basic_text_rep_movie_reviews.rda"
 testthat::skip_if_not(condition=file.exists(testthat::test_path(path)),
                       message  = "Necessary dataset not available")
@@ -10,13 +11,12 @@ if(dir.exists(testthat::test_path("test_artefacts/tmp_full_models"))==FALSE){
   dir.create(testthat::test_path("test_artefacts/tmp_full_models"))
 }
 
+datasets$disable_progress_bars()
+
 #------------------------------------------------------------------------------
 load(testthat::test_path(path))
 
-example_data<-data.frame(
-  id=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$id2,
-  label=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$sentiment)
-example_data$text<-as.character(quanteda.textmodels::data_corpus_moviereviews)
+example_data<-imdb_movie_reviews
 #------------------------------------------------------------------------------
 
 global_vector_clusters_modeling<-TextEmbeddingModel$new(

diff --git a/tests/testthat/test-04_transformer_models.R b/tests/testthat/test-04_transformer_models.R
@@ -53,10 +53,7 @@ rows_susatainability["longformer"]=2
 rows_susatainability["deberta_v2"]=3
 
 
-example_data<-data.frame(
-  id=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$id1,
-  label=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$sentiment)
-example_data$text<-as.character(quanteda.textmodels::data_corpus_moviereviews)
+example_data<-imdb_movie_reviews
 
 print(check_aif_py_modules())
 

diff --git a/tests/testthat/test-05_classifier_neural_net.R b/tests/testthat/test-05_classifier_neural_net.R
@@ -66,13 +66,10 @@ current_embeddings<-bert_embeddings$clone(deep = TRUE)
 
 for(framework in ml_frameworks){
   for (n_classes in 2:3){
-    example_data<-data.frame(
-      id=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$id2,
-      label=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$sentiment)
-    example_data$text<-as.character(quanteda.textmodels::data_corpus_moviereviews)
-    example_data$label<-as.character(example_data$label)
+    example_data<-imdb_movie_reviews
 
-    rownames(example_data)<-example_data$id
+    rownames(example_data)<-rownames(current_embeddings$embeddings)
+    example_data$id<-rownames(current_embeddings$embeddings)
     example_data<-example_data[intersect(
       rownames(example_data),rownames(current_embeddings$embeddings)),]
 

diff --git a/vignettes/classification_tasks.Rmd b/vignettes/classification_tasks.Rmd
@@ -444,7 +444,7 @@ basic_text_rep<-bow_pp_create_basic_text_rep(
   remove_separators = TRUE,
   split_hyphens = FALSE,
   split_tags = FALSE,
-  language_stopwords="eng",
+  language_stopwords="en",
   use_lemmata = FALSE,
   to_lower=FALSE,
   min_termfreq = NULL,
@@ -585,7 +585,7 @@ directory. The next step is to train your model by calling
 train_tune_bert_model(
   ml_framework=aifeducation_config$get_framework(),
   output_dir = "my_own_transformer_trained",
-  bert_model_dir_path = "my_own_transformer",
+  model_dir_path = "my_own_transformer",
   raw_texts = example_data$text,
   p_mask=0.15,
   whole_word=TRUE,