From 18f9ad6060b99b9d7c56790670515c557f45be9d Mon Sep 17 00:00:00 2001 From: Florian Berding Date: Mon, 12 Jun 2023 10:49:55 +0200 Subject: [PATCH] Bug Fixes --- .Rbuildignore | 3 +- .Rhistory | 146 ++++++++++++------------ .github/workflows/R-CMD-check.yaml | 10 +- R/aux_fct.R | 101 +++++++++++++--- R/install_and_config.R | 5 +- R/te_classifier_neuralnet_model.R | 12 +- R/text_embedding_model.R | 1 + man/TextEmbeddingClassifierNeuralNet.Rd | 5 +- man/generate_id.Rd | 18 +++ 9 files changed, 200 insertions(+), 101 deletions(-) create mode 100644 man/generate_id.Rd diff --git a/.Rbuildignore b/.Rbuildignore index 49c187e..b078fc7 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,7 +5,8 @@ Trial materials test_model tests/testthat/test_data/language_models -test/testthat/test_data/tmp +test/testthat/test_data/tmp/checkpoints +.h5 .git .gitignore _pkgdown.yml diff --git a/.Rhistory b/.Rhistory index 1f26895..fa5bfee 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,76 +1,3 @@ -num_hidden_layer=12, -num_attention_heads=12, -intermediate_size=3072, -hidden_act="gelu", -hidden_dropout_prob=0.1, -trace=FALSE)) -expect_no_error( -create_roberta_model( -model_dir=testthat::test_path("test_data/roberta"), -vocab_raw_texts=example_data$text, -vocab_size=30522, -add_prefix_space=TRUE, -max_position_embeddings=512, -hidden_size=768, -num_hidden_layer=12, -num_attention_heads=12, -intermediate_size=3072, -hidden_act="gelu", -hidden_dropout_prob=0.1, -trace=FALSE)) -}) -test_that("train_tune_roberta_model", { -example_data<-data.frame( -id=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$id1, -label=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$sentiment) -example_data$text<-as.character(quanteda.textmodels::data_corpus_moviereviews) -expect_no_error( -train_tune_roberta_model(output_dir=testthat::test_path("test_data/roberta"), -model_dir_path=testthat::test_path("test_data/roberta"), -raw_texts= example_data$text[1:25], -p_mask=0.30, -val_size=0.1, -n_epoch=1, -batch_size=1, -chunk_size=512, -n_workers=1, -multi_process=FALSE, -trace=FALSE)) -}) -test_that("train_tune_roberta_model", { -example_data<-data.frame( -id=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$id1, -label=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$sentiment) -example_data$text<-as.character(quanteda.textmodels::data_corpus_moviereviews) -expect_no_error( -train_tune_roberta_model(output_dir=testthat::test_path("test_data/roberta"), -model_dir_path=testthat::test_path("test_data/roberta"), -raw_texts= example_data$text[1:25], -p_mask=0.30, -val_size=0.1, -n_epoch=1, -batch_size=1, -chunk_size=510, -n_workers=1, -multi_process=FALSE, -trace=FALSE)) -}) -test_embedding<-TextEmbeddingModel$new( -model_name = "test", -model_label = "test", -model_version = "0.0.1", -model_language = "german", -model_dir="Trial/Bert_Modelle", -method = "bert", -aggregation = "last", -max_length=256, -chunks = 4, -overlap = 10 -) -test_embeddin$ -test_embedding$transformer_components$model$config$max_length -test_embedding$transformer_components$model$config$max_position_embeddings -devtools::load_all() test_that("train_tune_roberta_model", { example_data<-data.frame( id=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$id1, @@ -510,3 +437,76 @@ devtools::test() devtools::document() devtools::check() reticulate::py_config() +devtools::test() +sample_values=c( +"a","A", +"b","B", +"c","C", +"d","D", +"e","E", +"f","F", +"g","G", +"h","H", +"i","I", +"j","J", +"k","K", 
+"l","L", +"m","M", +"n","N", +"o","O", +"p","P", +"q","Q", +"r","R", +"s","S", +"t","T", +"u","U", +"v","V", +"w","W", +"x","X", +"y","Y", +"z","Z", +seq(from=0,to=9,by=1) +) +sample_values +length=10 +id_suffix=NULL +sample_values=c( +"a","A", +"b","B", +"c","C", +"d","D", +"e","E", +"f","F", +"g","G", +"h","H", +"i","I", +"j","J", +"k","K", +"l","L", +"m","M", +"n","N", +"o","O", +"p","P", +"q","Q", +"r","R", +"s","S", +"t","T", +"u","U", +"v","V", +"w","W", +"x","X", +"y","Y", +"z","Z", +seq(from=0,to=9,by=1) +) +id_suffix=sample( +x=sample_values, +size = length, +replace = TRUE) +id_suffix +id_suffix=paste(id_suffix,collapse = "") +id_suffix +devtools::document() +devtools::document() +devtools::document() +devtools::document() diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 152ae79..c014785 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -61,10 +61,14 @@ jobs: "tensorflow") library(reticulate) - virtualenv_create("r-reticulate", Sys.which("python")) - virtualenv_install("r-reticulate", python_packages) + #virtualenv_create("r-reticulate", Sys.which("python")) + #virtualenv_install("r-reticulate", python_packages) - path_to_python <- virtualenv_python("r-reticulate") + conda_create("r-reticulate", Sys.which("python")) + conda_install("r-reticulate", python_packages) + + #path_to_python <- virtualenv_python("r-reticulate") + path_to_python <- conda_python("r-reticulate") writeLines(sprintf("RETICULATE_PYTHON=%s", path_to_python), Sys.getenv("GITHUB_ENV")) diff --git a/R/aux_fct.R b/R/aux_fct.R index 9dca274..b313959 100644 --- a/R/aux_fct.R +++ b/R/aux_fct.R @@ -336,29 +336,45 @@ get_folds<-function(target, fin_k_folds=k_folds } - val_sample=NULL + final_assignments=NULL for(cat in categories){ - all_names=names(subset(target,target==cat)) - used_names=NULL - tmp_regular_size=ceiling(length(all_names)/fin_k_folds) + condition=(sample_target==cat) + focused_targets=subset(x = sample_target, + subset = condition) + n_cases=length(focused_targets) - for(i in 1:fin_k_folds){ - if(i==1){ - possible_names=all_names - } else { - possible_names=setdiff(x=all_names, - y=used_names) - } - tmp_size=min(length(possible_names),tmp_regular_size) - selected_names<-sample(x=possible_names, - size=tmp_size, - replace=FALSE) - val_sample[i]=list(append(x=unlist(val_sample[i]), - values = selected_names)) - used_names=append(used_names,values = selected_names) + cases_per_fold=vector(length = fin_k_folds) + cases_per_fold[]=ceiling(n_cases/fin_k_folds) + + delta=sum(cases_per_fold)-n_cases + for(i in 1:delta){ + cases_per_fold[1+(i-1)%%fin_k_folds]=cases_per_fold[1+(i-1)%%fin_k_folds]-1 } + + possible_assignments=NULL + for(i in 1:length(cases_per_fold)) + possible_assignments=append( + x=possible_assignments, + values=rep.int(x=i, + times = cases_per_fold[i]) + ) + + assignments<-sample( + x=possible_assignments, + size=length(possible_assignments), + replace = FALSE + ) + names(assignments)=names(focused_targets) + final_assignments=append(x=final_assignments, + values=assignments) } + val_sample=NULL + for(i in 1:fin_k_folds){ + condition=(final_assignments==i) + val_sample[i]=list(names(subset(x=final_assignments, + subset=condition))) + } train_sample=NULL for(i in 1:fin_k_folds){ @@ -773,3 +789,52 @@ get_n_chunks<-function(text_embeddings,features,times){ names(n_chunks)<-rownames(text_embeddings) return(n_chunks) } + +#------------------------------------------------------------------------------ +#'Generate ID 
+#'Suffix for Objects.
+#'
+#'Function for generating an ID suffix for objects of class
+#'\link{TextEmbeddingModel} and \link{TextEmbeddingClassifierNeuralNet}.
+#'
+#'@param length \code{int} determining the length of the id suffix.
+#'@return Returns a \code{string} of the requested length
+generate_id<-function(length=16){
+  id_suffix=NULL
+  sample_values=c(
+    "a","A",
+    "b","B",
+    "c","C",
+    "d","D",
+    "e","E",
+    "f","F",
+    "g","G",
+    "h","H",
+    "i","I",
+    "j","J",
+    "k","K",
+    "l","L",
+    "m","M",
+    "n","N",
+    "o","O",
+    "p","P",
+    "q","Q",
+    "r","R",
+    "s","S",
+    "t","T",
+    "u","U",
+    "v","V",
+    "w","W",
+    "x","X",
+    "y","Y",
+    "z","Z",
+    seq(from=0,to=9,by=1)
+  )
+
+
+  id_suffix=sample(
+    x=sample_values,
+    size = length,
+    replace = TRUE)
+  id_suffix=paste(id_suffix,collapse = "")
+  return(id_suffix)
+}
diff --git a/R/install_and_config.R b/R/install_and_config.R
index f4036b2..7f6668c 100644
--- a/R/install_and_config.R
+++ b/R/install_and_config.R
@@ -6,8 +6,7 @@
 #'be installed.
 #'@export
 install_py_modules<-function(envname="aifeducation"){
-  relevant_modules<-c("os",
-                      "transformers",
+  relevant_modules<-c("transformers",
                       "tokenizers",
                       "datasets",
                       "torch",
@@ -103,7 +102,7 @@ set_config_gpu_low_memory<-function(){
 #'@export
 set_config_tf_logger<-function(level="ERROR"){
   logger<-tf$get_logger()
-  logger$setLevel(level)
+  logger$setLevel(level)
 }
 
 #'Sets the level for logging information in tensor flow.
diff --git a/R/te_classifier_neuralnet_model.R b/R/te_classifier_neuralnet_model.R
index 9a384e0..b081c57 100644
--- a/R/te_classifier_neuralnet_model.R
+++ b/R/te_classifier_neuralnet_model.R
@@ -11,6 +11,10 @@ TextEmbeddingClassifierNeuralNet<-R6::R6Class(
     #'Name of the classifier.
     name=NULL,
 
+    #'@field name_root ('character()')\cr
+    #'Root part of the name of the classifier.
+    name_root=NULL,
+
     #'@field label ('character()')\cr
     #'Label of the classifier used as the individual title.
     label=NULL,
@@ -244,7 +248,8 @@ TextEmbeddingClassifierNeuralNet<-R6::R6Class(
 
       #------------------------------------------------------------------------
      #Setting Label and Name
-      self$name=name
+      self$name_root=name
+      self$name=paste0(self$name_root,"_id_",generate_id(16))
      self$label=label
 
      #Basic Information of Input and Target Data
@@ -759,6 +764,9 @@ TextEmbeddingClassifierNeuralNet<-R6::R6Class(
 
      names_unlabeled=names(subset(data_targets,is.na(data_targets)==TRUE))
 
+      #Setting a new ID for the classifier
+      self$name=paste0(self$name_root,"_id_",generate_id(16))
+
      for(iter in 1:folds$n_folds){
        #---------------------------------------------
        #Create a Train and Validation Sample
@@ -1790,7 +1798,7 @@ TextEmbeddingClassifierNeuralNet<-R6::R6Class(
     #'@description Method for setting the license of the classifier.
     #'@param license \code{string} containing the abbreviation of the license or
     #'the license text.
-    set_license=function(license){
+    set_license=function(license="CC BY-NC-SA"){
       private$model_info$model_license<-license
     },
     #'@description Method for setting the license of the classifier.
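Note: the reworked get_folds() in R/aux_fct.R above now distributes the cases of each category as evenly as possible across the folds (fold sizes differ by at most one case per category) and then shuffles the assignment. Below is a minimal standalone sketch of that assignment step; the named target vector and the fold count are made-up inputs, and the seq_len() guard on delta is an addition here to cover categories whose size is an exact multiple of the number of folds.

# Sketch of the per-category fold assignment from the patched get_folds().
# 'targets' and 'fin_k_folds' are hypothetical inputs.
targets <- c(a1 = "pos", a2 = "pos", a3 = "pos", a4 = "pos",
             a5 = "neg", a6 = "neg", a7 = "neg")
fin_k_folds <- 3

final_assignments <- NULL
for (cat in unique(targets)) {
  focused_targets <- targets[targets == cat]
  n_cases <- length(focused_targets)

  # Start with the ceiling in every fold, then remove the surplus case by case.
  cases_per_fold <- rep(ceiling(n_cases / fin_k_folds), fin_k_folds)
  delta <- sum(cases_per_fold) - n_cases
  for (i in seq_len(delta)) {
    idx <- 1 + (i - 1) %% fin_k_folds
    cases_per_fold[idx] <- cases_per_fold[idx] - 1
  }

  # Shuffle the fold labels and attach them to the case names.
  possible_assignments <- rep.int(seq_along(cases_per_fold), cases_per_fold)
  assignments <- sample(x = possible_assignments,
                        size = length(possible_assignments),
                        replace = FALSE)
  names(assignments) <- names(focused_targets)
  final_assignments <- c(final_assignments, assignments)
}

# One validation sample per fold, as in the patched function.
val_sample <- lapply(seq_len(fin_k_folds),
                     function(i) names(final_assignments[final_assignments == i]))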
diff --git a/R/text_embedding_model.R b/R/text_embedding_model.R
index a2620f6..f8c7053 100644
--- a/R/text_embedding_model.R
+++ b/R/text_embedding_model.R
@@ -943,6 +943,7 @@ TextEmbeddingModel<-R6::R6Class(
       return(list(
         model_license=private$model_info$model_license,
         model_name=private$model_info$model_name,
+        model_label=private$model_info$model_label,
         model_date=private$model_info$model_date,
         model_version=private$model_info$model_version,
         model_language=private$model_info$model_language
diff --git a/man/TextEmbeddingClassifierNeuralNet.Rd b/man/TextEmbeddingClassifierNeuralNet.Rd
index 262549c..c91efa8 100644
--- a/man/TextEmbeddingClassifierNeuralNet.Rd
+++ b/man/TextEmbeddingClassifierNeuralNet.Rd
@@ -13,6 +13,9 @@ tensorflow
 \item{\code{name}}{('character()')\cr
 Name of the classifier.}
 
+\item{\code{name_root}}{('character()')\cr
+Root part of the name of the classifier.}
+
 \item{\code{label}}{('character()')\cr
 Label of the classifier used as the individual title.}
 
@@ -443,7 +446,7 @@ Method for requesting the bibliographic information of the classifier.
 \subsection{Method \code{set_license()}}{
 Method for setting the license of the classifier.
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TextEmbeddingClassifierNeuralNet$set_license(license)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{TextEmbeddingClassifierNeuralNet$set_license(license = "CC BY-NC-SA")}\if{html}{\out{</div>}}
 }

 \subsection{Arguments}{
diff --git a/man/generate_id.Rd b/man/generate_id.Rd
new file mode 100644
index 0000000..9839843
--- /dev/null
+++ b/man/generate_id.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/aux_fct.R
+\name{generate_id}
+\alias{generate_id}
+\title{Generate ID Suffix for Objects.}
+\usage{
+generate_id(length = 16)
+}
+\arguments{
+\item{length}{\code{int} determining the length of the id suffix.}
+}
+\value{
+Returns a \code{string} of the requested length
+}
+\description{
+Function for generating an ID suffix for objects of class
+\link{TextEmbeddingModel} and \link{TextEmbeddingClassifierNeuralNet}.
+}
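For reference, a short usage sketch of the helper documented above. generate_id() carries no @export tag in this patch, so outside the package it would have to be reached via ::: (shown here purely for illustration); the classifier name root is made up.

# Hypothetical session illustrating the new ID suffix and naming scheme.
suffix <- aifeducation:::generate_id(length = 16)
nchar(suffix)   # 16 characters drawn from a-z, A-Z and 0-9

# TextEmbeddingClassifierNeuralNet now stores the user-supplied name in
# name_root and appends such a suffix to build the unique name field,
# e.g. "movie_reviews_id_<16-character suffix>".
classifier_name <- paste0("movie_reviews", "_id_", aifeducation:::generate_id(16))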