From 18f9ad6060b99b9d7c56790670515c557f45be9d Mon Sep 17 00:00:00 2001 From: Florian Berding Date: Mon, 12 Jun 2023 10:49:55 +0200 Subject: [PATCH] Bug Fixes --- .Rbuildignore | 3 +- .Rhistory | 146 ++++++++++++------------ .github/workflows/R-CMD-check.yaml | 10 +- R/aux_fct.R | 101 +++++++++++++--- R/install_and_config.R | 5 +- R/te_classifier_neuralnet_model.R | 12 +- R/text_embedding_model.R | 1 + man/TextEmbeddingClassifierNeuralNet.Rd | 5 +- man/generate_id.Rd | 18 +++ 9 files changed, 200 insertions(+), 101 deletions(-) create mode 100644 man/generate_id.Rd diff --git a/.Rbuildignore b/.Rbuildignore index 49c187e..b078fc7 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,7 +5,8 @@ Trial materials test_model tests/testthat/test_data/language_models -test/testthat/test_data/tmp +test/testthat/test_data/tmp/checkpoints +.h5 .git .gitignore _pkgdown.yml diff --git a/.Rhistory b/.Rhistory index 1f26895..fa5bfee 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,76 +1,3 @@ -num_hidden_layer=12, -num_attention_heads=12, -intermediate_size=3072, -hidden_act="gelu", -hidden_dropout_prob=0.1, -trace=FALSE)) -expect_no_error( -create_roberta_model( -model_dir=testthat::test_path("test_data/roberta"), -vocab_raw_texts=example_data$text, -vocab_size=30522, -add_prefix_space=TRUE, -max_position_embeddings=512, -hidden_size=768, -num_hidden_layer=12, -num_attention_heads=12, -intermediate_size=3072, -hidden_act="gelu", -hidden_dropout_prob=0.1, -trace=FALSE)) -}) -test_that("train_tune_roberta_model", { -example_data<-data.frame( -id=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$id1, -label=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$sentiment) -example_data$text<-as.character(quanteda.textmodels::data_corpus_moviereviews) -expect_no_error( -train_tune_roberta_model(output_dir=testthat::test_path("test_data/roberta"), -model_dir_path=testthat::test_path("test_data/roberta"), -raw_texts= example_data$text[1:25], -p_mask=0.30, -val_size=0.1, -n_epoch=1, -batch_size=1, -chunk_size=512, -n_workers=1, -multi_process=FALSE, -trace=FALSE)) -}) -test_that("train_tune_roberta_model", { -example_data<-data.frame( -id=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$id1, -label=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$sentiment) -example_data$text<-as.character(quanteda.textmodels::data_corpus_moviereviews) -expect_no_error( -train_tune_roberta_model(output_dir=testthat::test_path("test_data/roberta"), -model_dir_path=testthat::test_path("test_data/roberta"), -raw_texts= example_data$text[1:25], -p_mask=0.30, -val_size=0.1, -n_epoch=1, -batch_size=1, -chunk_size=510, -n_workers=1, -multi_process=FALSE, -trace=FALSE)) -}) -test_embedding<-TextEmbeddingModel$new( -model_name = "test", -model_label = "test", -model_version = "0.0.1", -model_language = "german", -model_dir="Trial/Bert_Modelle", -method = "bert", -aggregation = "last", -max_length=256, -chunks = 4, -overlap = 10 -) -test_embeddin$ -test_embedding$transformer_components$model$config$max_length -test_embedding$transformer_components$model$config$max_position_embeddings -devtools::load_all() test_that("train_tune_roberta_model", { example_data<-data.frame( id=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$id1, @@ -510,3 +437,76 @@ devtools::test() devtools::document() devtools::check() reticulate::py_config() +devtools::test() +sample_values=c( +"a","A", +"b","B", +"c","C", +"d","D", +"e","E", +"f","F", +"g","G", +"h","H", +"i","I", +"j","J", +"k","K", 
+"l","L", +"m","M", +"n","N", +"o","O", +"p","P", +"q","Q", +"r","R", +"s","S", +"t","T", +"u","U", +"v","V", +"w","W", +"x","X", +"y","Y", +"z","Z", +seq(from=0,to=9,by=1) +) +sample_values +length=10 +id_suffix=NULL +sample_values=c( +"a","A", +"b","B", +"c","C", +"d","D", +"e","E", +"f","F", +"g","G", +"h","H", +"i","I", +"j","J", +"k","K", +"l","L", +"m","M", +"n","N", +"o","O", +"p","P", +"q","Q", +"r","R", +"s","S", +"t","T", +"u","U", +"v","V", +"w","W", +"x","X", +"y","Y", +"z","Z", +seq(from=0,to=9,by=1) +) +id_suffix=sample( +x=sample_values, +size = length, +replace = TRUE) +id_suffix +id_suffix=paste(id_suffix,collapse = "") +id_suffix +devtools::document() +devtools::document() +devtools::document() +devtools::document() diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 152ae79..c014785 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -61,10 +61,14 @@ jobs: "tensorflow") library(reticulate) - virtualenv_create("r-reticulate", Sys.which("python")) - virtualenv_install("r-reticulate", python_packages) + #virtualenv_create("r-reticulate", Sys.which("python")) + #virtualenv_install("r-reticulate", python_packages) - path_to_python <- virtualenv_python("r-reticulate") + conda_create("r-reticulate", Sys.which("python")) + conda_install("r-reticulate", python_packages) + + #path_to_python <- virtualenv_python("r-reticulate") + path_to_python <- conda_python("r-reticulate") writeLines(sprintf("RETICULATE_PYTHON=%s", path_to_python), Sys.getenv("GITHUB_ENV")) diff --git a/R/aux_fct.R b/R/aux_fct.R index 9dca274..b313959 100644 --- a/R/aux_fct.R +++ b/R/aux_fct.R @@ -336,29 +336,45 @@ get_folds<-function(target, fin_k_folds=k_folds } - val_sample=NULL + final_assignments=NULL for(cat in categories){ - all_names=names(subset(target,target==cat)) - used_names=NULL - tmp_regular_size=ceiling(length(all_names)/fin_k_folds) + condition=(sample_target==cat) + focused_targets=subset(x = sample_target, + subset = condition) + n_cases=length(focused_targets) - for(i in 1:fin_k_folds){ - if(i==1){ - possible_names=all_names - } else { - possible_names=setdiff(x=all_names, - y=used_names) - } - tmp_size=min(length(possible_names),tmp_regular_size) - selected_names<-sample(x=possible_names, - size=tmp_size, - replace=FALSE) - val_sample[i]=list(append(x=unlist(val_sample[i]), - values = selected_names)) - used_names=append(used_names,values = selected_names) + cases_per_fold=vector(length = fin_k_folds) + cases_per_fold[]=ceiling(n_cases/fin_k_folds) + + delta=sum(cases_per_fold)-n_cases + for(i in 1:delta){ + cases_per_fold[1+(i-1)%%fin_k_folds]=cases_per_fold[1+(i-1)%%fin_k_folds]-1 } + + possible_assignments=NULL + for(i in 1:length(cases_per_fold)) + possible_assignments=append( + x=possible_assignments, + values=rep.int(x=i, + times = cases_per_fold[i]) + ) + + assignments<-sample( + x=possible_assignments, + size=length(possible_assignments), + replace = FALSE + ) + names(assignments)=names(focused_targets) + final_assignments=append(x=final_assignments, + values=assignments) } + val_sample=NULL + for(i in 1:fin_k_folds){ + condition=(final_assignments==i) + val_sample[i]=list(names(subset(x=final_assignments, + subset=condition))) + } train_sample=NULL for(i in 1:fin_k_folds){ @@ -773,3 +789,52 @@ get_n_chunks<-function(text_embeddings,features,times){ names(n_chunks)<-rownames(text_embeddings) return(n_chunks) } + +#------------------------------------------------------------------------------ +#'Generate ID 
+#'Suffix for Objects.
+#'
+#'Function for generating an ID suffix for objects of class
+#'\link{TextEmbeddingModel} and \link{TextEmbeddingClassifierNeuralNet}.
+#'
+#'@param length \code{int} determining the length of the id suffix.
+#'@return Returns a \code{string} of the requested length
+generate_id<-function(length=16){
+  id_suffix=NULL
+  sample_values=c(
+    "a","A",
+    "b","B",
+    "c","C",
+    "d","D",
+    "e","E",
+    "f","F",
+    "g","G",
+    "h","H",
+    "i","I",
+    "j","J",
+    "k","K",
+    "l","L",
+    "m","M",
+    "n","N",
+    "o","O",
+    "p","P",
+    "q","Q",
+    "r","R",
+    "s","S",
+    "t","T",
+    "u","U",
+    "v","V",
+    "w","W",
+    "x","X",
+    "y","Y",
+    "z","Z",
+    seq(from=0,to=9,by=1)
+  )
+
+
+  id_suffix=sample(
+    x=sample_values,
+    size = length,
+    replace = TRUE)
+  id_suffix=paste(id_suffix,collapse = "")
+  return(id_suffix)
+}
diff --git a/R/install_and_config.R b/R/install_and_config.R
index f4036b2..7f6668c 100644
--- a/R/install_and_config.R
+++ b/R/install_and_config.R
@@ -6,8 +6,7 @@
 #'be installed.
 #'@export
 install_py_modules<-function(envname="aifeducation"){
-  relevant_modules<-c("os",
-                      "transformers",
+  relevant_modules<-c("transformers",
                       "tokenizers",
                       "datasets",
                       "torch",
@@ -103,7 +102,7 @@ set_config_gpu_low_memory<-function(){
 #'@export
 set_config_tf_logger<-function(level="ERROR"){
   logger<-tf$get_logger()
-  logger$setLevel(level)
+  logger$setLevel(level)
 }
 
 #'Sets the level for logging information in tensor flow.
diff --git a/R/te_classifier_neuralnet_model.R b/R/te_classifier_neuralnet_model.R
index 9a384e0..b081c57 100644
--- a/R/te_classifier_neuralnet_model.R
+++ b/R/te_classifier_neuralnet_model.R
@@ -11,6 +11,10 @@ TextEmbeddingClassifierNeuralNet<-R6::R6Class(
     #'Name of the classifier.
     name=NULL,
 
+    #'@field name_root ('character()')\cr
+    #'Root part of the name of the classifier.
+    name_root=NULL,
+
     #'@field label ('character()')\cr
     #'Label of the classifier used as the individual title.
     label=NULL,
@@ -244,7 +248,8 @@ TextEmbeddingClassifierNeuralNet<-R6::R6Class(
 
       #------------------------------------------------------------------------
      #Setting Label and Name
-      self$name=name
+      self$name_root=name
+      self$name=paste0(self$name_root,"_id_",generate_id(16))
      self$label=label
 
      #Basic Information of Input and Target Data
@@ -759,6 +764,9 @@ TextEmbeddingClassifierNeuralNet<-R6::R6Class(
 
      names_unlabeled=names(subset(data_targets,is.na(data_targets)==TRUE))
 
+      #Setting a new ID for the classifier
+      self$name=paste0(self$name_root,"_id_",generate_id(16))
+
      for(iter in 1:folds$n_folds){
        #---------------------------------------------
        #Create a Train and Validation Sample
@@ -1790,7 +1798,7 @@ TextEmbeddingClassifierNeuralNet<-R6::R6Class(
     #'@description Method for setting the license of the classifier.
     #'@param license \code{string} containing the abbreviation of the license or
     #'the license text.
-    set_license=function(license){
+    set_license=function(license="CC BY-NC-SA"){
       private$model_info$model_license<-license
     },
     #'@description Method for setting the license of the classifier.
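Note: the reworked get_folds() in R/aux_fct.R above now distributes the cases of each category as evenly as possible across the folds (fold sizes differ by at most one case per category) and then shuffles the assignment. Below is a minimal standalone sketch of that assignment step; the named target vector and the fold count are made-up inputs, and the seq_len() guard on delta is an addition here to cover categories whose size is an exact multiple of the number of folds.

# Sketch of the per-category fold assignment from the patched get_folds().
# 'targets' and 'fin_k_folds' are hypothetical inputs.
targets <- c(a1 = "pos", a2 = "pos", a3 = "pos", a4 = "pos",
             a5 = "neg", a6 = "neg", a7 = "neg")
fin_k_folds <- 3

final_assignments <- NULL
for (cat in unique(targets)) {
  focused_targets <- targets[targets == cat]
  n_cases <- length(focused_targets)

  # Start with the ceiling in every fold, then remove the surplus case by case.
  cases_per_fold <- rep(ceiling(n_cases / fin_k_folds), fin_k_folds)
  delta <- sum(cases_per_fold) - n_cases
  for (i in seq_len(delta)) {
    idx <- 1 + (i - 1) %% fin_k_folds
    cases_per_fold[idx] <- cases_per_fold[idx] - 1
  }

  # Shuffle the fold labels and attach them to the case names.
  possible_assignments <- rep.int(seq_along(cases_per_fold), cases_per_fold)
  assignments <- sample(x = possible_assignments,
                        size = length(possible_assignments),
                        replace = FALSE)
  names(assignments) <- names(focused_targets)
  final_assignments <- c(final_assignments, assignments)
}

# One validation sample per fold, as in the patched function.
val_sample <- lapply(seq_len(fin_k_folds),
                     function(i) names(final_assignments[final_assignments == i]))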
diff --git a/R/text_embedding_model.R b/R/text_embedding_model.R
index a2620f6..f8c7053 100644
--- a/R/text_embedding_model.R
+++ b/R/text_embedding_model.R
@@ -943,6 +943,7 @@ TextEmbeddingModel<-R6::R6Class(
       return(list(
         model_license=private$model_info$model_license,
         model_name=private$model_info$model_name,
+        model_label=private$model_info$model_label,
         model_date=private$model_info$model_date,
         model_version=private$model_info$model_version,
         model_language=private$model_info$model_language
diff --git a/man/TextEmbeddingClassifierNeuralNet.Rd b/man/TextEmbeddingClassifierNeuralNet.Rd
index 262549c..c91efa8 100644
--- a/man/TextEmbeddingClassifierNeuralNet.Rd
+++ b/man/TextEmbeddingClassifierNeuralNet.Rd
@@ -13,6 +13,9 @@ tensorflow
 \item{\code{name}}{('character()')\cr
 Name of the classifier.}
 
+\item{\code{name_root}}{('character()')\cr
+Root part of the name of the classifier.}
+
 \item{\code{label}}{('character()')\cr
 Label of the classifier used as the individual title.}
 
@@ -443,7 +446,7 @@ Method for requesting the bibliographic information of the classifier.
 \subsection{Method \code{set_license()}}{
 Method for setting the license of the classifier.
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TextEmbeddingClassifierNeuralNet$set_license(license)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{TextEmbeddingClassifierNeuralNet$set_license(license = "CC BY-NC-SA")}\if{html}{\out{</div>}}
 }

 \subsection{Arguments}{
diff --git a/man/generate_id.Rd b/man/generate_id.Rd
new file mode 100644
index 0000000..9839843
--- /dev/null
+++ b/man/generate_id.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/aux_fct.R
+\name{generate_id}
+\alias{generate_id}
+\title{Generate ID Suffix for Objects.}
+\usage{
+generate_id(length = 16)
+}
+\arguments{
+\item{length}{\code{int} determining the length of the id suffix.}
+}
+\value{
+Returns a \code{string} of the requested length
+}
+\description{
+Function for generating an ID suffix for objects of class
+\link{TextEmbeddingModel} and \link{TextEmbeddingClassifierNeuralNet}.
+}
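For reference, a short usage sketch of the helper documented above. generate_id() carries no @export tag in this patch, so outside the package it would have to be reached via ::: (shown here purely for illustration); the classifier name root is made up.

# Hypothetical session illustrating the new ID suffix and naming scheme.
suffix <- aifeducation:::generate_id(length = 16)
nchar(suffix)   # 16 characters drawn from a-z, A-Z and 0-9

# TextEmbeddingClassifierNeuralNet now stores the user-supplied name in
# name_root and appends such a suffix to build the unique name field,
# e.g. "movie_reviews_id_<16-character suffix>".
classifier_name <- paste0("movie_reviews", "_id_", aifeducation:::generate_id(16))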