Commit

Bug Fixes and update of Documentation
FBerding committed Jun 13, 2023
1 parent fd895c7 commit ab736c1
Showing 23 changed files with 416 additions and 291 deletions.
8 changes: 7 additions & 1 deletion .Rbuildignore
@@ -5,11 +5,17 @@ Trial
materials
test_model
tests/testthat/test_data/language_models
test/testthat/test_data/tmp/checkpoints
test/testthat/test_data/tmp
test/testthat/test_data/bert/checkpoints
test/testthat/test_data/longformer/checkpoints
test/testthat/test_data/roberta/checkpoints
.h5
.git
.gitignore
_pkgdown.yml
docs
^vignettes/articles$
^\.github$
checkpoints
.txt
.json
322 changes: 161 additions & 161 deletions .Rhistory
@@ -1,164 +1,3 @@
)
})
setwd("~/aifeducation")
test_that("training_baseline_only", {
expect_no_error(
classifier$train(
data_embeddings = current_embeddings,
data_targets = example_targets,
data_n_test_samples=2,
use_baseline=TRUE,
bsl_val_size=0.25,
use_bsc=FALSE,
bsc_methods=c("dbsmote"),
bsc_max_k=10,
bsc_val_size=0.25,
use_bpl=FALSE,
bpl_max_steps=2,
bpl_epochs_per_step=1,
bpl_dynamic_inc=FALSE,
bpl_balance=TRUE,
bpl_max=1.00,
bpl_anchor=1.00,
bpl_min=0.00,
bpl_weight_inc=0.02,
bpl_weight_start=0.00,
bpl_model_reset=FALSE,
epochs=2,
batch_size=32,
dir_checkpoint=testthat::test_path("test_data/tmp"),
trace=FALSE,
view_metrics=FALSE,
keras_trace=0,
n_cores=1)
)
})
#-------------------------------------------------------------------------------
bert_modeling<-TextEmbeddingModel$new(
model_name="roberta_embedding",
model_label="Text Embedding via RoBERTa",
model_version="0.0.1",
model_language="english",
method = "roberta",
max_length = 256,
chunks=4,
overlap=40,
aggregation="last",
model_dir=testthat::test_path(tmp_path))
tmp_path="test_data/roberta"
#-------------------------------------------------------------------------------
bert_modeling<-TextEmbeddingModel$new(
model_name="roberta_embedding",
model_label="Text Embedding via RoBERTa",
model_version="0.0.1",
model_language="english",
method = "roberta",
max_length = 256,
chunks=4,
overlap=40,
aggregation="last",
model_dir=testthat::test_path(tmp_path))
test_that("train_tune_roberta_model", {
example_data<-data.frame(
id=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$id1,
label=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$sentiment)
example_data$text<-as.character(quanteda.textmodels::data_corpus_moviereviews)
expect_no_error(
train_tune_roberta_model(output_dir=testthat::test_path("test_data/roberta"),
model_dir_path=testthat::test_path("test_data/roberta"),
raw_texts= example_data$text[1:5],
p_mask=0.30,
val_size=0.1,
n_epoch=1,
batch_size=1,
chunk_size=100,
n_workers=1,
multi_process=FALSE,
trace=FALSE))
})
#-------------------------------------------------------------------------------
bert_modeling<-TextEmbeddingModel$new(
model_name="roberta_embedding",
model_label="Text Embedding via RoBERTa",
model_version="0.0.1",
model_language="english",
method = "roberta",
max_length = 256,
chunks=4,
overlap=40,
aggregation="last",
model_dir=testthat::test_path(tmp_path))
test_that("creation_roberta", {
expect_s3_class(bert_modeling,
class="TextEmbeddingModel")
})
test_that("embedding_roberta", {
embeddings<-bert_modeling$embed(raw_text = example_data$text[1:10],
doc_id = example_data$id[1:10])
expect_s3_class(embeddings, class="EmbeddedText")
})
test_that("encoding_roberta", {
encodings<-bert_modeling$encode(raw_text = example_data$text[1:10],
token_encodings_only = TRUE)
expect_length(encodings,10)
expect_type(encodings,type="list")
})
example_data$text[1:10]
encodings<-bert_modeling$encode(raw_text = example_data$text[1:10],
token_encodings_only = TRUE)
expect_length(encodings,10)
encodings
devtools::load_all()
#-------------------------------------------------------------------------------
bert_modeling<-TextEmbeddingModel$new(
model_name="roberta_embedding",
model_label="Text Embedding via RoBERTa",
model_version="0.0.1",
model_language="english",
method = "roberta",
max_length = 256,
chunks=4,
overlap=40,
aggregation="last",
model_dir=testthat::test_path(tmp_path))
test_that("encoding_roberta", {
encodings<-bert_modeling$encode(raw_text = example_data$text[1:10],
token_encodings_only = TRUE)
expect_length(encodings,10)
expect_type(encodings,type="list")
})
devtools::load_all()
#-------------------------------------------------------------------------------
bert_modeling<-TextEmbeddingModel$new(
model_name="roberta_embedding",
model_label="Text Embedding via RoBERTa",
model_version="0.0.1",
model_language="english",
method = "roberta",
max_length = 256,
chunks=4,
overlap=40,
aggregation="last",
model_dir=testthat::test_path(tmp_path))
test_that("creation_roberta", {
expect_s3_class(bert_modeling,
class="TextEmbeddingModel")
})
test_that("embedding_roberta", {
embeddings<-bert_modeling$embed(raw_text = example_data$text[1:10],
doc_id = example_data$id[1:10])
expect_s3_class(embeddings, class="EmbeddedText")
})
devtools::load_all()
#-------------------------------------------------------------------------------
bert_modeling<-TextEmbeddingModel$new(
model_name="roberta_embedding",
model_label="Text Embedding via RoBERTa",
model_version="0.0.1",
model_language="english",
method = "roberta",
max_length = 256,
chunks=4,
overlap=40,
aggregation="last",
model_dir=testthat::test_path(tmp_path))
@@ -510,3 +349,164 @@ test_tracker_2
test_tracker_2$final_emissions
test_tracker_2$final_emissions_data
test_tracker_2$final_emissions_data$energy_consumed
conda_list()
reticulate::conda_list()
conda_binary()
reticulate::conda_binary()
devtools::load_all()
install_py_modules()
devtools::load_all()
install_py_modules()
reticulate::conda_list()
reticulate::use_condaenv("aifeduction")
reticulate::use_condaenv(condaenv="aifeducation")
aifeducation::check_aif_py_modules()
aifeducation::check_aif_py_modules(trace = TRUE)
devtools::load_all()
check_aif_py_modules()
example_data<-data.frame(
id=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$id2,
label=quanteda::docvars(quanteda.textmodels::data_corpus_moviereviews)$sentiment)
example_data$text<-as.character(quanteda.textmodels::data_corpus_moviereviews)
example_data$label[c(1:500,1001:1750)]=NA
example_data<-example_data[c(501:600,1001:1100,1751:1850),]
example_targets<-as.factor(example_data$label)
names(example_targets)=example_data$id
get_folds(target=example_data,k_folds=2)
get_folds(target=example_targets,k_folds=2)
target=example_targets
k_folds=2
sample_target=na.omit(target)
freq_cat=table(sample_target)
categories=names(freq_cat)
min_freq=min(freq_cat)
if(min_freq/k_folds<1){
fin_k_folds=min_freq
warning(paste("Frequency of the smallest category/label is not sufficent to ensure
at least 1 cases per fold. Adjusting number of folds from ",k_folds,"to",fin_k_folds,"."))
if(fin_k_folds==0){
stop("Frequency of the smallest category/label is to low. Please check your data.
Consider to remove all categories/labels with a very low absolute frequency.")
}
} else {
fin_k_folds=k_folds
}
fin_k_folds
final_assignments=NULL
for(cat in categories){
condition=(sample_target==cat)
focused_targets=subset(x = sample_target,
subset = condition)
n_cases=length(focused_targets)
cases_per_fold=vector(length = fin_k_folds)
cases_per_fold[]=ceiling(n_cases/fin_k_folds)
delta=sum(cases_per_fold)-n_cases
for(i in 1:delta){
cases_per_fold[1+(i-1)%%fin_k_folds]=cases_per_fold[1+(i-1)%%fin_k_folds]-1
}
possible_assignments=NULL
for(i in 1:length(cases_per_fold))
possible_assignments=append(
x=possible_assignments,
values=rep.int(x=i,
times = cases_per_fold[i])
)
assignments<-sample(
x=possible_assignments,
size=length(possible_assignments),
replace = FALSE
)
names(assignments)=names(focused_targets)
final_assignments=append(x=final_assignments,
values=assignments)
}
final_assignments=NULL
categories
categories="neg"
condition=(sample_target==cat)
focused_targets=subset(x = sample_target,
subset = condition)
n_cases=length(focused_targets)
cases_per_fold=vector(length = fin_k_folds)
cases_per_fold[]=ceiling(n_cases/fin_k_folds)
delta=sum(cases_per_fold)-n_cases
delta
final_assignments=NULL
for(cat in categories){
condition=(sample_target==cat)
focused_targets=subset(x = sample_target,
subset = condition)
n_cases=length(focused_targets)
cases_per_fold=vector(length = fin_k_folds)
cases_per_fold[]=ceiling(n_cases/fin_k_folds)
delta=sum(cases_per_fold)-n_cases
if(detla>0){
for(i in 1:delta){
cases_per_fold[1+(i-1)%%fin_k_folds]=cases_per_fold[1+(i-1)%%fin_k_folds]-1
}
}
possible_assignments=NULL
for(i in 1:length(cases_per_fold))
possible_assignments=append(
x=possible_assignments,
values=rep.int(x=i,
times = cases_per_fold[i])
)
assignments<-sample(
x=possible_assignments,
size=length(possible_assignments),
replace = FALSE
)
names(assignments)=names(focused_targets)
final_assignments=append(x=final_assignments,
values=assignments)
}
final_assignments=NULL
for(cat in categories){
condition=(sample_target==cat)
focused_targets=subset(x = sample_target,
subset = condition)
n_cases=length(focused_targets)
cases_per_fold=vector(length = fin_k_folds)
cases_per_fold[]=ceiling(n_cases/fin_k_folds)
delta=sum(cases_per_fold)-n_cases
if(delta>0){
for(i in 1:delta){
cases_per_fold[1+(i-1)%%fin_k_folds]=cases_per_fold[1+(i-1)%%fin_k_folds]-1
}
}
possible_assignments=NULL
for(i in 1:length(cases_per_fold))
possible_assignments=append(
x=possible_assignments,
values=rep.int(x=i,
times = cases_per_fold[i])
)
assignments<-sample(
x=possible_assignments,
size=length(possible_assignments),
replace = FALSE
)
names(assignments)=names(focused_targets)
final_assignments=append(x=final_assignments,
values=assignments)
}
val_sample=NULL
for(i in 1:fin_k_folds){
condition=(final_assignments==i)
val_sample[i]=list(names(subset(x=final_assignments,
subset=condition)))
}
devtools::build()
devtools::document()
devtools::build()
devtools::build()
devtools::document()
devtools::document()
devtools::build()
devtools::build()
devtools::build()
devtools::build()
devtools::load_all()
devtools::build()
devtools::build()
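
The console session above iterates toward a stratified fold assignment: the second attempt fails on the "detla" typo and the third runs cleanly. For reference, below is a minimal, self-contained sketch of that idea; assign_folds() and its exact interface are illustrative assumptions, not the package's get_folds().

# Minimal sketch of the stratified fold assignment explored above.
# assign_folds() is an illustrative name, not the package's get_folds().
# target is a named factor of labels, k_folds the requested number of folds.
assign_folds <- function(target, k_folds) {
  sample_target <- stats::na.omit(target)
  freq_cat <- table(sample_target)
  min_freq <- min(freq_cat)
  # Use fewer folds if the rarest category cannot place one case in each fold.
  fin_k_folds <- min(k_folds, min_freq)
  if (fin_k_folds == 0) {
    stop("The smallest category has no cases. Please check your data.")
  }
  final_assignments <- NULL
  for (cat in names(freq_cat)) {
    focused_targets <- sample_target[sample_target == cat]
    n_cases <- length(focused_targets)
    # Start with ceiling(n/k) cases per fold, then remove the surplus
    # one fold at a time (the delta loop from the session above).
    cases_per_fold <- rep.int(ceiling(n_cases / fin_k_folds), fin_k_folds)
    delta <- sum(cases_per_fold) - n_cases
    if (delta > 0) {
      for (i in seq_len(delta)) {
        idx <- 1 + (i - 1) %% fin_k_folds
        cases_per_fold[idx] <- cases_per_fold[idx] - 1
      }
    }
    # Shuffle the fold labels and attach them to the case names.
    pool <- rep.int(seq_len(fin_k_folds), times = cases_per_fold)
    assignments <- sample(x = pool, size = length(pool), replace = FALSE)
    names(assignments) <- names(focused_targets)
    final_assignments <- append(x = final_assignments, values = assignments)
  }
  # Return one character vector of case names per fold.
  lapply(seq_len(fin_k_folds), function(i) {
    names(final_assignments[final_assignments == i])
  })
}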
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -48,6 +48,8 @@ importFrom(quanteda,tokens_remove)
importFrom(quanteda,tokens_replace)
importFrom(quanteda,tokens_tolower)
importFrom(quanteda.textstats,textstat_simil)
importFrom(reticulate,conda_create)
importFrom(reticulate,py_install)
importFrom(stats,na.omit)
importFrom(stringr,str_length)
importFrom(stringr,str_remove_all)
2 changes: 1 addition & 1 deletion R/install_and_config.R
@@ -5,7 +5,7 @@
#'@param envname \code{string} Name of the environment where the packages should
#'be installed.
#'@importFrom reticulate conda_create
#'#'@importFrom reticulate py_install
#'@importFrom reticulate py_install
#'@export
install_py_modules<-function(envname="aifeducation"){
relevant_modules<-c("transformers",
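
The small change above removes the duplicated roxygen comment prefix so that reticulate::py_install is actually imported, matching the new NAMESPACE entries. Based on the calls recorded in the .Rhistory above and the signature shown here, a typical setup might look like the following sketch; the exact behaviour of these functions is not verified against the released package.

# Install the Python dependencies into a conda environment, point reticulate
# at that environment, and check that the required modules are available.
library(aifeducation)
install_py_modules(envname = "aifeducation")
reticulate::use_condaenv(condaenv = "aifeducation")
check_aif_py_modules(trace = TRUE)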