diff --git a/Jenkinsfile b/Jenkinsfile index aebbb0aca21a..f4d68cbd91dd 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -734,7 +734,7 @@ pipeline { exp_manager=null' } } - stage('Test Restore with AlBERT') { + stage('Test Restore Punctuation & Capitalization with AlBERT') { steps { sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ @@ -752,7 +752,7 @@ pipeline { rm -rf "${data_dir}"' } } - stage('Test Restore with RoBERTa') { + stage('Test Restore Punctuation & Capitalization with RoBERTa') { steps { sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ @@ -763,7 +763,7 @@ pipeline { +model.test_ds.use_cache=false \ ~model.train_ds \ ~model.validation_ds \ - model.test_ds.ds_item=/home/TestData/nlp/token_classification_punctuation/ \ + model.test_ds.ds_item="${data_dir}" \ trainer.devices=[1] \ trainer.accelerator="gpu" \ exp_manager=null && \ @@ -1593,17 +1593,23 @@ pipeline { stage('Punctuation & Capitalization, Using model.common_datasest_parameters.label_vocab_dir') { steps { sh 'cd examples/nlp/token_classification && \ - label_vocab_dir=label_vocab_dir && \ + work_dir="$(mktemp -d -p "$(pwd)")" && \ + label_vocab_dir="${work_dir}/labels" && \ mkdir -p ${label_vocab_dir} && \ + data_dir="${work_dir}/data" && \ + mkdir -p "${data_dir}" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ + output_dir="${work_dir}/output" && \ + mkdir -p "${output_dir}" && \ punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ printf "O\nU\n" > "${capit_label_vocab}" && \ - CUDA_LAUNCH_BLOCKING=1 python punctuation_capitalization_train_evaluate.py \ + python punctuation_capitalization_train_evaluate.py \ model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \ - model.validation_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \ - model.test_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \ + model.train_ds.ds_item="${data_dir}" \ + model.validation_ds.ds_item="${data_dir}" \ + model.test_ds.ds_item="${data_dir}" \ model.language_model.pretrained_model_name=distilbert-base-uncased \ model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ @@ -1614,15 +1620,15 @@ pipeline { trainer.devices=[0,1] \ trainer.strategy=ddp \ trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir=/home/TestData/nlp/token_classification_punctuation/output \ + +exp_manager.explicit_log_dir="${output_dir}" \ +do_testing=false && \ - CUDA_LAUNCH_BLOCKING=1 python punctuation_capitalization_train_evaluate.py \ + python punctuation_capitalization_train_evaluate.py \ +do_training=false \ +do_testing=true \ ~model.train_ds \ ~model.validation_ds \ - model.test_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \ - pretrained_model=/home/TestData/nlp/token_classification_punctuation/output/checkpoints/Punctuation_and_Capitalization.nemo \ + model.test_ds.ds_item="${data_dir}" \ + pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ +model.train_ds.use_cache=false \ +model.validation_ds.use_cache=false \ +model.test_ds.use_cache=false \ @@ -1630,27 +1636,31 @@ pipeline { trainer.strategy=ddp \ 
trainer.max_epochs=1 \ exp_manager=null && \ - rm -r "${label_vocab_dir}" && \ - rm -rf /home/TestData/nlp/token_classification_punctuation/output/*' + rm -rf "${work_dir}"' } } stage('Punctuation & Capitalization, Using model.common_datasest_parameters.{punct,capit}_label_ids') { steps { sh 'cd examples/nlp/token_classification && \ - conf_path=/home/TestData/nlp/token_classification_punctuation && \ + work_dir="$(mktemp -d -p "$(pwd)")" && \ + output_dir="${work_dir}/output" && \ + mkdir -p "${output_dir}" && \ + data_dir="${work_dir}/data" && \ + mkdir -p "${data_dir}" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ conf_name=punctuation_capitalization_config_with_ids && \ - cp conf/punctuation_capitalization_config.yaml "${conf_path}/${conf_name}.yaml" && \ + cp conf/punctuation_capitalization_config.yaml "${work_dir}/${conf_name}.yaml" && \ sed -i $\'s/punct_label_ids: null/punct_label_ids: {O: 0, \\\',\\\': 1, .: 2, \\\'?\\\': 3}/\' \ - "${conf_path}/${conf_name}.yaml" && \ + "${work_dir}/${conf_name}.yaml" && \ sed -i $\'s/capit_label_ids: null/capit_label_ids: {O: 0, U: 1}/\' \ - "${conf_path}/${conf_name}.yaml" && \ - CUDA_LAUNCH_BLOCKING=1 python punctuation_capitalization_train_evaluate.py \ - --config-path "${conf_path}" \ + "${work_dir}/${conf_name}.yaml" && \ + python punctuation_capitalization_train_evaluate.py \ + --config-path "${work_dir}" \ --config-name "${conf_name}" \ model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \ - model.validation_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \ - model.test_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \ + model.train_ds.ds_item="${data_dir}" \ + model.validation_ds.ds_item="${data_dir}" \ + model.test_ds.ds_item="${data_dir}" \ model.language_model.pretrained_model_name=distilbert-base-uncased \ +model.train_ds.use_cache=false \ +model.validation_ds.use_cache=false \ @@ -1658,15 +1668,15 @@ pipeline { trainer.devices=[0,1] \ trainer.strategy=ddp \ trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir=/home/TestData/nlp/token_classification_punctuation/output \ + +exp_manager.explicit_log_dir="${output_dir}" \ +do_testing=false && \ - CUDA_LAUNCH_BLOCKING=1 python punctuation_capitalization_train_evaluate.py \ + python punctuation_capitalization_train_evaluate.py \ +do_training=false \ +do_testing=true \ ~model.train_ds \ ~model.validation_ds \ - model.test_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \ - pretrained_model=/home/TestData/nlp/token_classification_punctuation/output/checkpoints/Punctuation_and_Capitalization.nemo \ + model.test_ds.ds_item="${data_dir}" \ + pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ +model.train_ds.use_cache=false \ +model.validation_ds.use_cache=false \ +model.test_ds.use_cache=false \ @@ -1674,8 +1684,7 @@ pipeline { trainer.strategy=ddp \ trainer.max_epochs=1 \ exp_manager=null && \ - rm -rf /home/TestData/nlp/token_classification_punctuation/output/* && \ - rm "${conf_path}/${conf_name}.yaml"' + rm -rf "${work_dir}"' } } } diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css index 305138c67d87..a071bc424c60 100644 --- a/docs/source/_static/css/custom.css +++ b/docs/source/_static/css/custom.css @@ -58,8 +58,19 @@ a:visited margin-left: unset; } +section { + overflow-x: auto; +} /* 
----------------------------------------------TABLES--------------------------------------- */ +section table { + overflow-x: auto; + display: block; +} + +table { + font-size: small; +} /* Table head Color */ thead td diff --git a/docs/source/asr/data/scores/ca/quartznet15x5_ca.csv b/docs/source/asr/data/scores/ca/quartznet15x5_ca.csv new file mode 100644 index 000000000000..1082d5c3d749 --- /dev/null +++ b/docs/source/asr/data/scores/ca/quartznet15x5_ca.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set (v??) (ca) +stt_ca_quartznet15x5,ca,6.0 diff --git a/docs/source/asr/data/scores/de/citrinet_de.csv b/docs/source/asr/data/scores/de/citrinet_de.csv new file mode 100644 index 000000000000..1b3e7db093a2 --- /dev/null +++ b/docs/source/asr/data/scores/de/citrinet_de.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v7.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) +stt_de_citrinet_1024,de,,6.63,7.59,4.06,5.07,12.33,10.02 diff --git a/docs/source/asr/data/scores/de/conformer_de.csv b/docs/source/asr/data/scores/de/conformer_de.csv new file mode 100644 index 000000000000..3d0a9e18d452 --- /dev/null +++ b/docs/source/asr/data/scores/de/conformer_de.csv @@ -0,0 +1,3 @@ +Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v7.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) +stt_de_conformer_ctc_large,de,,5.84,6.68,3.85,4.63,12.56,10.51 +stt_de_conformer_transducer_large,de,,4.75,5.36,3.46,4.19,11.21,9.14 diff --git a/docs/source/asr/data/scores/de/contextnet_de.csv b/docs/source/asr/data/scores/de/contextnet_de.csv new file mode 100644 index 000000000000..b7d52d649e73 --- /dev/null +++ b/docs/source/asr/data/scores/de/contextnet_de.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v7.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) +stt_de_contextnet_1024,de,,4.76,5.5,3.53,4.2,11.32,9.4 diff --git a/docs/source/asr/data/scores/de/quartznet15x5_de.csv b/docs/source/asr/data/scores/de/quartznet15x5_de.csv new file mode 100644 index 000000000000..17540903f41e --- /dev/null +++ b/docs/source/asr/data/scores/de/quartznet15x5_de.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set (v??) 
(de),MCV Dev-Set v7.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) +stt_de_quartznet15x5,de,11.78,,,,,, diff --git a/docs/source/asr/data/scores/en/citrinet_en.csv b/docs/source/asr/data/scores/en/citrinet_en.csv new file mode 100644 index 000000000000..42d8cff2cb9b --- /dev/null +++ b/docs/source/asr/data/scores/en/citrinet_en.csv @@ -0,0 +1,7 @@ +Model Name,Language,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,WSJ Dev 93,WSJ Eval 92 +stt_en_citrinet_256,en,4.2 % WER,10.7 % WER,4.4 % WER,10.7 % WER,,,,,,,,, +stt_en_citrinet_512,en,3.7 % WER,8.9 % WER,3.7 % WER,8.9 % WER,,,,,,,,, +stt_en_citrinet_1024,en,3.7 % WER,8.3 % WER,3.6 % WER,7.9 % WER,,,,,,,,, +stt_en_citrinet_256_gamma_0_25,en,4.7 %,10.6 %,4.8 %,10.7 %,,,,8.3 %,,,,5.8 %,3.6 % +stt_en_citrinet_512_gamma_0_25,en,4.0 %,9.0 %,3.9 %,9.0 %,,,,6.9 %,,,,4.4 %,3.6 % +stt_en_citrinet_1024_gamma_0_25,en,3.4 %,7.7 %,3.4 %,7.6 %,,,,6.2 %,,,,4.0 %,2.5 % diff --git a/docs/source/asr/data/scores/en/conformer_en.csv b/docs/source/asr/data/scores/en/conformer_en.csv new file mode 100644 index 000000000000..57a8ad69b0b2 --- /dev/null +++ b/docs/source/asr/data/scores/en/conformer_en.csv @@ -0,0 +1,14 @@ +Model Name,Language,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,WSJ Dev 93,WSJ Eval 92 +stt_en_conformer_ctc_small,en,3.6,8.1,3.7,8.1,,,,,,,,, +stt_en_conformer_ctc_medium,en,2.5,5.8,2.6,5.9,,,,,,,,, +stt_en_conformer_ctc_large,en,2.0,4.4,2.1,4.3,,,,,,,,, +stt_en_conformer_ctc_xlarge,en,1.77 %,3.79 %,2.00 %,3.74 %,7.88 %,,5.99 %,,6.44 %,22.90 %,5.50 %,2.36 %, +stt_en_conformer_ctc_small_ls,en,3.3,8.8,3.4,8.8,,,,,,,,, +stt_en_conformer_ctc_medium_ls,en,2.7,7.4,3.0,7.3,,,,,,,,, +stt_en_conformer_ctc_large_ls,en,2.4,6.2,2.7,6.0,,,,,,,,, +stt_en_conformer_transducer_small,en,2.8,6.6,2.5,6.6,,,,,,,,, +stt_en_conformer_transducer_medium,en,2.0,4.6,2.1,4.7,,,,,,,,, +stt_en_conformer_transducer_large,en,1.5,3.5,1.7,3.6,,,,,,,,, +stt_en_conformer_transducer_large_ls,en,2.1,5.0,2.3,5.1,,,,,,,,, +stt_en_conformer_transducer_xlarge,en,1.48 %,2.95 %,1.62 %,3.01 %,6.46 %,4.59 %,5.32 %,5.70 %,6.47 %,21.32 %,,2.05 %,1.17 % +stt_en_conformer_transducer_xxlarge,en,1.52 %,3.09 %,1.72 %,3.14 %,,5.29 %,5.85 %,6.64 %,,,,2.42 %,1.49 % diff --git a/docs/source/asr/data/scores/en/contextnet_en.csv b/docs/source/asr/data/scores/en/contextnet_en.csv new file mode 100644 index 000000000000..4a065dd299f8 --- /dev/null +++ b/docs/source/asr/data/scores/en/contextnet_en.csv @@ -0,0 +1,7 @@ +Model Name,Language,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,WSJ Dev 93,WSJ Eval 92 +stt_en_contextnet_256,en,3.3 %,7.9 %,3.3 %,8.0 %,,9.7 %,11.0 %,7.1 %,,,,4.6 %,3.2 % +stt_en_contextnet_512,en,2.0 %,4.8 %,2.2 %,5.0 %,,6.6 %,7.3 %,5.9 %,,,,2.8 %,1.4 % +stt_en_contextnet_1024,en,1.7 %,3.8 %,1.9 %,4.0 %,7.9 %,,5.9 %,5.2 %,6.5 %,21.7 %,4.7 %,2.3 %,1.3 % +stt_en_contextnet_256_mls,en,,9.0 %,,9.2 %,,9.4 %,10.9 %,,,,,, +stt_en_contextnet_512_mls,en,,5.2 %,,5.2 %,,5.6 %,6.6 %,,,,,, +stt_en_contextnet_1024_mls,en,,4.1 %,,4.2 %,,4.6 %,5.6 %,,,,,, diff --git 
a/docs/source/asr/data/scores/en/jasper10x5dr_en.csv b/docs/source/asr/data/scores/en/jasper10x5dr_en.csv new file mode 100644 index 000000000000..ac9b260c5bb3 --- /dev/null +++ b/docs/source/asr/data/scores/en/jasper10x5dr_en.csv @@ -0,0 +1,2 @@ +Model Name,Language,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,WSJ Dev 93,WSJ Eval 92 +stt_en_jasper10x5dr,en,3.74,10.21,,,,,,,,,,, diff --git a/docs/source/asr/data/scores/en/quartznet15x5_en.csv b/docs/source/asr/data/scores/en/quartznet15x5_en.csv new file mode 100644 index 000000000000..04aef4aa49dd --- /dev/null +++ b/docs/source/asr/data/scores/en/quartznet15x5_en.csv @@ -0,0 +1,2 @@ +Model Name,Language,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,WSJ Dev 93,WSJ Eval 92 +stt_en_quartznet15x5,en,4.38,11.3,,,,,,,,,,, diff --git a/docs/source/asr/data/scores/enes/conformer_enes.csv b/docs/source/asr/data/scores/enes/conformer_enes.csv new file mode 100644 index 000000000000..9e3cad59944c --- /dev/null +++ b/docs/source/asr/data/scores/enes/conformer_enes.csv @@ -0,0 +1,3 @@ +Model Name,Language,Fisher-Dev-Es,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Dev-Set v7.0 (en),MLS Dev (es),VoxPopuli Dev (es) +stt_enes_conformer_ctc_large,enes,16.7 %,2.2 %,5.5 %,2.6 %,5.5 %,5.8 %,3.5 %,5.7 % +stt_enes_conformer_transducer_large,enes,16.2 %,2.0 %,4.6 %,2.2 %,4.6 %,5.0 %,3.3 %,5.3 % diff --git a/docs/source/asr/data/scores/enes/contextnet_enes.csv b/docs/source/asr/data/scores/enes/contextnet_enes.csv new file mode 100644 index 000000000000..694820ac1b88 --- /dev/null +++ b/docs/source/asr/data/scores/enes/contextnet_enes.csv @@ -0,0 +1,2 @@ +Model Name,Language,Fisher-Dev-Es,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Dev-Set v7.0 (en),MLS Dev (es),VoxPopuli Dev (es) +stt_enes_contextnet_large,enes,14.8 %,2.2 %,5.6 %,2.3 %,5.5 %,4.7 %,3.0 %,5.0 % diff --git a/docs/source/asr/data/scores/es/citrinet_es.csv b/docs/source/asr/data/scores/es/citrinet_es.csv new file mode 100644 index 000000000000..9311fb2b04fd --- /dev/null +++ b/docs/source/asr/data/scores/es/citrinet_es.csv @@ -0,0 +1,3 @@ +Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) +stt_es_citrinet_512,es,,,,,,9.1 % WER,,10.3 % WER,,4.9 % WER,5.2 % WER,, +stt_es_citrinet_1024_gamma_0_25,es,19.9 %,21.3 %,19.1 %,15.8 %,15.9 %,,6.1 %,,6.8 %,3.5 %,4.1 %,5.6 %,7.0 % diff --git a/docs/source/asr/data/scores/es/conformer_es.csv b/docs/source/asr/data/scores/es/conformer_es.csv new file mode 100644 index 000000000000..10b28dc49f4e --- /dev/null +++ b/docs/source/asr/data/scores/es/conformer_es.csv @@ -0,0 +1,3 @@ +Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) 
(es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) +stt_es_conformer_ctc_large,es,23.7 %,25.3 %,22.4 %,18.3 %,18.5 %,,6.3 %,,6.9 %,4.3 %,4.2 %,6.1 %,7.5 % +stt_es_conformer_transducer_large,es,18.0 %,19.4 %,17.2 %,14.7 %,14.8 %,,4.6 %,,5.2 %,2.7 %,3.2 %,4.7 %,6.0 % diff --git a/docs/source/asr/data/scores/es/contextnet_es.csv b/docs/source/asr/data/scores/es/contextnet_es.csv new file mode 100644 index 000000000000..ec20b5708d93 --- /dev/null +++ b/docs/source/asr/data/scores/es/contextnet_es.csv @@ -0,0 +1,2 @@ +Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) +stt_es_contextnet_1024,es,19.1 %,20.7 %,18.2 %,15.3 %,15.1 %,,4.8 %,,5.2 %,3.1 %,3.5 %,5.1 %,6.2 % diff --git a/docs/source/asr/data/scores/es/quartznet15x5_es.csv b/docs/source/asr/data/scores/es/quartznet15x5_es.csv new file mode 100644 index 000000000000..79de5ce952d8 --- /dev/null +++ b/docs/source/asr/data/scores/es/quartznet15x5_es.csv @@ -0,0 +1,2 @@ +Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) +stt_es_quartznet15x5,es,,,,,,12.97,,,,,,, diff --git a/docs/source/asr/data/scores/fr/citrinet_fr.csv b/docs/source/asr/data/scores/fr/citrinet_fr.csv new file mode 100644 index 000000000000..651dcb849440 --- /dev/null +++ b/docs/source/asr/data/scores/fr/citrinet_fr.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set (v??) (fr),MCV Dev-Set v7.0 (fr),MCV Dev-Set v7.0 (fr) (No Hyphen),MCV Test-Set v7.0 (fr),MCV Test-Set v7.0 (fr) (No Hyphen),MLS Dev (en),MLS Dev (en) (No Hyphen),MLS Test (en),MLS Test (en) (No Hyphen) +stt_fr_citrinet_1024_gamma_0_25,fr,,10.76,9.90,12.20,11.11,6.66,6.19,5.53,5.12 diff --git a/docs/source/asr/data/scores/fr/conformer_fr.csv b/docs/source/asr/data/scores/fr/conformer_fr.csv new file mode 100644 index 000000000000..8f74dfe8cae0 --- /dev/null +++ b/docs/source/asr/data/scores/fr/conformer_fr.csv @@ -0,0 +1,3 @@ +Model Name,Language,MCV Dev-Set (v??) (fr),MCV Dev-Set v7.0 (fr),MCV Dev-Set v7.0 (fr) (No Hyphen),MCV Test-Set v7.0 (fr),MCV Test-Set v7.0 (fr) (No Hyphen),MLS Dev (en),MLS Dev (en) (No Hyphen),MLS Test (en),MLS Test (en) (No Hyphen) +stt_fr_conformer_ctc_large,fr,,8.35,7.88,9.63,9.01,5.88,5.90,4.91,4.63 +stt_fr_conformer_transducer_large,fr,,6.85,,7.95,,5.05,,4.10, diff --git a/docs/source/asr/data/scores/fr/contextnet_fr.csv b/docs/source/asr/data/scores/fr/contextnet_fr.csv new file mode 100644 index 000000000000..71f601871d15 --- /dev/null +++ b/docs/source/asr/data/scores/fr/contextnet_fr.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set (v??) 
(fr),MCV Dev-Set v7.0 (fr),MCV Dev-Set v7.0 (fr) (No Hyphen),MCV Test-Set v7.0 (fr),MCV Test-Set v7.0 (fr) (No Hyphen),MLS Dev (en),MLS Dev (en) (No Hyphen),MLS Test (en),MLS Test (en) (No Hyphen) +stt_fr_contextnet_1024,fr,,8.32,,9.42,,6.02,,5.01, diff --git a/docs/source/asr/data/scores/fr/quartznet15x5_fr.csv b/docs/source/asr/data/scores/fr/quartznet15x5_fr.csv new file mode 100644 index 000000000000..a30f447f4281 --- /dev/null +++ b/docs/source/asr/data/scores/fr/quartznet15x5_fr.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set (v??) (fr),MCV Dev-Set v7.0 (fr),MCV Dev-Set v7.0 (fr) (No Hyphen),MCV Test-Set v7.0 (fr),MCV Test-Set v7.0 (fr) (No Hyphen),MLS Dev (en),MLS Dev (en) (No Hyphen),MLS Test (en),MLS Test (en) (No Hyphen) +stt_fr_quartznet15x5,fr,14.01,,,,,,,, diff --git a/docs/source/asr/data/scores/it/quartznet15x5_it.csv b/docs/source/asr/data/scores/it/quartznet15x5_it.csv new file mode 100644 index 000000000000..2731c4cfb895 --- /dev/null +++ b/docs/source/asr/data/scores/it/quartznet15x5_it.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set (v??) (it) +stt_it_quartznet15x5,it,15.22 diff --git a/docs/source/asr/data/scores/pl/quartznet15x5_pl.csv b/docs/source/asr/data/scores/pl/quartznet15x5_pl.csv new file mode 100644 index 000000000000..5692e36037ac --- /dev/null +++ b/docs/source/asr/data/scores/pl/quartznet15x5_pl.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set (v??) (pl) +stt_pl_quartznet15x5,pl,14 diff --git a/docs/source/asr/data/scores/ru/quartznet15x5_ru.csv b/docs/source/asr/data/scores/ru/quartznet15x5_ru.csv new file mode 100644 index 000000000000..b1a7286f1b16 --- /dev/null +++ b/docs/source/asr/data/scores/ru/quartznet15x5_ru.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set (v??) (ru) +stt_ru_quartznet15x5,ru,16.23 diff --git a/docs/source/asr/data/scores/zh/citrinet_zh.csv b/docs/source/asr/data/scores/zh/citrinet_zh.csv new file mode 100644 index 000000000000..2ad05e0233e1 --- /dev/null +++ b/docs/source/asr/data/scores/zh/citrinet_zh.csv @@ -0,0 +1,3 @@ +Model Name,Language,AIShell Dev-Android v2,AIShell Dev-Ios v1,AIShell Dev-Ios v2,AIShell Dev-Mic v2,AIShell Test-Android v2,AIShell Test-Ios v1,AIShell Test-Ios v2,AIShell Test-Mic v2 +stt_zh_citrinet_512,zh,,6.25%,,,,6.44%,, +stt_zh_citrinet_1024_gamma_0_25,zh,5.2 %,,4.8 %,5.2 %,5.5 %,,5.1 %,5.5 % diff --git a/docs/source/asr/data/scores/zh/conformer_zh.csv b/docs/source/asr/data/scores/zh/conformer_zh.csv new file mode 100644 index 000000000000..8d0ef96dc8d9 --- /dev/null +++ b/docs/source/asr/data/scores/zh/conformer_zh.csv @@ -0,0 +1,2 @@ +Model Name,Language,AIShell Dev-Android v2,AIShell Dev-Ios v1,AIShell Dev-Ios v2,AIShell Dev-Mic v2,AIShell Test-Android v2,AIShell Test-Ios v1,AIShell Test-Ios v2,AIShell Test-Mic v2 +stt_zh_conformer_transducer_large,zh,3.4,,3.2,3.4,3.4,,3.2,3.4 diff --git a/docs/source/asr/intro.rst b/docs/source/asr/intro.rst index 9e2d76f19a87..eec1ea6692d6 100644 --- a/docs/source/asr/intro.rst +++ b/docs/source/asr/intro.rst @@ -37,6 +37,7 @@ The full documentation tree is as follows: datasets asr_language_modeling results + scores configs api resources diff --git a/docs/source/asr/scores.rst b/docs/source/asr/scores.rst new file mode 100644 index 000000000000..9eef95fa92b5 --- /dev/null +++ b/docs/source/asr/scores.rst @@ -0,0 +1,211 @@ +.. + AUTOGENERATED DOC: DO NOT EDIT MANUALLY ! + +Scores +------ + +EN +^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/en/citrinet_en.csv + +-------------------- + +.. 
csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/en/conformer_en.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/en/contextnet_en.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/en/jasper10x5dr_en.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/en/quartznet15x5_en.csv + +-------------------- + +CA +^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/ca/quartznet15x5_ca.csv + +-------------------- + +DE +^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/de/citrinet_de.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/de/conformer_de.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/de/contextnet_de.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/de/quartznet15x5_de.csv + +-------------------- + +ENES +^^^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/enes/conformer_enes.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/enes/contextnet_enes.csv + +-------------------- + +ES +^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/es/citrinet_es.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/es/conformer_es.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/es/contextnet_es.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/es/quartznet15x5_es.csv + +-------------------- + +FR +^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/fr/citrinet_fr.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/fr/conformer_fr.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/fr/contextnet_fr.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/fr/quartznet15x5_fr.csv + +-------------------- + +IT +^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/it/quartznet15x5_it.csv + +-------------------- + +PL +^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/pl/quartznet15x5_pl.csv + +-------------------- + +RU +^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/ru/quartznet15x5_ru.csv + +-------------------- + +ZH +^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/zh/citrinet_zh.csv + +-------------------- + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/zh/conformer_zh.csv + +-------------------- + diff --git a/examples/nlp/intent_slot_classification/data/assistant_utils.py b/examples/nlp/intent_slot_classification/data/assistant_utils.py deleted file mode 100644 index 8e9b451bfec1..000000000000 --- a/examples/nlp/intent_slot_classification/data/assistant_utils.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -import shutil - -from nemo.collections.nlp.data.data_utils.data_preprocessing import DATABASE_EXISTS_TMP, if_exist, write_files -from nemo.utils import logging - - -def copy_input_files(infold): - """ - Put training files in convenient place for conversion to our format. - - Args: - infold: location of an original fold of the dataset (in the sense of k-fold cross validation) - """ - our_infold = infold + "/dataset" - - if os.path.exists(our_infold + "/trainset") and os.path.exists(our_infold + "/testset"): - logging.info("Input folders exists") - return - - logging.info(f"Copying files to input folder: {our_infold}") - os.makedirs(infold, exist_ok=True) - - old_infold = ( - infold + '/CrossValidation/autoGeneFromRealAnno/autoGene_2018_03_22-13_01_25_169/CrossValidation/KFold_1' - ) - if not os.path.exists(our_infold + "/trainset"): - shutil.copytree(old_infold + '/trainset', our_infold + '/trainset') - - if not os.path.exists(our_infold + "/testset"): - shutil.copytree(old_infold + '/testset/csv', our_infold + '/testset') - - -def get_intents(infold): - """ Get list of intents from file names. """ - intents = [f[:-4] for f in os.listdir(infold)] - intents.sort() - logging.info(f'Found {len(intents)} intents') - - return intents - - -def get_intent_queries(infold, intent_names, mode): - """ Get list of queries with their corresponding intent number. """ - intent_queries = ['sentence\tlabel\n'] - - for index, intent in enumerate(intent_names): - queries = open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8').readlines() - for query in queries[1:]: - phrases = query.split(";") - intent_query = phrases[4][1:-1] + "\t" + str(index) - intent_queries.append(intent_query) - - return intent_queries - - -def get_slots(infold, modes): - """ - Find a list of unique slot types in training and testing data. - We use a single slot type name both for starting and continuation tokens (not using B-, I- notation). - """ - slots = set() - - for mode in modes: - path = f'{infold}/{mode}set' - for filename in os.listdir(path): - lines = open(f'{path}/{filename}', 'r', encoding='utf-8').readlines() - for line in lines[1:]: - query = line.split(";")[3] - slot_phrases = re.findall('\[.*?\]', query) - for slot_phrase in slot_phrases: - slot = slot_phrase.split(" : ")[0][1:] - slots.add(slot) - - slots = sorted(slots) - slots.append("O") - logging.info(f'Found {len(slots)} slot types') - - return slots - - -def get_slot_queries(infold, slot_dict, mode, intent_names): - """ - Convert each word in a query to corresponding slot number. 
- Args: - infold: fold of the data - slot_dict: dict containing slot-names to positions - mode: train, validation or test - intent_names: list of intents - """ - slot_queries = [] - outside_slot = len(slot_dict) - 1 - - # keep the same order of files/queries as for intents - for intent in intent_names: - lines = open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8').readlines() - for line in lines[1:]: - slot_query = "" - query = line.split(";")[3] - words = query.split(" ") - current_slot = outside_slot - for word in words: - if word[0] == "[": - current_slot = slot_dict[word[1:]] - elif word[0] == ":": - continue - else: - slot_query += str(current_slot) + " " - if word[-1] == ']': - current_slot = outside_slot - - slot_queries.append(slot_query.strip()) - - return slot_queries - - -def process_assistant(infold, outfold, modes=['train', 'test']): - """ - https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes - about 25 thousand examples with 66 various multi-domain intents and 57 entity types. - """ - if if_exist(outfold, [f'{mode}_slots.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('robot', outfold)) - return outfold - - logging.info(f'Processing assistant commands dataset and store at {outfold}') - os.makedirs(outfold, exist_ok=True) - - # copy train/test files to the convenient directory to work with - copy_input_files(infold) - infold += "/dataset" - - # get list of intents from train folder (test folder supposed to be the same) - intent_names = get_intents(infold + "/trainset") - write_files(intent_names, f'{outfold}/dict.intents.csv') - - # get all train and test queries with their intent - for mode in modes: - intent_queries = get_intent_queries(infold, intent_names, mode) - write_files(intent_queries, f'{outfold}/{mode}.tsv') - - # get list of all unique slots in training and testing files - slot_types = get_slots(infold, modes) - write_files(slot_types, f'{outfold}/dict.slots.csv') - - # create files of slot queries - slot_dict = {k: v for v, k in enumerate(slot_types)} - for mode in modes: - slot_queries = get_slot_queries(infold, slot_dict, mode, intent_names) - write_files(slot_queries, f'{outfold}/{mode}_slots.tsv') diff --git a/examples/nlp/intent_slot_classification/data/import_datasets.py b/examples/nlp/intent_slot_classification/data/import_datasets.py deleted file mode 100644 index 2468ed7927d2..000000000000 --- a/examples/nlp/intent_slot_classification/data/import_datasets.py +++ /dev/null @@ -1,289 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import os -import shutil -from os.path import exists - -from assistant_utils import process_assistant - -from nemo.collections.nlp.data.data_utils.data_preprocessing import ( - DATABASE_EXISTS_TMP, - MODE_EXISTS_TMP, - create_dataset, - get_dataset, - get_vocab, - if_exist, -) -from nemo.utils import logging - - -def ids2text(ids, vocab): - """ - Map list of ids of words in utterance to utterance - """ - return ' '.join([vocab[int(id_)] for id_ in ids]) - - -def process_atis(infold, outfold, modes=['train', 'test'], do_lower_case=False): - """ - Process ATIS dataset found at https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk - Args: - infold: location for input fold of data - outfold: location for output fold of data - modes: dataset splits to process - do_lowercase: whether to lowercase the input utterances - """ - vocab = get_vocab(f'{infold}/atis.dict.vocab.csv') - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold)) - return outfold - logging.info(f'Processing ATIS dataset and storing at {outfold}.') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w', encoding='utf-8') - outfiles[mode].write('sentence\tlabel\n') - outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w', encoding='utf-8') - - queries = open(f'{infold}/atis.{mode}.query.csv', 'r', encoding='utf-8').readlines() - intents = open(f'{infold}/atis.{mode}.intent.csv', 'r', encoding='utf-8').readlines() - slots = open(f'{infold}/atis.{mode}.slots.csv', 'r', encoding='utf-8').readlines() - - for i, query in enumerate(queries): - sentence = ids2text(query.strip().split()[1:-1], vocab) - if do_lower_case: - sentence = sentence.lower() - outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n') - slot = ' '.join(slots[i].strip().split()[1:-1]) - outfiles[mode + '_slots'].write(slot + '\n') - - shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv') - shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv') - for mode in modes: - outfiles[mode].close() - - -def process_snips(infold, outfold, do_lower_case, modes=['train', 'test'], dev_split=0.1): - """ - Process snips dataset - Args: - infold: location for input fold of data - outfold: location for output fold of data - do_lowercase: whether to lowercase the input utterances - modes: dataset splits to process - dev_split: proportion of train samples to put into dev set - """ - if not os.path.exists(infold): - link = 'https://github.com/snipsco/spoken-language-understanding-research-datasets' - raise ValueError(f'Data not found at {infold}. ' f'You may request to download the SNIPS dataset from {link}.') - - exist = True - for dataset in ['light', 'speak', 'all']: - if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset, outfold)) - else: - exist = False - if exist: - return outfold - - logging.info(f'Processing SNIPS dataset and storing at folders "speak", "light" and "all" under {outfold}.') - logging.info( - f'Processing and importing "smart-speaker-en-close-field" -> "speak" and "smart-speaker-en-close-field" -> "light".' 
- ) - - os.makedirs(outfold, exist_ok=True) - - speak_dir = 'smart-speaker-en-close-field' - light_dir = 'smart-lights-en-close-field' - - light_files = [f'{infold}/{light_dir}/dataset.json'] - speak_files = [f'{infold}/{speak_dir}/training_dataset.json'] - speak_files.append(f'{infold}/{speak_dir}/test_dataset.json') - - light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split) - speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files) - - create_dataset(light_train, light_dev, light_slots, light_intents, do_lower_case, f'{outfold}/light') - create_dataset(speak_train, speak_dev, speak_slots, speak_intents, do_lower_case, f'{outfold}/speak') - create_dataset( - light_train + speak_train, - light_dev + speak_dev, - light_slots | speak_slots, - light_intents | speak_intents, - do_lower_case, - f'{outfold}/all', - ) - - -def process_jarvis_datasets( - infold, outfold, modes=['train', 'test', 'dev'], do_lower_case=False, ignore_prev_intent=False -): - """ - Process and convert Jarvis datasets into NeMo's BIO format - Args: - infold: location for input fold of data - outfold: location for output fold of data - modes: dataset splits to process - do_lowercase: whether to lowercase the input utterances - ignore_prev_intent: whether to include intent from previous turn in predicting intent of current turn - """ - dataset_name = "jarvis" - if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']): - logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold)) - return outfold - - logging.info(f'Processing {dataset_name} dataset and storing at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - intents_list = {} - slots_list = {} - slots_list_all = {} - - outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w', encoding='utf-8') - outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w', encoding='utf-8') - - outfiles['dict_slots'].write('O\n') - slots_list["O"] = 0 - slots_list_all["O"] = 0 - - for mode in modes: - if if_exist(outfold, [f'{mode}.tsv']): - logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode)) - continue - - if not if_exist(infold, [f'{mode}.tsv']): - logging.info(f'{mode} mode of {dataset_name}' f' is skipped as it was not found.') - continue - - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w', encoding='utf-8') - outfiles[mode].write('sentence\tlabel\n') - outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w', encoding='utf-8') - - queries = open(f'{infold}/{mode}.tsv', 'r', encoding='utf-8').readlines() - - for i, query in enumerate(queries): - line_splits = query.strip().split("\t") - if len(line_splits) == 3: - intent_str, slot_tags_str, sentence = line_splits - else: - intent_str, sentence = line_splits - slot_tags_str = "" - - if intent_str not in intents_list: - intents_list[intent_str] = len(intents_list) - outfiles['dict_intents'].write(f'{intent_str}\n') - - if ignore_prev_intent: - start_token = 2 - else: - start_token = 1 - - if do_lower_case: - sentence = sentence.lower() - sentence_cld = " ".join(sentence.strip().split()[start_token:-1]) - outfiles[mode].write(f'{sentence_cld}\t' f'{str(intents_list[intent_str])}\n') - - slot_tags_list = [] - if slot_tags_str.strip(): - slot_tags = slot_tags_str.strip().split(",") - for st in slot_tags: - if not st.strip(): - continue - [start_i, end_i, slot_name] = st.strip().split(":") - slot_tags_list.append([int(start_i), int(end_i), slot_name]) - if slot_name not in slots_list: - 
slots_list[slot_name] = len(slots_list) - slots_list_all[f'B-{slot_name}'] = len(slots_list_all) - slots_list_all[f'I-{slot_name}'] = len(slots_list_all) - outfiles['dict_slots'].write(f'B-{slot_name}\n') - outfiles['dict_slots'].write(f'I-{slot_name}\n') - - slot_tags_list.sort(key=lambda x: x[0]) - slots = [] - processed_index = 0 - for tag_start, tag_end, tag_str in slot_tags_list: - if tag_start > processed_index: - words_list = sentence[processed_index:tag_start].strip().split() - slots.extend([str(slots_list_all['O'])] * len(words_list)) - words_list = sentence[tag_start:tag_end].strip().split() - slots.append(str(slots_list_all[f'B-{tag_str}'])) - slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1)) - processed_index = tag_end - - if processed_index < len(sentence): - words_list = sentence[processed_index:].strip().split() - slots.extend([str(slots_list_all['O'])] * len(words_list)) - - slots = slots[1:-1] - slot = ' '.join(slots) - outfiles[mode + '_slots'].write(slot + '\n') - - outfiles[mode + '_slots'].close() - outfiles[mode].close() - - outfiles['dict_slots'].close() - outfiles['dict_intents'].close() - - return outfold - - -if __name__ == "__main__": - # Parse the command-line arguments. - parser = argparse.ArgumentParser(description="Process and convert datasets into NeMo\'s format.") - parser.add_argument( - "--dataset_name", required=True, type=str, choices=['atis', 'snips', 'jarvis', 'assistant'], - ) - parser.add_argument( - "--source_data_dir", required=True, type=str, help='path to the folder containing the dataset files' - ) - parser.add_argument("--target_data_dir", required=True, type=str, help='path to save the processed dataset') - parser.add_argument("--do_lower_case", action='store_true') - parser.add_argument( - "--ignore_prev_intent", - action='store_true', - help='ignores previous intent while importing datasets in jarvis\'s format', - ) - - args = parser.parse_args() - - dataset_name = args.dataset_name - source_dir = args.source_data_dir - target_dir = args.target_data_dir - - if not exists(source_dir): - raise FileNotFoundError(f"{source_dir} does not exist.") - - if dataset_name == 'atis': - process_atis(infold=source_dir, outfold=target_dir, do_lower_case=args.do_lower_case) - elif dataset_name == 'snips': - process_snips(infold=source_dir, outfold=target_dir, do_lower_case=args.do_lower_case) - elif dataset_name == 'jarvis': - process_jarvis_datasets( - infold=source_dir, - outfold=target_dir, - modes=["train", "test", "dev"], - do_lower_case=args.do_lower_case, - ignore_prev_intent=args.ignore_prev_intent, - ) - elif dataset_name == 'assistant': - process_assistant(infold=source_dir, outfold=target_dir) - else: - raise ValueError(f'Dataset {dataset_name} is not supported.') diff --git a/examples/nlp/token_classification/conf/token_classification_config.yaml b/examples/nlp/token_classification/conf/token_classification_config.yaml index 661029c49769..1a5135dfc31d 100644 --- a/examples/nlp/token_classification/conf/token_classification_config.yaml +++ b/examples/nlp/token_classification/conf/token_classification_config.yaml @@ -48,7 +48,7 @@ model: pad_label: 'O' ignore_extra_tokens: false ignore_start_end: false - use_cache: true + use_cache: false # shared among dataloaders num_workers: 2 pin_memory: false diff --git a/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py b/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py index 71a8e393d6e3..faab45b64c6e 100644 --- 
a/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py +++ b/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py @@ -43,6 +43,10 @@ by default. You may update the config file from the file directly. The other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. +Additional default parameters can be found in PunctuationCapitalizationDataConfigBase from +/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py, +use `+` to modify their values via command line, e.g.: `+model.train_ds.num_workers=2` + For more details about the config files and different ways of model restoration, see tutorials/00_NeMo_Primer.ipynb *** Model training *** @@ -54,7 +58,8 @@ model.train_ds.labels_file= \ model.validation_ds.ds_item= \ model.validation_ds.text_file= \ - model.validation_ds.labels_file= + model.validation_ds.labels_file= \ + ~model.test_ds To use one of the pretrained versions of the model and finetune it, run: python punctuation_capitalization_train_evaluate.py \ @@ -64,7 +69,8 @@ model.train_ds.labels_file= \ model.validation_ds.ds_item= \ model.validation_ds.text_file= \ - model.validation_ds.labels_file= + model.validation_ds.labels_file= \ + ~model.test_ds pretrained_model - pretrained PunctuationCapitalization model from list_available_models() or path to a .nemo file, for example: punctuation_en_bert or model.nemo @@ -120,8 +126,11 @@ def main(cfg: DictConfig) -> None: model = PunctuationCapitalizationModel.from_pretrained(cfg.pretrained_model) else: raise ValueError( - f'Provide path to the pre-trained .nemo file or choose from ' - f'{PunctuationCapitalizationModel.list_available_models()}' + f'Config parameter `pretrained_model` should contain a path to the pre-trained .nemo file or a model ' + f'name from ' + f'{[m.pretrained_model_name for m in PunctuationCapitalizationModel.list_available_models()]}. ' + f'Provided `pretrained_model="{cfg.pretrained_model}"` is neither a valid path nor a valid model ' + f'name.' 
) model.update_config_after_restoring_from_checkpoint( class_labels=cfg.model.class_labels, diff --git a/examples/tts/conf/hifigan/hifigan.yaml b/examples/tts/conf/hifigan/hifigan.yaml index 7cc24a5ae974..de2ce001318f 100644 --- a/examples/tts/conf/hifigan/hifigan.yaml +++ b/examples/tts/conf/hifigan/hifigan.yaml @@ -64,7 +64,7 @@ model: min_lr: 1e-5 warmup_ratio: 0.02 - max_steps: 25000000 + max_steps: 2500000 l1_loss_factor: 45 denoise_strength: 0.0025 diff --git a/examples/tts/conf/hifigan/hifigan_44100.yaml b/examples/tts/conf/hifigan/hifigan_44100.yaml index 3e7a00b81554..4db2566408e9 100644 --- a/examples/tts/conf/hifigan/hifigan_44100.yaml +++ b/examples/tts/conf/hifigan/hifigan_44100.yaml @@ -64,7 +64,7 @@ model: min_lr: 1e-5 warmup_ratio: 0.02 - max_steps: 25000000 + max_steps: 2500000 l1_loss_factor: 45 denoise_strength: 0.0025 diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index fa04afc9283d..15e91cc76e9e 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -396,7 +396,7 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]: model = PretrainedModelInfo( pretrained_model_name="stt_de_citrinet_1024", description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_citrinet_1024", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_citrinet_1024/versions/1.3.2/files/stt_de_citrinet_1024.nemo", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_citrinet_1024/versions/1.5.0/files/stt_de_citrinet_1024.nemo", ) results.append(model) diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index 443b323030da..2540bd941aea 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -157,8 +157,8 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: model = PretrainedModelInfo( pretrained_model_name="stt_fr_conformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_conformer_transducer_large/versions/1.5/files/stt_fr_conformer_transducer_large.nemo", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_conformer_transducer_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_conformer_transducer_large/versions/1.5/files/stt_fr_conformer_transducer_large.nemo", ) results.append(model) diff --git a/nemo/collections/asr/parts/utils/vad_utils.py b/nemo/collections/asr/parts/utils/vad_utils.py index f744f5e163e4..28008bccef80 100644 --- a/nemo/collections/asr/parts/utils/vad_utils.py +++ b/nemo/collections/asr/parts/utils/vad_utils.py @@ -29,6 +29,7 @@ from pyannote.core import Annotation, Segment from pyannote.metrics import detection from sklearn.model_selection import ParameterGrid +from tqdm import tqdm from nemo.collections.asr.models import EncDecClassificationModel from nemo.utils import logging @@ -78,11 +79,11 @@ def prepare_manifest(config: dict) -> str: } if config.get('num_workers') is not None and config['num_workers'] > 1: - p = multiprocessing.Pool(processes=config['num_workers']) - results = p.starmap(write_vad_infer_manifest, zip(input_list, repeat(args_func))) - p.close() + 
with multiprocessing.Pool(processes=config['num_workers']) as p: + inputs = zip(input_list, repeat(args_func)) + results = list(tqdm(p.imap(write_vad_infer_manifest_star, inputs), total=len(input_list))) else: - results = [write_vad_infer_manifest(input_el, args_func) for input_el in input_list] + results = [write_vad_infer_manifest(input_el, args_func) for input_el in tqdm(input_list)] if os.path.exists(manifest_vad_input): logging.info("The prepared manifest file exists. Overwriting!") @@ -97,6 +98,13 @@ def prepare_manifest(config: dict) -> str: return manifest_vad_input +def write_vad_infer_manifest_star(args): + """ + A workaround for tqdm with starmap of multiprocessing + """ + return write_vad_infer_manifest(*args) + + def write_vad_infer_manifest(file: dict, args_func: dict) -> list: """ Used by prepare_manifest. @@ -256,17 +264,24 @@ def generate_overlap_vad_seq( "smoothing_method": smoothing_method, } if num_workers is not None and num_workers > 1: - p = multiprocessing.Pool(processes=num_workers) - p.starmap(generate_overlap_vad_seq_per_file, zip(frame_filepathlist, repeat(per_args))) - p.close() - p.join() + with multiprocessing.Pool(processes=num_workers) as p: + inputs = zip(frame_filepathlist, repeat(per_args)) + results = list(tqdm(p.imap(generate_overlap_vad_seq_per_file_star, inputs), total=len(frame_filepathlist))) + else: - for frame_filepath in frame_filepathlist: + for frame_filepath in tqdm(frame_filepathlist): generate_overlap_vad_seq_per_file(frame_filepath, per_args) return overlap_out_dir +def generate_overlap_vad_seq_per_file_star(args): + """ + A workaround for tqdm with starmap of multiprocessing + """ + return generate_overlap_vad_seq_per_file(*args) + + @torch.jit.script def generate_overlap_vad_seq_per_tensor( frame: torch.Tensor, per_args: Dict[str, float], smoothing_method: str @@ -691,17 +706,24 @@ def generate_vad_segment_table( per_args = {**per_args, **postprocessing_params} if num_workers is not None and num_workers > 1: - p = multiprocessing.Pool(processes=num_workers) - p.starmap(generate_vad_segment_table_per_file, zip(vad_pred_filepath_list, repeat(per_args))) - p.close() - p.join() + with multiprocessing.Pool(num_workers) as p: + inputs = zip(vad_pred_filepath_list, repeat(per_args)) + list(tqdm(p.imap(generate_vad_segment_table_per_file_star, inputs), total=len(vad_pred_filepath_list))) + else: - for vad_pred_filepath in vad_pred_filepath_list: + for vad_pred_filepath in tqdm(vad_pred_filepath_list): generate_vad_segment_table_per_file(vad_pred_filepath, per_args) return table_out_dir +def generate_vad_segment_table_per_file_star(args): + """ + A workaround for tqdm with starmap of multiprocessing + """ + return generate_vad_segment_table_per_file(*args) + + def vad_construct_pyannote_object_per_file( vad_table_filepath: str, groundtruth_RTTM_file: str ) -> Tuple[Annotation, Annotation]: diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py index 4b9ff6d5b27e..5e0c290b8a12 100644 --- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py +++ b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py @@ -31,7 +31,7 @@ import multiprocessing as mp import os import pickle -import random +import tempfile from dataclasses import dataclass from math import ceil from pathlib import Path @@ -975,13 +975,24 @@ def __init__( n_jobs=n_jobs, ) 
self.features_pkl.parent.mkdir(parents=True, exist_ok=True) - pickle.dump(tuple(list(features) + [punct_label_ids, capit_label_ids]), self.features_pkl.open("wb")) + + # save features to a temp file first to make sure that non-master processes don't start reading the file + # until the master process is done with writing + ofd, tmp_features_pkl = tempfile.mkstemp( + suffix='.pkl', prefix=os.path.basename(self.features_pkl), dir=os.path.dirname(self.features_pkl) + ) + with os.fdopen(ofd, 'wb') as temp_f: + pickle.dump(tuple(list(features) + [punct_label_ids, capit_label_ids]), temp_f) + + os.rename(tmp_features_pkl, self.features_pkl) + if self.verbose: logging.info(f'Features saved to {self.features_pkl}') # wait until the master process writes to the processed data files - if torch.distributed.is_initialized(): - torch.distributed.barrier() + if not master_device: + while features is None and not os.path.exists(self.features_pkl): + sleep(10) if features is None: features = pickle.load(self.features_pkl.open('rb')) diff --git a/nemo/collections/nlp/data/token_classification/token_classification_dataset.py b/nemo/collections/nlp/data/token_classification/token_classification_dataset.py index 3a1d1f3c6df4..134f2e98b522 100644 --- a/nemo/collections/nlp/data/token_classification/token_classification_dataset.py +++ b/nemo/collections/nlp/data/token_classification/token_classification_dataset.py @@ -22,10 +22,11 @@ import os import pickle +import tempfile +import time from typing import Dict, List, Optional import numpy as np -import torch from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.collections.nlp.data.data_utils.data_preprocessing import get_stats @@ -266,12 +267,21 @@ def __init__( ignore_start_end=ignore_start_end, ) - pickle.dump(features, open(features_pkl, "wb")) + # save features to a temp file first to make sure that non-master processes don't start reading the file + # until the master process is done with writing + ofd, tmp_features_pkl = tempfile.mkstemp( + suffix='.pkl', prefix=os.path.basename(features_pkl), dir=os.path.dirname(features_pkl) + ) + with os.fdopen(ofd, 'wb') as temp_f: + pickle.dump(features, temp_f) + + os.rename(tmp_features_pkl, features_pkl) logging.info(f'features saved to {features_pkl}') # wait until the master process writes to the processed data files - if torch.distributed.is_initialized(): - torch.distributed.barrier() + if not master_device: + while features is None and not os.path.exists(features_pkl): + time.sleep(10) if features is None: features = pickle.load(open(features_pkl, 'rb')) diff --git a/nemo/collections/nlp/models/duplex_text_normalization/duplex_decoder.py b/nemo/collections/nlp/models/duplex_text_normalization/duplex_decoder.py index 8d441d25b4d6..5aff603ffabd 100644 --- a/nemo/collections/nlp/models/duplex_text_normalization/duplex_decoder.py +++ b/nemo/collections/nlp/models/duplex_text_normalization/duplex_decoder.py @@ -18,7 +18,6 @@ from typing import Dict, List, Optional, Union import torch -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from omegaconf import DictConfig from pytorch_lightning import Trainer from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq @@ -36,6 +35,13 @@ from nemo.core.neural_types import ChannelType, LabelsType, LossType, MaskType, NeuralType from nemo.utils import logging +try: + from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio + + 
PYNINI_AVAILABLE = True +except (ModuleNotFoundError, ImportError) as e: + PYNINI_AVAILABLE = False + __all__ = ['DuplexDecoderModel'] @@ -98,6 +104,9 @@ def setup_cgs(self, cfg: DictConfig): input_case = 'cased' # input_case is cased by default if hasattr(self.tokenizer, 'do_lower_case') and self.tokenizer.do_lower_case: input_case = 'lower_cased' + + if not PYNINI_AVAILABLE: + raise ValueError("pynini is not installed") self.cg_normalizer = NormalizerWithAudio(input_case=input_case, lang=self.lang) @typecheck() diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py index 24260b2847f4..a0152f47dd84 100644 --- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py +++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py @@ -649,6 +649,16 @@ def on_validation_epoch_end(self): data_parallel_size=parallel_state.get_data_parallel_world_size(), ) + def on_validation_epoch_start(self): + app_state = AppState() + _reconfigure_microbatch_calculator( + rank=app_state.global_rank, + rampup_batch_size=None, + global_batch_size=parallel_state.get_data_parallel_world_size(), + micro_batch_size=1, + data_parallel_size=parallel_state.get_data_parallel_world_size(), + ) + @torch.no_grad() def translate( self, diff --git a/nemo/collections/nlp/models/token_classification/token_classification_model.py b/nemo/collections/nlp/models/token_classification/token_classification_model.py index 7b9531119fa7..5be5a2d3d46b 100644 --- a/nemo/collections/nlp/models/token_classification/token_classification_model.py +++ b/nemo/collections/nlp/models/token_classification/token_classification_model.py @@ -498,7 +498,7 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]: result = [] model = PretrainedModelInfo( pretrained_model_name="ner_en_bert", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/ner_en_bert/versions/1.0.0rc1/files/ner_en_bert.nemo", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/ner_en_bert/versions/1.10/files/ner_en_bert.nemo", description="The model was trained on GMB (Groningen Meaning Bank) corpus for entity recognition and achieves 74.61 F1 Macro score.", ) result.append(model) diff --git a/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py b/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py index 4463d65705b8..8e9b451bfec1 100644 --- a/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py +++ b/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,7 +21,12 @@ def copy_input_files(infold): - """ Put training files in convenient place for conversion to our format. """ + """ + Put training files in convenient place for conversion to our format. + + Args: + infold: location of an original fold of the dataset (in the sense of k-fold cross validation) + """ our_infold = infold + "/dataset" if os.path.exists(our_infold + "/trainset") and os.path.exists(our_infold + "/testset"): @@ -45,7 +50,8 @@ def get_intents(infold): """ Get list of intents from file names. 
""" intents = [f[:-4] for f in os.listdir(infold)] intents.sort() - print(f'Found {len(intents)} intents') + logging.info(f'Found {len(intents)} intents') + return intents @@ -65,8 +71,8 @@ def get_intent_queries(infold, intent_names, mode): def get_slots(infold, modes): """ - Find a lost of unique slot types in training and testing data. - We use a single slot type name both for starting and continuation tokes (not using B-, I- notation). + Find a list of unique slot types in training and testing data. + We use a single slot type name both for starting and continuation tokens (not using B-, I- notation). """ slots = set() @@ -83,12 +89,20 @@ def get_slots(infold, modes): slots = sorted(slots) slots.append("O") - print(f'Found {len(slots)} slot types') + logging.info(f'Found {len(slots)} slot types') + return slots def get_slot_queries(infold, slot_dict, mode, intent_names): - """ Convert each word in a query to corresponding slot number. """ + """ + Convert each word in a query to corresponding slot number. + Args: + infold: fold of the data + slot_dict: dict containing slot-names to positions + mode: train, validation or test + intent_names: list of intents + """ slot_queries = [] outside_slot = len(slot_dict) - 1 diff --git a/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py b/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py old mode 100755 new mode 100644 index bbbd54e97e05..2468ed7927d2 --- a/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py +++ b/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -31,12 +31,20 @@ def ids2text(ids, vocab): + """ + Map list of ids of words in utterance to utterance + """ return ' '.join([vocab[int(id_)] for id_ in ids]) def process_atis(infold, outfold, modes=['train', 'test'], do_lower_case=False): - """ MSFT's dataset, processed by Kaggle - https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk + """ + Process ATIS dataset found at https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk + Args: + infold: location for input fold of data + outfold: location for output fold of data + modes: dataset splits to process + do_lower_case: whether to lowercase the input utterances """ vocab = get_vocab(f'{infold}/atis.dict.vocab.csv') @@ -72,6 +80,15 @@ def process_atis(infold, outfold, modes=['train', 'test'], do_lower_case=False): def process_snips(infold, outfold, do_lower_case, modes=['train', 'test'], dev_split=0.1): + """ + Process SNIPS dataset + Args: + infold: location for input fold of data + outfold: location for output fold of data + do_lower_case: whether to lowercase the input utterances + modes: dataset splits to process + dev_split: proportion of train samples to put into dev set + """ if not os.path.exists(infold): link = 'https://github.com/snipsco/spoken-language-understanding-research-datasets' raise ValueError(f'Data not found at {infold}. 
' f'You may request to download the SNIPS dataset from {link}.') @@ -117,7 +134,14 @@ def process_snips(infold, outfold, do_lower_case, modes=['train', 'test'], dev_s def process_jarvis_datasets( infold, outfold, modes=['train', 'test', 'dev'], do_lower_case=False, ignore_prev_intent=False ): - """ process and convert Jarvis datasets into NeMo's BIO format + """ + Process and convert Jarvis datasets into NeMo's BIO format + Args: + infold: location for input fold of data + outfold: location for output fold of data + modes: dataset splits to process + do_lower_case: whether to lowercase the input utterances + ignore_prev_intent: whether to ignore the intent from the previous turn when predicting the intent of the current turn """ dataset_name = "jarvis" if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']): diff --git a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb index aa602bed8dca..7119f7e2de0e 100644 --- a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb +++ b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb @@ -172,7 +172,7 @@ "\n", "Here are some common guidelines we encourage (but do not enforce) users to follow : \n", "\n", - "- `Task name`: Usually a short 2-3 character represenation of the task that the model performs.\n", + "- `Task name`: Usually a short 2-3 character representation of the task that the model performs.\n", " - `stt` = Speech To Text (ASR)\n", " - `tts` = Text to Speech (TTS)\n", " - `ssl` = (Speech) Self Supervised Learning (SSL)\n", diff --git a/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb b/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb index 1f92bdca205d..7917a5da2170 100644 --- a/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb +++ b/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb @@ -119,13 +119,13 @@ "# You can ignore it if run locally but do make sure to change the filepaths of scripts and config file in cells below.\n", "!mkdir -p scripts\n", "if not os.path.exists(\"scripts/vad_infer.py\"):\n", - " !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/speech_classification/vad_infer.py\n", + " !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/speech_classification/vad_infer.py\n", "if not os.path.exists(\"scripts/transcribe_speech.py\"):\n", - " !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/transcribe_speech.py\n", + " !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/transcribe_speech.py\n", " \n", "!mkdir -p conf/vad\n", "if not os.path.exists(\"conf/vad/vad_inference_postprocessing.yaml\"):\n", - " !wget -P conf/vad/ https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/conf/vad/vad_inference_postprocessing.yaml" + " !wget -P conf/vad/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/vad/vad_inference_postprocessing.yaml" ] }, { diff --git a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb index 604e16b5fd32..10bd5afd2b50 100644 --- a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb +++ b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb @@ -1013,7 +1013,7 @@ "!python scripts/speech_to_text_eval.py \\\n", " model_path=\"/content/adapted_model.nemo\" \\\n", " dataset_manifest=$TEST_MANIFEST \\\n", - " output_filename=\"adapted_predictions.json\" \\\n", + " output_filename=\"/content/adapted_predictions.json\" \\\n", " batch_size=32 
\\\n", " use_cer=False" ], @@ -1088,7 +1088,7 @@ "!python scripts/speech_to_text_eval.py \\\n", " model_path=\"/content/adapter_disabled_model.nemo\" \\\n", " dataset_manifest=$TEST_MANIFEST \\\n", - " output_filename=\"adapter_disabled_predictions.json\" \\\n", + " output_filename=\"/content/adapter_disabled_predictions.json\" \\\n", " batch_size=32 \\\n", " use_cer=False" ], diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb index 3c7288d93d05..ddd3bdd4f929 100644 --- a/tutorials/nlp/Dialogue.ipynb +++ b/tutorials/nlp/Dialogue.ipynb @@ -27,8 +27,9 @@ "outputs": [], "source": [ "import os \n", + "BRANCH = 'main'\n", "!apt-get update && apt-get install -y libsndfile1 ffmpeg\n", - "!git clone https://github.com/NVIDIA/NeMo --branch r1.9.0\n", + "!git clone https://github.com/NVIDIA/NeMo --branch $BRANCH\n", "os.chdir('NeMo')\n", "!./reinstall.sh\n", "os.chdir('..')\n" @@ -107,7 +108,7 @@ "!wget https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip\n", "!unzip master.zip\n", "# convert the dataset to the NeMo format\n", - "!python NeMo/examples/nlp/intent_slot_classification/data/import_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant" + "!python NeMo/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant" ] }, { diff --git a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb index 8c69198565cc..26f82121410c 100644 --- a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb +++ b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb @@ -105,8 +105,8 @@ "NEMO_DIR = '.'\n", "\n", "# download the converter files from github for the purpose of this tutorial\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/data/import_datasets.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/data/assistant_utils.py', NEMO_DIR)" + "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py', NEMO_DIR)\n", + "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py', NEMO_DIR)" ] }, { @@ -611,9 +611,9 @@ "\n", "\n", "# download the converter files from github for the purpose of this tutorial\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/data/import_datasets.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/data/assistant_utils.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/data/convert_datasets.py', NEMO_DIR)\n", + "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py', NEMO_DIR)\n", + "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py', NEMO_DIR)\n", + "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/convert_datasets.py', NEMO_DIR)\n", "\n", "# Get original 
atis dataset\n", "!python {NEMO_DIR}/import_datasets.py --dataset_name=atis --source_data_dir={DATA_DIR} --target_data_dir={DATA_DIR}/nemo_format\n", @@ -639,7 +639,7 @@ "outputs": [], "source": [ "# download the data augmentation script\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/data/augment_training_data.py', NEMO_DIR)" + "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/augment_training_data.py', NEMO_DIR)" ] }, { @@ -824,4 +824,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +} diff --git a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb index 3993b046b7d5..3dc3d6ce192e 100644 --- a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb +++ b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb @@ -637,7 +637,7 @@ "Open another Jupyter notebook or terminal, and run the following in a cell. \n", "```python\n", "!python megatron_gpt_eval.py \\\n", - " model_file=tabular.nemo \\\n", + " gpt_model_file=tabular.nemo \\\n", " prompts=[\\'\\',\\'\\'] \\\n", " server=True\n", "```\n", diff --git a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb index 05c1749fde2e..f4b8e66b3892 100644 --- a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb +++ b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb @@ -848,7 +848,7 @@ "os.environ[\"RANK\"] = '0'\n", "os.environ[\"WORLD_SIZE\"] = '1'\n", "\n", - "plugins = [NLPDDPPlugin(find_unused_parameters=False), TorchElasticEnvironment()]\n", + "plugins = [NLPDDPPlugin(find_unused_parameters=False, no_ddp_communication_hook=True), TorchElasticEnvironment()]\n", "trainer = pl.Trainer(plugins=plugins, **config.trainer)\n", "\n", "print(\"Trainer config - \\n\")\n", @@ -901,7 +901,7 @@ "source": [ "# Set some of the learning parameters\n", "config.model.optim.lr = 1e-4\n", - "config.model.batch_size = 16" + "config.model.precision = config.trainer.precision" ] }, { @@ -1009,7 +1009,9 @@ "cell_type": "code", "execution_count": null, "id": "74a5a358", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "response = model.generate(inputs=test_examples, length_params=None)\n", @@ -1032,15 +1034,27 @@ "We need to update:\n", "\n", "1. `name`\n", - "3. `model.restore_path`\n", - "5. `model.existing_tasks`\n", - "6. `model.new_tasks`\n", - "7. `model.data.train_ds`\n", - "8. `model.data.validation_ds`\n", + "2. `model.restore_path`\n", + "3. `model.existing_tasks`\n", + "4. `model.new_tasks`\n", + "5. `model.virtual_prompt_style`\n", + "6. `model.data.train_ds`\n", + "7. `model.data.validation_ds`\n", "\n", "Remember that we already set `task_templates` for SQuAD when we were defining the task template for the other two tasks. We would add it here if we had not already set it above." 
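Both the megatron_nmt_model.py hunk earlier in this diff and the notebook cell added just below reset apex's global microbatch calculator before a new session, so stale batch-size state from the previous run cannot leak in. A hedged standalone sketch of that call follows; the batch-size values here are placeholders, not values taken from the diff.

```python
# Sketch: re-initialize apex's global microbatch calculator before a new
# training or inference session, mirroring the calls shown in this diff.
from apex.transformer import parallel_state
from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator

from nemo.utils import AppState

app_state = AppState()
_reconfigure_microbatch_calculator(
    rank=app_state.global_rank,
    rampup_batch_size=None,
    global_batch_size=8,  # placeholder; use config.model.global_batch_size in practice
    micro_batch_size=2,   # placeholder; use config.model.micro_batch_size in practice
    data_parallel_size=parallel_state.get_data_parallel_world_size(),
)
```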
] }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5ec279d", + "metadata": {}, + "outputs": [], + "source": [ + "# Change the experiment name\n", + "config.name = 'squad_p_tuning'" + ] + }, { "cell_type": "markdown", "id": "6adb09a3", @@ -1052,13 +1066,10 @@ { "cell_type": "code", "execution_count": null, - "id": "b5ec279d", + "id": "2e196967", "metadata": {}, "outputs": [], "source": [ - "# Change the experiment name\n", - "config.name = 'squad_p_tuning'\n", - "\n", "# Change restore path from null to the p-tuned model we just finished training\n", "config.model.restore_path = \"multitask_p_tuned_gpt.nemo\"\n", "\n", @@ -1067,6 +1078,25 @@ "config.model.new_tasks = [\"squad\"]" ] }, + { + "cell_type": "markdown", + "id": "4dc088ec", + "metadata": {}, + "source": [ + "After the first round of p-tuning finishes, ``virtual_prompt_style`` is automatically set to ``inference`` so that the prompt learning model is ready to use as soon as training completes. For the second round of p-tuning, we need to set ``virtual_prompt_style`` back to ``p-tuning``." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c49128a1", + "metadata": {}, + "outputs": [], + "source": [ + "# Reset virtual prompt style to \"p-tuning\" from \"inference\"\n", + "config.model.virtual_prompt_style = \"p-tuning\"" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1102,18 +1132,40 @@ "# Limiting the number of validation batches for sake of time\n", "config.trainer.limit_val_batches = 100\n", "\n", + "# Adjust learning rate for the task\n", "config.model.optim.lr = 5e-4\n", - "config.model.optim.sched.min_lr = 1e-5\n", - "config.model.batch_size = 4\n", "\n", "# Reset the trainer\n", - "plugins = [NLPDDPPlugin(find_unused_parameters=False), TorchElasticEnvironment()]\n", + "plugins = [NLPDDPPlugin(find_unused_parameters=False, no_ddp_communication_hook=True), TorchElasticEnvironment()]\n", "trainer = pl.Trainer(plugins=plugins, **config.trainer)\n", "\n", "print(\"Trainer config - \\n\")\n", "print(OmegaConf.to_yaml(config.trainer))" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ac21b0c", + "metadata": {}, + "outputs": [], + "source": [ + "from apex.transformer import parallel_state\n", + "from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator\n", + "from nemo.utils import AppState\n", + "\n", + "app_state = AppState()\n", + "\n", + "# Need to reconfigure micro batch calculator with apex for new p-tuning session\n", + "_reconfigure_microbatch_calculator(\n", + " rank=app_state.global_rank,\n", + " rampup_batch_size=None,\n", + " global_batch_size=config.model.global_batch_size,\n", + " micro_batch_size=config.model.micro_batch_size,\n", + " data_parallel_size=parallel_state.get_data_parallel_world_size(),\n", + ")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1145,7 +1197,7 @@ "execution_count": null, "id": "1b3d95f1", "metadata": { - "scrolled": true + "scrolled": false }, "outputs": [], "source": [ diff --git a/tutorials/nlp/Non_English_Downstream_Tasks_(NER).ipynb b/tutorials/nlp/Non_English_Downstream_Tasks_(NER).ipynb old mode 100755 new mode 100644 index 0d5826bee4ea..443f0713a45c --- a/tutorials/nlp/Non_English_Downstream_Tasks_(NER).ipynb +++ b/tutorials/nlp/Non_English_Downstream_Tasks_(NER).ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "id": "OETcTQlcguCm" }, @@ -13,20 +13,11 @@ }, {
"cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "id": "o_0K1lsW1dj9" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting nemo_toolkit[nlp]\r\n", - "\u001b[31mERROR: The URL 'git+https://github.com/NVIDIA/NeMo.git@#egg=nemo_toolkit[nlp]' has an empty revision (after @) which is not supported. Include a revision after @ or remove @ from the URL.\u001b[0m\r\n" - ] - } - ], + "outputs": [], "source": [ "\"\"\"\n", "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", @@ -81,7 +72,8 @@ "from omegaconf import OmegaConf\n", "\n", "import zipfile\n", - "import random" + "import random\n", + "from glob import glob" ] }, { @@ -187,9 +179,16 @@ "# path to the folder with ArmanPersoNERCorpus.zip file (if running locally on in a docker)\n", "DATA_DIR = \"PATH_TO_FOLDER_WITH_ZIP.ZIP_FILE\"\n", "WORK_DIR = \"WORK_DIR\"\n", - "MODEL_CONFIG = \"token_classification_config.yaml\"\n", + "\n", + "# adding an empty subfolder for data (otherwise it can interact with existing folders in DATA_DIR)\n", + "subfolder = f\"{DATA_DIR}/non_eng_NER\"\n", + "\n", "os.makedirs(WORK_DIR, exist_ok=True)\n", - "os.makedirs(DATA_DIR, exist_ok=True)" + "os.makedirs(DATA_DIR, exist_ok=True)\n", + "os.makedirs(subfolder, exist_ok=True)\n", + "\n", + "! cp $DATA_DIR/ArmanPersoNERCorpus.zip $subfolder/.\n", + "DATA_DIR = f\"{DATA_DIR}/non_eng_NER\"" ] }, { @@ -234,7 +233,7 @@ }, "outputs": [], "source": [ - "! cd {DATA_DIR} && unzip \"ArmanPersoNERCorpus.zip\"" + "! cd $DATA_DIR && unzip \"ArmanPersoNERCorpus.zip\"" ] }, { @@ -256,10 +255,8 @@ "source": [ "file_all = os.path.join(DATA_DIR, \"all_data.txt\")\n", "with open(file_all, \"w\") as f1:\n", - " for filename in os.listdir(DATA_DIR):\n", - " if (filename == \"ReadMe.txt\" or filename == \"ArmanPersoNERCorpus.zip\" or filename == \"all_data.txt\"):\n", - " continue\n", - " with open(DATA_DIR + \"/\" + filename, \"r\", encoding = \"ISO-8859-1\") as f2:\n", + " for filename in glob(f\"{DATA_DIR}/test_fold*.txt\") + glob(f\"{DATA_DIR}/train_fold*.txt\"):\n", + " with open(filename, \"r\", encoding = \"ISO-8859-1\") as f2:\n", " for line in f2:\n", " f1.write(line)" ] @@ -288,7 +285,7 @@ }, "outputs": [], "source": [ - "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/token_classification/data/import_from_iob_format.py" + "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/nlp/token_classification/data/import_from_iob_format.py" ] }, { @@ -450,7 +447,7 @@ }, "outputs": [], "source": [ - "! ls -l {DATA_DIR}" + "! 
ls -l $DATA_DIR" ] }, { @@ -493,6 +490,7 @@ }, "outputs": [], "source": [ + "MODEL_CONFIG = \"token_classification_config.yaml\"\n", "# download the model's configuration file \n", "config_dir = WORK_DIR + '/configs/'\n", "os.makedirs(config_dir, exist_ok=True)\n", @@ -557,7 +555,8 @@ "config.model.validation_ds.num_samples = NUM_SAMPLES\n", "\n", "# for demonstartion purposes we're running only a single epoch\n", - "config.trainer.max_epochs = 5" + "config.trainer.max_epochs = 5\n", + "print(OmegaConf.to_yaml(config.model))" ] }, { @@ -610,6 +609,7 @@ "# setup max number of steps to reduce training time for demonstration purposes of this tutorial\n", "config.trainer.max_steps = 32\n", "\n", + "config.exp_manager.exp_dir = WORK_DIR\n", "trainer = pl.Trainer(**config.trainer)" ] }, @@ -624,6 +624,15 @@ "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it:" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exp_manager(trainer, config.get(\"exp_manager\", None))" + ] + }, { "cell_type": "code", "execution_count": null, @@ -632,7 +641,7 @@ }, "outputs": [], "source": [ - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", + "exp_dir = config.exp_manager.exp_dir\n", "\n", "# the exp_dir provides a path to the current experiment for easy access\n", "exp_dir = str(exp_dir)\n", @@ -657,24 +666,12 @@ "outputs": [], "source": [ "# get the list of supported BERT-like models, for the complete list of HugginFace models, see https://huggingface.co/models\n", - "print(nemo_nlp.modules.get_pretrained_lm_models_list(include_external=True))\n", + "print(nemo_nlp.modules.get_pretrained_lm_models_list(include_external=False))\n", "\n", "# specify BERT-like model, you want to use\n", "PRETRAINED_BERT_MODEL = \"bert-base-multilingual-uncased\"" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RK2xglXyAUOO" - }, - "outputs": [], - "source": [ - "# add the specified above model parameters to the config\n", - "config.model.language_model.pretrained_model_name = PRETRAINED_BERT_MODEL" - ] - }, { "cell_type": "markdown", "metadata": { @@ -871,7 +868,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -885,7 +882,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.8.13" }, "pycharm": { "stem_cell": { diff --git a/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb b/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb index d13cb9d6b582..07e354bd8cc4 100644 --- a/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb +++ b/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb @@ -386,7 +386,9 @@ "source": [ "Note in this tutorial, we use the VAD model MarbleNet-3x2 introduced and published in [ICASSP MarbleNet](https://arxiv.org/pdf/2010.13886.pdf). You might need to tune on dev set similar to your dataset if you would like to improve the performance.\n", "\n", - "And the speakerNet-M-Diarization model achieves 7.3% confusion error rate on CH109 set with oracle vad. This model is trained on voxceleb1, voxceleb2, Fisher, SwitchBoard datasets. So for more improved performance specific to your dataset, finetune speaker verification model with a devset similar to your test set." 
+ "And the speakerNet-M-Diarization model achieves 7.3% confusion error rate on CH109 set with oracle vad. This model is trained on voxceleb1, voxceleb2, Fisher, SwitchBoard datasets. So for more improved performance specific to your dataset, finetune speaker verification model with a devset similar to your test set.\n", + "\n", + "It is recommended to set `num_workers=1` since using mulitprocessing package in Jupyter Notebook environment might cause freezing issues. For sizable data, run speaker diarization using the scripts in `NeMo/examples/speaker_tasks/` setting `num_workers` larger than 1 in the configurations." ] }, { @@ -395,6 +397,8 @@ "metadata": {}, "outputs": [], "source": [ + "config.num_workers = 1 # Workaround for multiprocessing hanging with ipython issue \n", + "\n", "output_dir = os.path.join(ROOT,'outputs')\n", "config.diarizer.manifest_filepath = 'data/input_manifest.json'\n", "config.diarizer.out_dir = output_dir #Directory to store intermediate files and prediction outputs\n", diff --git a/tutorials/tts/Tacotron2_Training.ipynb b/tutorials/tts/Tacotron2_Training.ipynb index be021cb78212..0ad3f114d458 100644 --- a/tutorials/tts/Tacotron2_Training.ipynb +++ b/tutorials/tts/Tacotron2_Training.ipynb @@ -155,7 +155,22 @@ "# NeMo's training scripts are stored inside the examples/ folder. Let's grab the tacotron2.py file\n", "# as well as the tacotron2.yaml file\n", "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/tacotron2.py\n", - "!mkdir -p conf && cd conf && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/tacotron2.yaml && cd .." + "!(mkdir -p conf \\\n", + " && cd conf \\\n", + " && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/tacotron2.yaml \\\n", + " && cd ..)\n", + "\n", + "# We will also need a few extra files for handling text.\n", + "!(mkdir -p scripts/tts_dataset_files \\\n", + " && cd scripts/tts_dataset_files \\\n", + " && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 \\\n", + " && wget wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-030921 \\\n", + " && cd ..)\n", + " \n", + "!(mkdir -p nemo_text_processing/text_normalization/en/data/whitelist/ \\\n", + " && cd nemo_text_processing/text_normalization/en/data/whitelist/ \\\n", + " && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv \\\n", + " && cd ..)" ] }, { @@ -218,7 +233,7 @@ "\n", "phoneme_dict_path: \"scripts/tts_dataset_files/cmudict-0.7b_nv22.01\"\n", "heteronyms_path: \"scripts/tts_dataset_files/heteronyms-030921\"\n", - "whitelist_path: \"nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv\"\n", + "whitelist_path: \"nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv\"\n", "```\n", "\n", "The first part of the yaml defines dataset parameters used by Tacotron. Then in the head of 'model' section there are processing - related parameters. 
You can see\n", @@ -257,14 +272,37 @@ }, "outputs": [], "source": [ - "!wget https://github.com/NVIDIA/NeMo/releases/download/v0.11.0/test_data.tar.gz && mkdir -p tests/data && tar xzf test_data.tar.gz -C tests/data\n", + "!wget https://github.com/NVIDIA/NeMo/releases/download/v0.11.0/test_data.tar.gz \\\n", + "&& mkdir -p tests/data \\\n", + "&& tar xzf test_data.tar.gz -C tests/data\n", "\n", "# Just like ASR, the Tacotron2 requires .json files to define the training and validation data.\n", - "!cat tests/data/asr/an4_val.json\n", + "!cat tests/data/asr/an4_val.json" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have some sample data, we can try training Tacotron 2!\n", "\n", - "# Now that we have some sample data, we can try training Tacotron 2\n", - "# NOTE: The sample data is not enough data to properly train a Tacotron 2. This will not result in a trained Tacotron 2 and is used to illustrate how to train Tacotron 2 model\n", - "!python tacotron2.py sample_rate=16000 train_dataset=tests/data/asr/an4_train.json validation_datasets=tests/data/asr/an4_val.json trainer.max_epochs=3 trainer.accelerator=null trainer.check_val_every_n_epoch=1" + "Note that the sample data is not enough to fully train a Tacotron 2 model. The following code uses a toy dataset to illustrate how the pipeline for training would work." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!(python tacotron2.py \\\n", + " model.sample_rate=16000 \\\n", + " train_dataset=tests/data/asr/an4_train.json \\\n", + " validation_datasets=tests/data/asr/an4_val.json \\\n", + " trainer.max_epochs=3 \\\n", + " trainer.accelerator=null \\\n", + " trainer.check_val_every_n_epoch=1 \\\n", + " +trainer.gpus=1)" ] }, { @@ -315,7 +353,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -329,9 +367,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +}
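For reference, the optional-dependency guard that the duplex_decoder.py change near the top of this section introduces can be sketched on its own. The `build_cg_normalizer` helper below is illustrative, not part of NeMo's API; the import and the `NormalizerWithAudio(...)` call mirror the diff.

```python
# Sketch of the guarded-import pattern: the heavy optional dependency is
# imported at module load, but a missing install only fails the features
# that actually need it.
try:
    from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    PYNINI_AVAILABLE = False


def build_cg_normalizer(input_case='cased', lang='en'):
    # Illustrative helper: raise a clear error only when the covering-grammar
    # normalizer is requested without pynini installed.
    if not PYNINI_AVAILABLE:
        raise ValueError("pynini is not installed")
    return NormalizerWithAudio(input_case=input_case, lang=lang)
```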