**Fine-tune the NLLB-600M model on a source and target Bible translation.**


*Use the model to translate from the source to the target.*

In [None]:
# Mount Google Drive at /content/drive so that later cells can use files stored there
# (the SIL_NLP_DATA_PATH set further down points inside this mount).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Confirm GPU availability by printing the nvidia-smi status table.
# (An A100 with at least 40 GB of memory is required to train NLLB models.)
!nvidia-smi

Fri Jun 16 10:04:02 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Set some GPU options
import os

# NOTE(review): WORLD_SIZE and GPU_RANKS are not read by any cell visible here;
# they are kept unchanged in case something else depends on them.
WORLD_SIZE = "1"
GPU_RANKS = "0"

# For a multi-GPU environment, specify which GPUs should be used for CUDA (not really necessary).
# This is set through os.environ on purpose: a `!export ...` line runs in a throwaway subshell,
# so the variable would be gone again before any later cell or command could see it.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Clone the repo
!git clone https://github.com/sillsdev/silnlp

# Install machine.py
!pip install sil-machine

import os
# Tell the SIL NLP tools where to find the Gutenberg resources
os.environ['SIL_NLP_DATA_PATH'] = "/content/drive/MyDrive/NLP"

# The SIL_NLP_DATA_PATH should contain these folders:  'MT' and 'Paratext'
# The MT folder contains:
# /MT/experiments/experiment
# /MT/scripture

# Paratext projects go in this folder. These are the source of the files that are extracted into the vref format.
# These are also the source files for the silnlp.nmt.translate command.
# /Paratext/projects

# These are optional folders for training with other corpora or terms lists.
# /MT/corpora
# /MT/terms

# Tell Python where to find our repo
os.environ['PYTHONPATH'] = "/env/python:/content/silnlp"

# Install the required version of poetry. Version 1.2.2 required by ClearML the specific version may not be essential in Colab.

# This is the recommended install method for poetry on desktops:
#!curl -sSL https://install.python-poetry.org | python - --version 1.2.2

!pip install poetry==1.2.2

Cloning into 'silnlp'...
remote: Enumerating objects: 5285, done.[K
remote: Counting objects: 100% (1221/1221), done.[K
remote: Compressing objects: 100% (369/369), done.[K
remote: Total 5285 (delta 912), reused 1107 (delta 849), pack-reused 4064[K
Receiving objects: 100% (5285/5285), 16.83 MiB | 14.96 MiB/s, done.
Resolving deltas: 100% (3788/3788), done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sil-machine
  Downloading sil_machine-0.9.3-py3-none-any.whl (238 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.1/238.1 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting charset-normalizer<3.0.0,>=2.1.1 (from sil-machine)
  Downloading charset_normalizer-2.1.1-py3-none-any.whl (39 kB)
Collecting networkx<3.0.0,>=2.6.3 (from sil-machine)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m85.8 MB/s[

In [None]:
# Change into the cloned repo directory (the poetry commands in the following cells
# are run from here) and list its contents as a quick sanity check.
%cd /content/silnlp
!ls

/content/silnlp
clear_ml_linux_setup.md    poetry.lock	   scripts    silnlp
clear_ml_windows_setup.md  pyproject.toml  setup.cfg  tests
LICENSE			   README.md	   setup.py


In [None]:
# Show which poetry version is available (1.2.2 is the version pinned in the install step).
!poetry --version

[39;1mPoetry[39;22m (version [36m1.2.2[39m)


In [None]:
# Install the project's dependencies with poetry. This produces a lot of output.
!poetry install

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  [34;1m•[39;22m [39mInstalling [39m[36mdebugpy[39m[39m ([39m[39;1m1.6.2[39;22m[39m)[39m: [34mDownloading...[39m [39;1m10%[39;22m
  [34;1m•[39;22m [39mInstalling [39m[36mflatbuffers[39m[39m ([39m[39;1m2.0[39;22m[39m)[39m: [34mInstalling...[39m
  [34;1m•[39;22m [39mInstalling [39m[36mfurl[39m[39m ([39m[39;1m2.1.3[39;22m[39m)[39m: [34mInstalling...[39m
  [34;1m•[39;22m [39mInstalling [39m[36mfuture[39m[39m ([39m[39;1m0.18.2[39;22m[39m)[39m: [34mDownloading...[39m [39;1m30%[39;22m
  [34;1m•[39;22m [39mInstalling [39m[36mgast[39m[39m ([39m[39;1m0.4.0[39;22m[39m)[39m: [34mInstalling...[39m
  [34;1m•[39;22m [39mInstalling [39m[36mgoogle-cloud-core[39m[39m ([39m[39;1m2.3.2[39;22m[39m)[39m: [34mInstalling...[39m
  [34;1m•[39;22m [39mInstalling [39m[36mgoogle-pasta[39m[39m ([39m[39;1m0.2.0[39;22m[39m)[39m: [34mInstalling...[39m
  

In [None]:
# This is a quick check that silnlp is installed: the help text of extract_corpora
# is only printed if the module can be run through poetry.
!poetry run python -m silnlp.common.extract_corpora --help

[33mThe currently activated Python version 3.10.12 is not supported by the project (>=3.8,<3.9).
Trying to find and use a compatible version.[39m 
Using [36mpython3.8[39m (3.8.10)
2023-06-16 10:06:54,984 - silnlp.common.environment - INFO - Using workspace: /content/drive/Shareddrives/Partnership for Applied Biblical NLP/NLP Tools/silnlp as per environment variable SIL_NLP_DATA_PATH.
usage: extract_corpora.py
       [-h]
       [--include books [books ...]]
       [--exclude books [books ...]]
       [--markers]
       [--lemmas]
       [--project-vrefs]
       [--clearml]
       name
       [name ...]

Extracts
text
corpora
from
Paratext
projects

positional arguments:
  name
    Paratext
    project

optional arguments:
  -h, --help
    show this
    help
    message and
    exit
  --include books [books ...]
    The books
    to include;
    e.g., 'NT',
    'OT', 'GEN'
  --exclude books [books ...]
    The books
    to exclude;
    e.g., 'NT',
    'OT', 'GEN'
  --markers
    Inc

In [None]:
# Paratext projects need to be extracted into the vref (i.e., one verse per line) format for training.
# This command does that.
# Each Paratext project argument in the command line is the name of a folder in the Paratext/projects folder.
# The extracted files will be saved in the MT/scripture folder.

# !poetry run python -m silnlp.common.extract_corpora <Paratext_project_1> <Paratext_project_2>

In [None]:
# Create a folder for your experiments under the MT/experiments folder
# Within your folder create a folder for each experiment.
# Create a config file by copying an existing file and modifying the parameters.
# Ask one of the SILNLP team for advice about setting them. There are many and the interactions between them can be unpredictable.
# Look at an existing effective_config file for a similar run to see the list of configurable options.
# Trying to guess good hyperparameters is likely to be a costly and tim consuming process.
# The defaults are generally the best settings and are hard to beat. Damien has set them using years of experience.

# This command will first preprocess the data, then train the model and then translate the test set and calculate the test set
!poetry run python -m silnlp.nmt.experiment --save-checkpoints --mixed-precision --memory-growth <experiment>


In [None]:
!poetry run python -m silnlp.nmt.translate <experiment> --src-project <projec> --trg-iso <ISO> --books EXO JON

[33mThe currently activated Python version 3.10.12 is not supported by the project (>=3.8,<3.9).
Trying to find and use a compatible version.[39m 
Using [36mpython3.8[39m (3.8.10)
2023-06-16 10:08:17,554 - silnlp.common.environment - INFO - Using workspace: /content/drive/Shareddrives/Partnership for Applied Biblical NLP/NLP Tools/silnlp as per environment variable SIL_NLP_DATA_PATH.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2023-06-16 10:08:19,526 - silnlp.common.utils - INFO - Git commit: fa63d1416b
2023-06-16 10:08:22,707 - silnlp.nmt.clearml_connection - INFO - No ClearML task initiated.
Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/content/silnlp/silnlp/nmt/translate.py", line 278, in <module>
    main()
  Fi