## How to use the model with the API?

In [1]:
import requests
import json

In [2]:
# Supported Indian languages and the language codes the API expects for them
# (the requests below additionally use "en" for English)

# Assamese - as
# Bengali - bn
# Gujarati - gu
# Hindi - hi
# Kannada - kn
# Malayalam - ml
# Marathi - mr
# Odia - or
# Punjabi - pa
# Tamil - ta
# Telugu - te

# Base URL of the translation server; the endpoint name is appended to it in the cells below.
# NOTE(review): hard-coded IP over plain HTTP - presumably a demo server; confirm it is still reachable.
uri = "http://216.48.181.177:5050"

In [3]:
# to translate a single sentence, we use the translate_sentence endpoint
API_URL = f"{uri}/translate_sentence"


# In the JSON field of the request, we specify the text to translate, the source and target language
response = requests.post(
    API_URL,
    json={
        "text": "The goal of AI4Bharat is to build language technologies for all Indian languages",
        "source_language": "en",
        "target_language": "hi",
    },
    # without a timeout requests waits forever if the server never answers
    timeout=60,
)
# fail here with a clear HTTP error instead of a confusing JSON/KeyError in the next cell
response.raise_for_status()

In [4]:
# let requests decode the JSON body itself (it detects the JSON text encoding)
# instead of going through response.text and json.loads
output = response.json()
print(f"Request completed in {output['duration']} seconds and the translation is {output['text']}")

Request completed in 0.81 seconds and the translation is एआई4भारत का लक्ष्य सभी भारतीय भाषाओं के लिए भाषा प्रौद्योगिकियों का निर्माण करना है


## Explore more on ULCA - https://bhashini.gov.in/ulca/model/explore-models

# Change to Hindi Examples

In [5]:
# to translate a batch of sentences, we use the batch_translate endpoint
API_URL = f"{uri}/batch_translate"

# This is a sample batch of 4 Tamil sentences. If you have a large batch of sentences, please break it into smaller batches (typically of size 8 or 16) and query the API multiple times.

sentence_batch = ['இத்தொற்றுநோய் உலகளாவிய சமூக மற்றும் பொருளாதார சீர்குலைவை ஏற்படுத்தியுள்ளது.',
 'இதனால் பெரும் பொருளாதார மந்தநிலைக்குப் பின்னர் உலகளவில் மிகப்பெரிய மந்தநிலை ஏற்பட்டுள்ளது.',
 'இது விளையாட்டு,மத, அரசியல் மற்றும் கலாச்சார நிகழ்வுகளை ஒத்திவைக்க அல்லது ரத்து செய்ய வழிவகுத்தது.',
 'அச்சம் காரணமாக முகக்கவசம், கிருமிநாசினி உள்ளிட்ட பொருட்களை அதிக நபர்கள் வாங்கியதால் விநியோகப் பற்றாக்குறை ஏற்பட்டது.']

# here we give the sentence_batch to "text_lines" and change the source and target language accordingly
response = requests.post(
    API_URL,
    json={
        "text_lines": sentence_batch,
        "source_language": "ta",
        "target_language": "en",
    },
    # without a timeout requests waits forever if the server never answers
    timeout=60,
)
# fail here with a clear HTTP error instead of a confusing JSON/KeyError further down
response.raise_for_status()

# let requests decode the JSON body itself instead of going through response.text and json.loads
output = response.json()

In [6]:
# print every source sentence of the batch next to the translation returned for it
translations = output['text_lines']
for source_sentence, translated_sentence in zip(sentence_batch, translations):
    print(f'src: {source_sentence} ----> tgt: {translated_sentence}')

src: இத்தொற்றுநோய் உலகளாவிய சமூக மற்றும் பொருளாதார சீர்குலைவை ஏற்படுத்தியுள்ளது. ----> tgt: The pandemic has caused global social and economic disruption.
src: இதனால் பெரும் பொருளாதார மந்தநிலைக்குப் பின்னர் உலகளவில் மிகப்பெரிய மந்தநிலை ஏற்பட்டுள்ளது. ----> tgt: This has led to the worlds largest recession since the Great Depression.
src: இது விளையாட்டு,மத, அரசியல் மற்றும் கலாச்சார நிகழ்வுகளை ஒத்திவைக்க அல்லது ரத்து செய்ய வழிவகுத்தது. ----> tgt: This led to the postponement or cancellation of sporting, religious, political and cultural events.
src: அச்சம் காரணமாக முகக்கவசம், கிருமிநாசினி உள்ளிட்ட பொருட்களை அதிக நபர்கள் வாங்கியதால் விநியோகப் பற்றாக்குறை ஏற்பட்டது. ----> tgt: Due to this fear, there was a shortage of supply as most of the people purchased the items like masks, sanitizers etc.


## CLI - Command Line Inference

### Run on GPU runtime

In [7]:
# check which GPU (if any) is attached to this runtime
!nvidia-smi

Wed Jul 27 10:32:36 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
# create a seperate folder to store everything
!mkdir inference
%cd inference

/content/inference


In [9]:
# clone the indicTrans repo; the translation and training scripts used later are run from inside it
!git clone https://github.com/AI4Bharat/indicTrans.git
%cd indicTrans
# clone the repositories it requires; these end up inside the indicTrans folder
!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!git clone https://github.com/rsennrich/subword-nmt.git
# go back up to the parent folder
%cd ..

Cloning into 'indicTrans'...
remote: Enumerating objects: 650, done.[K
remote: Counting objects: 100% (353/353), done.[K
remote: Compressing objects: 100% (164/164), done.[K
remote: Total 650 (delta 248), reused 256 (delta 187), pack-reused 297[K
Receiving objects: 100% (650/650), 1.71 MiB | 19.26 MiB/s, done.
Resolving deltas: 100% (375/375), done.
/content/inference/indicTrans
Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 1325, done.[K
remote: Counting objects: 100% (107/107), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 1325 (delta 91), reused 82 (delta 82), pack-reused 1218[K
Receiving objects: 100% (1325/1325), 9.55 MiB | 7.19 MiB/s, done.
Resolving deltas: 100% (701/701), done.
Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 139 (delta 2), reused 2 (delta 0), pack-reused 126[K
Re

In [10]:
# Install the necessary libraries
!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library
# Install fairseq from source
!git clone https://github.com/pytorch/fairseq.git
%cd fairseq
# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d
!pip install ./
! pip install xformers
%cd ..

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 31.7 MB/s 
Collecting mock
  Downloading mock-4.0.3-py3-none-any.whl (28 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.2.0-py3-none-any.whl (116 kB)
[K     |████████████████████████████████| 116 kB 62.3 MB/s 
[?25hCollecting tensorboardX
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 70.4 MB/s 
Collecting indic-nlp-library
  Downloading indic_nlp_library-0.81-py3-none-any.whl (40 kB)
[K     |████████████████████████████████| 40 kB 5.4 MB/s 
Collecting colorama
  Downloading colorama-0.4.5-py2.py3-none-any.whl (16 kB)
Collecting portalocker
  Downloading portalocker-2.5.1-py2.py3-none-any.whl (15 kB)
Collecting morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Collecting sphinx-argpar

In [11]:
# add fairseq folder to python path
import os

# fairseq was cloned into the current working directory by the install cell, so build the
# path from there instead of hard-coding /content/fairseq/ (the clone lives under inference/)
fairseq_dir = os.path.join(os.getcwd(), "fairseq")
# PYTHONPATH may be unset on a fresh runtime; .get avoids the KeyError that += would raise
current_pythonpath = os.environ.get("PYTHONPATH", "")
os.environ["PYTHONPATH"] = f"{current_pythonpath}:{fairseq_dir}" if current_pythonpath else fairseq_dir
# sanity check to see if fairseq is installed
from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils

Error No module named 'triton'
Error No module named 'triton'


In [12]:
# download the pretrained indictrans models
# (these are large files - the recorded run reports 4.4G for indic-en.zip)


# downloading the indic-en model (used below to translate Hindi to English)
!wget https://storage.googleapis.com/samanantar-public/V0.3/models/indic-en.zip

# downloading the en-indic model (used below to translate English to Telugu)
!wget https://storage.googleapis.com/samanantar-public/V0.3/models/en-indic.zip

# downloading the indic-indic model (optional - uncomment the three m2m lines to use it)
# !wget https://storage.googleapis.com/samanantar-public/V0.3/models/m2m.zip


# extract the model archives into the current folder
!unzip indic-en.zip
!unzip en-indic.zip
# !unzip m2m.zip

# delete the archives once they are extracted
!rm indic-en.zip
!rm en-indic.zip
# !rm m2m.zip

# the translation script in the next cells is run from inside the indicTrans folder
%cd indicTrans/

--2022-07-27 10:39:04--  https://storage.googleapis.com/samanantar-public/V0.3/models/indic-en.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.24.128, 142.250.4.128, 172.217.194.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.24.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4759117228 (4.4G) [application/zip]
Saving to: ‘indic-en.zip’


2022-07-27 10:40:09 (69.6 MB/s) - ‘indic-en.zip’ saved [4759117228/4759117228]

Archive:  indic-en.zip
   creating: indic-en/
   creating: indic-en/vocab/
  inflating: indic-en/vocab/bpe_codes.32k.SRC  
  inflating: indic-en/vocab/vocab.SRC  
  inflating: indic-en/vocab/vocab.TGT  
  inflating: indic-en/vocab/bpe_codes.32k.TGT  
   creating: indic-en/final_bin/
  inflating: indic-en/final_bin/preprocess.log  
  inflating: indic-en/final_bin/dict.TGT.txt  
  inflating: indic-en/final_bin/test.SRC-TGT.SRC.idx  
  inflating: indic-en/final_bin/test.SRC-TGT.TGT.idx  
  inf

In [13]:
# creating a text file and adding en sentences we can use for testing the model
!touch en_sentences.txt
!echo 'This bicycle is too small for you !!' >> en_sentences.txt
!echo "I will directly meet you at the airport." >> en_sentences.txt
!echo 'If COVID-19 is spreading in your community, stay safe by taking some simple precautions, such as physical distancing, wearing a mask, keeping rooms well ventilated, avoiding crowds, cleaning your hands, and coughing into a bent elbow or tissue' >> en_sentences.txt

In [14]:
# joint_translate takes source file, output file name, source language, target language, model directory as arguments
# here: English ('en') sentences are translated to Telugu ('te') with the model in ../en-indic
# NOTE(review): in the recorded run apply_bpe.py could not open ../en-indic/vocab/bpe_codes.32k.SRC
# and outputs.txt came out empty - confirm that en-indic.zip was downloaded and unzipped completely
!bash joint_translate.sh en_sentences.txt outputs.txt 'en' 'te' '../en-indic'

Wed Jul 27 10:41:20 UTC 2022
Applying normalization and script conversion
100% 3/3 [00:00<00:00, 110.49it/s]
Number of sentences in input: 3
Applying BPE
usage: apply_bpe.py [-h] [--input PATH] --codes PATH [--merges INT]
                    [--output PATH] [--separator STR] [--vocabulary PATH]
                    [--vocabulary-threshold INT] [--dropout P]
                    [--glossaries STR [STR ...]] [--seed S]
                    [--num-workers NUM_WORKERS]
apply_bpe.py: error: argument --codes/-c: can't open '../en-indic/vocab/bpe_codes.32k.SRC': [Errno 2] No such file or directory: '../en-indic/vocab/bpe_codes.32k.SRC'
Decoding
Extracting translations, script conversion and detokenization
Translation completed


In [15]:
# print the translations that joint_translate.sh wrote to outputs.txt
!cat outputs.txt






In [16]:
!touch lang_sentences.txt
!echo 'तुम आज सुबह यहाँ क्यों आए?' >> lang_sentences.txt
!echo "मेरे परिवार में हर कोई जल्दी उठता है।" >> lang_sentences.txt
!echo ' स्वास्थ्य और परिवार कल्याण मंत्रालय द्वारा प्रदान की गई जानकारी और सलाह को सावधानी व सही तरीके से पालन कर वायरस के स्थानीय प्रसार को रोका जा सकता है।' >> lang_sentences.txt

In [17]:
# arguments: source file, output file name, source language, target language, model directory
# here: Hindi ('hi') sentences are translated to English ('en') with the model in ../indic-en
!bash joint_translate.sh lang_sentences.txt en_outputs.txt 'hi' 'en' '../indic-en'

Wed Jul 27 10:41:44 UTC 2022
Applying normalization and script conversion
100% 3/3 [00:00<00:00, 106.39it/s]
Number of sentences in input: 3
Applying BPE
Decoding
Extracting translations, script conversion and detokenization
Translation completed


In [18]:
# print the English translations that joint_translate.sh wrote to en_outputs.txt
! cat en_outputs.txt

Why did you come here this morning?
Everyone in my family wakes up early.
Local transmission of the virus can be prevented by following the information and advice given by the Ministry of Health and Family Welfare in a careful and correct manner.


In [19]:
# leave the indicTrans folder and go back to its parent before downloading the training data
%cd ..

/content/inference


## How to Train the model?

<img src='https://drive.google.com/uc?id=1uWBFVHkI3QHGByLH_6bkLspANfO7CJ_n' height=300 width=300>

In [None]:
# let's now download a sample of the indictrans training data (Samanantar v0.3)
! wget https://storage.googleapis.com/samanantar-public/V0.3/sample_samanantar_v0.3.zip

# full data - https://storage.googleapis.com/samanantar-public/V0.3/source_wise_splits.zip


# let's also download the benchmarks for the dev and test sets
! wget https://storage.googleapis.com/samanantar-public/benchmarks.zip

# extract both archives into the current folder
!unzip sample_samanantar_v0.3.zip
!unzip benchmarks.zip

In [None]:
%%bash
# Builds the experiment folder indic-en-exp: the sample training data plus, for every
# language pair, the FLORES-200 dev split as dev.* and the devtest split as test.*.

# stop at the first failing command so a failed download does not cascade into cp errors
set -e

wget --trust-server-names https://tinyurl.com/flores200dataset
tar -xf flores200_dataset.tar.gz

# -p keeps the cell re-runnable when the folder already exists
mkdir -p indic-en-exp
# copying all the train folders to exp_dir
cp -r sample_samanantar_v0.3/* indic-en-exp

mkdir -p indic-en-exp/devtest/all

# each entry is <language code used in this notebook>:<FLORES-200 file stem>
for pair in \
    as:asm_Beng bn:ben_Beng gu:guj_Gujr hi:hin_Deva kn:kan_Knda ml:mal_Mlym \
    mr:mar_Deva or:ory_Orya pa:pan_Guru ta:tam_Taml te:tel_Telu
do
    lang=${pair%%:*}     # part before the colon, e.g. "hi"
    flores=${pair##*:}   # part after the colon, e.g. "hin_Deva"
    dest=indic-en-exp/devtest/all/en-$lang
    mkdir -p "$dest"

    # FLORES dev split -> dev.<lang> / dev.en
    cp "flores200_dataset/dev/$flores.dev" "$dest/dev.$lang"
    cp flores200_dataset/dev/eng_Latn.dev "$dest/dev.en"

    # FLORES devtest split -> test.<lang> / test.en
    cp "flores200_dataset/devtest/$flores.devtest" "$dest/test.$lang"
    cp flores200_dataset/devtest/eng_Latn.devtest "$dest/test.en"
done



In [24]:
# the data preparation and training commands below use paths relative to the indicTrans folder
%cd indicTrans

/content/inference/indicTrans


In [25]:
# prepare_data_joint_training.sh takes experiment dir, src_lang, tgt_lang as input
# This does preprocessing, building vocab, binarization for joint training

# Learning and applying the vocabulary will take a while if the dataset is huge. To make it faster, run it on a multicore system

! bash prepare_data_joint_training.sh '../indic-en-exp' 'indic' 'en'

Running experiment ../indic-en-exp on indic to en
Applying normalization and script conversion for train
100% 1000/1000 [00:00<00:00, 8144.85it/s]
100% 1000/1000 [00:00<00:00, 1992.56it/s]
Number of sentences in train: 1000
Applying normalization and script conversion for dev
100% 997/997 [00:00<00:00, 7115.61it/s]
100% 997/997 [00:00<00:00, 2039.18it/s]
Number of sentences in dev: 997
Applying normalization and script conversion for test
100% 1012/1012 [00:00<00:00, 5973.46it/s]
100% 1012/1012 [00:00<00:00, 1954.44it/s]
Number of sentences in test: 1012
Applying normalization and script conversion for train
100% 1000/1000 [00:00<00:00, 11588.84it/s]
100% 1000/1000 [00:00<00:00, 2206.67it/s]
Number of sentences in train: 1000
Applying normalization and script conversion for dev
100% 997/997 [00:00<00:00, 7105.96it/s]
100% 997/997 [00:00<00:00, 1978.27it/s]
Number of sentences in dev: 997
Applying normalization and script conversion for test
100% 1012/1012 [00:00<00:00, 6369.11it/s]
100

In [26]:
# Train with fairseq-train on the data in ../indic-en-exp/final_bin; checkpoints are saved to ../indic-en-exp/model
# NOTE(review): --warmup-updates (4000) is larger than --max-update (1000), so presumably this short
# demo run never leaves the learning-rate warmup - confirm before reusing these values for real training
!(fairseq-train ../indic-en-exp/final_bin \
--max-source-positions=210 \
--max-target-positions=210 \
--max-update=1000 \
--save-interval=1 \
--arch=transformer_4x \
--criterion=label_smoothed_cross_entropy \
--source-lang=SRC \
--lr-scheduler=inverse_sqrt \
--target-lang=TGT \
--label-smoothing=0.1 \
--optimizer adam \
--adam-betas "(0.9, 0.98)" \
--clip-norm 1.0 \
--warmup-init-lr 1e-07 \
--lr 0.0005 \
--warmup-updates 4000 \
--dropout 0.2 \
--save-dir ../indic-en-exp/model \
--keep-last-epochs 5 \
--patience 5 \
--skip-invalid-size-inputs-valid-test \
--fp16 \
--user-dir model_configs \
--update-freq=1 \
--distributed-world-size 1 \
--max-tokens 1024)

# Important Arguments
# --max-update          -> maximum update steps the model will be trained for
# --arch=transformer_4x -> we use a custom transformer model and name it transformer_4x (4 times the parameter size of transformer base)
# --user-dir            -> we define the custom transformer arch in the model_configs folder and pass it as an argument to --user-dir for fairseq to register this architecture
# --lr                  -> learning rate. From our limited experiments, we find that lower learning rates like 3e-5 work best for finetuning.
# --max-tokens          -> this is the max tokens per batch. You should limit it to lower values if you get OOM errors.
# --update-freq         -> gradient accumulation steps

Error No module named 'triton'
Error No module named 'triton'
2022-07-27 10:48:02 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': 'model_configs', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma

In [None]:
# leave the indicTrans folder again once training is done
%cd ..