<a href="https://colab.research.google.com/github/DeekshaKarkada/Language-Translation-for-Hindi-Kannada/blob/main/LSTM_OpNMT_BM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Implementing Language translation from Hindi to Kannada using LSTM

using https://github.com/ymoslem/MT-Preparation/tree/main as reference

In [1]:
!pip install sentencepiece



In [2]:
!pip3 install OpenNMT-py

Collecting OpenNMT-py
  Downloading OpenNMT_py-3.5.1-py3-none-any.whl.metadata (8.8 kB)
Collecting torch<2.3,>=2.1 (from OpenNMT-py)
  Downloading torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting configargparse (from OpenNMT-py)
  Downloading ConfigArgParse-1.7-py3-none-any.whl.metadata (23 kB)
Collecting ctranslate2<5,>=4 (from OpenNMT-py)
  Downloading ctranslate2-4.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting waitress (from OpenNMT-py)
  Downloading waitress-3.0.0-py3-none-any.whl.metadata (4.2 kB)
Collecting pyonmttok<2,>=1.37 (from OpenNMT-py)
  Downloading pyonmttok-1.37.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting sacrebleu (from OpenNMT-py)
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz (from OpenNMT-py)

In [3]:
import pandas as pd
import numpy as np

In [5]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Create a directory and clone the Github MT-Preparation repository
!mkdir nmt
%cd nmt

/content/nmt


In [6]:
import os
data_path = '/content/drive/MyDrive/Thesis/Opennmt_Files/'
os.chdir(data_path)

In [None]:
os.getcwd()

'/content/drive/MyDrive/Thesis/Opennmt_Files'

In [None]:
# Download and unzip a dataset
!wget https://object.pouta.csc.fi/OPUS-NLLB/v1/moses/hi-kn.txt.zip
!unzip hi-kn.txt.zip

--2024-08-18 14:39:37--  https://object.pouta.csc.fi/OPUS-NLLB/v1/moses/hi-kn.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 116758012 (111M) [application/zip]
Saving to: ‘hi-kn.txt.zip’


2024-08-18 14:39:46 (14.3 MB/s) - ‘hi-kn.txt.zip’ saved [116758012/116758012]

Archive:  hi-kn.txt.zip
  inflating: README                  
  inflating: LICENSE                 
  inflating: NLLB.hi-kn.hi           
  inflating: NLLB.hi-kn.kn           
  inflating: NLLB.hi-kn.scores       


In [None]:
import csv

def data_preprocessing(source_file, target_file, source_lang, target_lang, lower=False):

    data_source = pd.read_csv(source_file, names=['Source'], sep="\0", skip_blank_lines=False, on_bad_lines="skip")
    data_target = pd.read_csv(target_file, names=['Target'], sep="\0", skip_blank_lines=False, on_bad_lines="skip")
    data = pd.concat([data_source, data_target], axis=1)  # Join the two dataframes along columns

    # Delete nan
    data = data.dropna()

    data = data.drop_duplicates()

    data = data.replace(r'<.*?>|&lt;.*?&gt;|&?(amp|nbsp|quot);|{}', ' ', regex=True)
    data = data.replace(r'  ', ' ', regex=True)

    data = data.replace(r'^\s*$', np.nan, regex=True)

    data = data.sample(frac=1).reset_index(drop=True)

    # Write the dataframe to two Source and Target files
    source_file = source_file+'-filtered.'+source_lang
    target_file = target_file+'-filtered.'+target_lang

    data_source = data["Source"]
    data_target = data["Target"]

    data_source.to_csv(source_file, header=False, index=False, sep="\n")
    print("Source file Saved:", source_file)
    data_target.to_csv(target_file, header=False, index=False, sep="\n")
    print("Target file Saved:", target_file)


data_preprocessing('NLLB.hi-kn.hi', 'NLLB.hi-kn.kn', 'hi', 'kn')

--- Source Saved: NLLB.hi-kn.hi-filtered.hi
--- Target Saved: NLLB.hi-kn.kn-filtered.kn


In [None]:
import sentencepiece as spm

# Source subword model

source_train_value = '--input=/content/nmt/NLLB.hi-kn.hi --model_prefix=source --vocab_size=50000 --hard_vocab_limit=false --split_digits=true'
spm.SentencePieceTrainer.train(source_train_value)

# Target subword model

target_train_value = '--input=/content/nmt/NLLB.hi-kn.kn --model_prefix=target --vocab_size=50000 --hard_vocab_limit=false --split_digits=true'
spm.SentencePieceTrainer.train(target_train_value)

In [None]:
# Train a SentencePiece model for subword tokenization using subword file from https://github.com/ymoslem/MT-Preparation/tree/main
!python3 2-subword.py source.model target.model NLLB.hi-kn.hi-filtered.hi NLLB.hi-kn.kn-filtered.kn

Source Model: source.model
Target Model: target.model
Source Dataset: NLLB.hi-kn.hi-filtered.hi
Target Dataset: NLLB.hi-kn.kn-filtered.kn
Done subwording the source file! Output: NLLB.hi-kn.hi-filtered.hi.subword
Done subwording the target file! Output: NLLB.hi-kn.kn-filtered.kn.subword


In [None]:
# First 3 lines before subwording
!head -n 5 NLLB.hi-kn.hi-filtered.hi.subword

▁ग्लोबल ▁जापानी ▁कार्यक्रम ▁के ▁इस ▁सप्ताह ▁के ▁भाग ▁में , ▁हम ▁इस ▁बारे ▁में ▁बात ▁करते ▁हैं ▁कि ▁जापान ▁दुनिया ▁भर ▁की ▁तकनीक ▁में ▁अपना ▁अनुभव ▁कैसे ▁साझा ▁करता ▁है ।
▁मैंने ▁उसी ▁के ▁साथ ▁एक ▁होटल ▁में ▁खाना ▁खाया ▁और ▁दोस्त ▁को ▁बाय ▁बोलकर ▁निकल ▁गया .
▁परिपक्व ▁माँ ▁अपने ▁बेटे ▁के ▁दोस्त ▁Fucks
▁एक ▁कदम ▁से ▁नफरत ▁करने ▁के ▁लिए ▁प्यार ▁से !
▁इसमें ▁उनकी ▁मां ▁ने ▁हर ▁कदम ▁पर ▁उनका ▁साथ ▁दिया ।


In [None]:
!head -n 5 NLLB.hi-kn.kn-filtered.kn.subword

▁ಜಪಾನ್ನ ▁ವೇಗವಾಗಿ ▁ಯುರೋಪಿಯನ್ ▁ಮಾರುಕಟ್ಟೆಯಲ್ಲಿ ▁ಶಿ ಂ ಕಾನ್ ಸೆನ್ಸ್ ಗೆ ▁ತರಬೇತಿ ▁ನೀಡುತ್ತದೆ ▁ 2 8 ▁/ ▁ 0 7 ▁/ ▁ 2 0 1 7 ▁ಜಾಗತಿಕ ▁ಜಪಾನೀಸ್ ▁ಕಾರ್ಯಕ್ರಮದ ▁ಈ ▁ವಾರದ ▁ಭಾಗದಲ್ಲಿ , ▁ಜಪಾನ್ ▁ತನ್ನ ▁ತಂತ್ರಜ್ಞಾನದ ▁ಅನುಭವವನ್ನು ▁ವಿಶ್ವದಾದ್ಯಂತ ▁ಹೇಗೆ ▁ಹಂಚಿಕೊಳ್ಳ ುತ್ತದೆ ▁ಎಂಬುದರ ▁ಕುರಿತು ▁ನಾವು ▁ಮಾತನಾಡುತ್ತೇವೆ .
▁ನಾನು ▁ಸ್ನೇಹಿತ ನೊಂದಿಗೆ ▁ಹೋ ಟೆಲಿ ಗೆ ▁ಚಹಾ ▁ಕುಡಿಯಲು ▁ಹೋಗಿದ್ದೆ ▁ಅಲ್ಲಿಗೆ ▁ಅವನೂ ▁ಬಂದಿದ್ದ ▁ಅವನ ▁ಸ್ನೇಹಿತ ನೊಂದಿಗೆ .
▁""" ▁ಪ್ರೌಢ ▁ಸೂಳೆ ▁ತಾಯಿ ▁ತನ್ನ ▁ಮಗನ ▁ಸ್ನೇಹಿತ ▁Fucks ▁"""
▁ಆಶ್ಚರ್ಯ ▁ಮತ್ತು ▁ಪ್ರತಿಯಾಗಿ ▁" ಒಂದು ▁ಹೆಜ್ಜೆ ▁ದ್ವೇಷಿಸಲು ▁ಪ್ರೀತಿ ▁ಗೆ " ▁ಒಂದು ▁ಗಾದೆ ▁ಇದೆ .
▁ಆದರೆ ▁ಅವರ ▁ತಾಯಿ ▁ಮಗಳ ▁ಪ್ರತಿ ▁ಹೆಜ್ಜೆಗೂ ▁ಜೊತೆಯಾಗಿ ▁ನಿಂತರು .


In [None]:
import pandas as pd
import numpy as np
import re
import csv

def split_dataset(segment_no_dev, segment_no_test, source_file, target_file):

    df_source = pd.read_csv(source_file,
                            names=['Source'],
                            sep="\0",
                            quoting=csv.QUOTE_NONE,
                            skip_blank_lines=False,
                            on_bad_lines="skip")
    df_target = pd.read_csv(target_file,
                            names=['Target'],
                            sep="\0",
                            quoting=csv.QUOTE_NONE,
                            skip_blank_lines=False,
                            on_bad_lines="skip")
    data = pd.concat([df_source, df_target], axis=1)
    print("Dataframe shape:", data.shape)

    data = data.dropna()


    # Extract Dev set from the main dataset
    data_dev = data.sample(n = int(segment_no_dev))
    data_train = data.drop(data_dev.index)

    # Extract Test set from the main dataset
    data_test = data_train.sample(n = int(segment_no_test))
    data_train = data_train.drop(data_test.index)

    # Write the dataframe to two Source and Target files
    source_file_train = source_file+'.train'
    target_file_train = target_file+'.train'

    source_file_dev = source_file+'.dev'
    target_file_dev = target_file+'.dev'

    source_file_test = source_file+'.test'
    target_file_test = target_file+'.test'

    data_dic_train = data_train.to_dict(orient='list')


    with open(source_file_train, "w") as sf:
        sf.write("\n".join(line for line in data_dic_train['Source']))
        sf.write("\n")

    with open(target_file_train, "w") as tf:
        tf.write("\n".join(line for line in data_dic_train['Target']))
        tf.write("\n")


    data_dic_dev = data_dev.to_dict(orient='list')

    with open(source_file_dev, "w", encoding='utf-8') as sf:
        sf.write("\n".join(line for line in data_dic_dev['Source']))
        sf.write("\n") # end of file newline

    with open(target_file_dev, "w", encoding='utf-8') as tf:
        tf.write("\n".join(line for line in data_dic_dev['Target']))
        tf.write("\n")


    data_dic_test = data_test.to_dict(orient='list')

    with open(source_file_test, "w", encoding='utf-8') as sf:
        sf.write("\n".join(line for line in data_dic_test['Source']))
        sf.write("\n")

    with open(target_file_test, "w", encoding='utf-8') as tf:
        tf.write("\n".join(line for line in data_dic_test['Target']))
        tf.write("\n")

    print("Output files", *[source_file_train, target_file_train, source_file_dev, target_file_dev, source_file_test, target_file_test], sep="\n")


segment_no_dev = 6000    # Number of segments in the dev set
segment_no_test = 6000    # Number of segments in the test set
source_file = 'NLLB.hi-kn.hi-filtered.hi.subword'   # Path to the source file
target_file = 'NLLB.hi-kn.kn-filtered.kn.subword'   # Path to the target file

split_dataset(segment_no_dev, segment_no_test, source_file, target_file)

Dataframe shape: (1793892, 2)
Output files
NLLB.hi-kn.hi-filtered.hi.subword.train
NLLB.hi-kn.kn-filtered.kn.subword.train
NLLB.hi-kn.hi-filtered.hi.subword.dev
NLLB.hi-kn.kn-filtered.kn.subword.dev
NLLB.hi-kn.hi-filtered.hi.subword.test
NLLB.hi-kn.kn-filtered.kn.subword.test


In [None]:
config = '''# config.yaml

save_data: run

# Training files
data:
    corpus_1:
        path_src: NLLB.hi-kn.hi-filtered.hi.subword.train
        path_tgt: NLLB.hi-kn.kn-filtered.kn.subword.train
        transforms: [filtertoolong]
    valid:
        path_src: NLLB.hi-kn.hi-filtered.hi.subword.dev
        path_tgt: NLLB.hi-kn.kn-filtered.kn.subword.dev
        transforms: [filtertoolong]

# Vocabulary files
src_vocab: run/source.vocab
tgt_vocab: run/target.vocab

# Vocabulary size
src_vocab_size: 50000
tgt_vocab_size: 50000

src_seq_length: 150
src_seq_length: 150

# Tokenization options
src_subword_model: source.model
tgt_subword_model: target.model

log_file: train.log
save_model: models/model.hikn

early_stopping: 4

# Default: 5000 -
save_checkpoint_steps: 1000

# keep_checkpoint: 3

seed: 3435


train_steps: 10000
valid_steps: 5000


warmup_steps: 1000
report_every: 1000

# Number of GPUs, and IDs of GPUs
#world_size: 1
#gpu_ranks: [0]

# Batching
bucket_size: 262144
num_workers: 0
batch_type: "tokens"
batch_size: 4096
valid_batch_size: 2048
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
model_dtype: "fp16"
optim: "adam"
learning_rate: 2
# warmup_steps: 8000
decay_method: "noam"
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: rnn
decoder_type: rnn
rnn_type: LSTM
position_encoding: true
enc_layers: 2
dec_layers: 2
heads: 8
hidden_size: 512
word_vec_size: 512
transformer_ff: 2048
dropout_steps: [0]
dropout: [0.3]
attention_dropout: [0.1]


'''

with open("config.yaml", "w+") as config_yaml:
  config_yaml.write(config)

#Build the vocabulary using the config file and for the vocalbulary size as in config file

In [None]:
!onmt_build_vocab -config config.yaml -n_sample -1 -num_threads 1

Corpus corpus_1's weight should be given. We default it to 1 for you.
[2024-08-21 13:03:18,432 INFO] Counter vocab from -1 samples.
[2024-08-21 13:03:18,432 INFO] n_sample=-1: Build vocab on full datasets.
[2024-08-21 13:04:04,810 INFO] * Transform statistics for corpus_1(100.00%):
			* FilterTooLongStats(filtered=2)

[2024-08-21 13:04:04,917 INFO] Counters src: 52785
[2024-08-21 13:04:04,917 INFO] Counters tgt: 52079


#Train the model for the source and taget files and other hyper parameters as set in config file

In [None]:
!onmt_train -config config.yaml

[2024-08-21 13:04:35,623 INFO] Parsed 2 corpora from -data.
[2024-08-21 13:04:35,624 INFO] Get special vocabs from Transforms: {'src': [], 'tgt': []}.
[2024-08-21 13:04:35,856 INFO] The first 10 tokens of the vocabs are:['<unk>', '<blank>', '<s>', '</s>', '।', '▁', '▁के', '▁है', '▁में', ',']
[2024-08-21 13:04:35,857 INFO] The decoder start token is: <s>
[2024-08-21 13:04:35,857 INFO] Building model...
[2024-08-21 13:04:37,196 INFO] Switching model to float32 for amp/apex_amp
[2024-08-21 13:04:37,197 INFO] Non quantized layer compute is fp16
[2024-08-21 13:04:37,429 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(50000, 512, padding_idx=1)
        )
        (pe): PositionalEncoding()
      )
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Emb

In [None]:
# translate the sentences of Hindi test data for the trained model and store to another file

!onmt_translate -model models/model.hikn_step_10000.pt -src NLLB.hi-kn.hi-filtered.hi.subword.test -output NLLB.kn.translated -min_length 1

[2024-08-18 16:47:49,733 INFO] Loading checkpoint from models/model.fren_step_10000.pt
[2024-08-18 16:48:00,062 INFO] Loading data into the model
[2024-08-18 16:48:43,958 INFO] PRED SCORE: -0.8908, PRED PPL: 2.44 NB SENTENCES: 2000
Time w/o python interpreter load/terminate:  54.826354026794434


In [None]:
!head -n 3 NLLB.kn.translated

▁ಉತ್ಪಾದಕತೆ ಗೆ ▁ಸಂಬಂಧಿಸಿದ ▁ಅಂಶಗಳು .
▁ಕಂದಹಾರ್ ▁ವಿಮಾನ ▁ನಿಲ್ದಾಣದಲ್ಲಿ ▁ರಾಕೆಟ್ ▁ದಾಳಿ
▁ಅಧ್ಯಯನ ▁ಮುಂದುವರೆದಿದೆ .


# Detokenize the translated file and test file

Reverse the tokenization process on the file containing tokenized translation results using a detokenizer function by loading the 'target_model' along with the tokenized file

In [7]:
import sentencepiece as spm

def detokenization(target_model, pred_target):
  target_decodeded = pred_target + ".desubword"
  sp = spm.SentencePieceProcessor()
  sp.load(target_model)
  with open(pred_target) as pred, open(target_decodeded, "w+") as pred_decoded:
    for line in pred:
        line = line.strip().split(" ")
        line = sp.decode_pieces(line)
        pred_decoded.write(line + "\n")

  print("Desubword file:", target_decodeded)



In [8]:
detokenization('target.model', 'NLLB.kn.translated')

Desubword file: NLLB.kn.translated.desubword


In [9]:
!head -n 5 NLLB.kn.translated.desubword

ಉತ್ಪಾದಕತೆಗೆ ಸಂಬಂಧಿಸಿದ ಅಂಶಗಳು.
ಕಂದಹಾರ್ ವಿಮಾನ ನಿಲ್ದಾಣದಲ್ಲಿ ರಾಕೆಟ್ ದಾಳಿ
ಅಧ್ಯಯನ ಮುಂದುವರೆದಿದೆ.
"ಈ ಮೊದಲು, ನನ್ನ ಹೆಂಡತಿಗೂ ಕೊರೋನಾ ಇರುವುದು ದೃಢಪಟ್ಟಿತ್ತು, ಆದರೆ ನಮ್ಮ
ವೆಚ್ಚ, ಇತ್ಯಾದಿ


In [10]:
detokenization('target.model', 'NLLB.hi-kn.kn-filtered.kn.subword.test')

Desubword file: NLLB.hi-kn.kn-filtered.kn.subword.test.desubword


In [11]:
!head -n 3 NLLB.hi-kn.kn-filtered.kn.subword.test.desubword


೫. ವ್ಯವಸ್ಥಾಪಕ ಉತ್ಪಾದಕತೆ ಸಂಬಂಧಪಟ್ಟಿದೆ.
ಕಂದಹಾರ್ ವಿಮಾನ ನಿಲ್ದಾಣದ ಮೇಲೆ ರಾಕೆಟ್ ದಾಳಿ
ಈಗಾಗಲೇ ಪಠ್ಯ ಬೋಧನೆ ಮುಗಿದಿದ್ದು ಪುನರಾವರ್ತನೆ ಆಗುತ್ತಿದೆ.


In [13]:
!pip install sacrebleu



#Evaluate the result using bleu score

In [14]:
!sacrebleu NLLB.hi-kn.kn-filtered.kn.subword.test.desubword -i NLLB.kn.translated.desubword -m bleu

{
 "name": "BLEU",
 "score": 12.8,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3",
 "verbose_score": "40.4/16.8/9.9/6.4 (BP = 0.887 ratio = 0.893 hyp_len = 14961 ref_len = 16763)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.4.3"
}
[0m

In [19]:

target_test = 'NLLB.hi-kn.kn-filtered.kn.subword.test.desubword'
target_pred = 'NLLB.kn.translated.desubword'

refs = []
preds = []

def load_file(data_file, data_list):
  with open(data_file) as test:
    for line in test:
      line = line.strip()
      data_list.append(line)
    return data_list


refs = load_file(target_test, refs)
preds = load_file(target_pred, preds)

for i in range(2):
  print("Reference sentence:", refs[i])
  print("Translated sentence:", preds[i])


Reference sentence: ೫. ವ್ಯವಸ್ಥಾಪಕ ಉತ್ಪಾದಕತೆ ಸಂಬಂಧಪಟ್ಟಿದೆ.
Translated sentence: ಉತ್ಪಾದಕತೆಗೆ ಸಂಬಂಧಿಸಿದ ಅಂಶಗಳು.
Reference sentence: ಕಂದಹಾರ್ ವಿಮಾನ ನಿಲ್ದಾಣದ ಮೇಲೆ ರಾಕೆಟ್ ದಾಳಿ
Translated sentence: ಕಂದಹಾರ್ ವಿಮಾನ ನಿಲ್ದಾಣದಲ್ಲಿ ರಾಕೆಟ್ ದಾಳಿ
