In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp /content/drive/MyDrive/bible_verses.csv /content/

In [3]:
import csv

# Input file with both English and Nagamese columns
input_file = "/content/bible_verses.csv"

# Output files for each language
english_file = "english.csv"
nagamese_file = "nagamese.csv"

# Open the input file and read contents
with open(input_file, mode="r", encoding="utf-8") as infile:
    reader = csv.DictReader(infile)

    # Open output files
    with open(english_file, mode="w", encoding="utf-8", newline="") as eng_file, \
         open(nagamese_file, mode="w", encoding="utf-8", newline="") as naga_file:

        # Set up CSV writers for the output files
        eng_writer = csv.writer(eng_file)
        naga_writer = csv.writer(naga_file)

        # Write headers for each file
        eng_writer.writerow(["English"])
        naga_writer.writerow(["Nagamese"])

        # Iterate over each row and write English and Nagamese texts to respective files
        for row in reader:
            eng_writer.writerow([row["English"]])
            naga_writer.writerow([row["Nagamese"]])

print("Files 'english.csv' and 'nagamese.csv' created successfully.")

Files 'english.csv' and 'nagamese.csv' created successfully.


In [4]:
# Create a directory named as machine_translation

!mkdir /content/machine_translation


In [5]:
!mv -t /content/machine_translation english.csv nagamese.csv bible_verses.csv

In [6]:
%cd /content/machine_translation
%pwd

/content/machine_translation


'/content/machine_translation'

In [7]:
!git clone https://github.com/ymoslem/MT-Preparation.git

Cloning into 'MT-Preparation'...
remote: Enumerating objects: 305, done.[K
remote: Counting objects: 100% (131/131), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 305 (delta 66), reused 114 (delta 58), pack-reused 174 (from 1)[K
Receiving objects: 100% (305/305), 84.51 KiB | 1.17 MiB/s, done.
Resolving deltas: 100% (149/149), done.


In [8]:

# Install the requirements
!pip3 install -r MT-Preparation/requirements.txt



In [9]:

# get the dataset
with open("/content/machine_translation/english.csv", 'r', encoding='utf-8') as en_file:
    eng_reader = en_file.read()

with open("/content/machine_translation/nagamese.csv", 'r', encoding='utf-8') as ng_file:
    naga_reader = ng_file.read()

# DATA FILTERING

In [10]:
# python3 filter.py <source_file_path> <target_file_path> <source_lang> <target_lang>

!python3 MT-Preparation/filtering/filter.py "/content/machine_translation/nagamese.csv" "/content/machine_translation/english.csv" ng en

Dataframe shape (rows, columns): (4565, 2)
--- Rows with Empty Cells Deleted	--> Rows: 4565
--- Duplicates Deleted			--> Rows: 4544
--- Source-Copied Rows Deleted		--> Rows: 4543
--- Too Long Source/Target Deleted	--> Rows: 4420
--- HTML Removed			--> Rows: 4420
--- Rows will remain true-cased		--> Rows: 4420
--- Rows with Empty Cells Deleted	--> Rows: 4420
--- Rows Shuffled			--> Rows: 4420
--- Source Saved: /content/machine_translation/nagamese.csv-filtered.ng
--- Target Saved: /content/machine_translation/english.csv-filtered.en


# Tokenization/Sub-wording

In [11]:
!ls MT-Preparation/subwording/

1-train_bpe.py		  1-train_unigram.py  3-desubword.py
1-train_unigram_joint.py  2-subword.py	      spm_to_vocab.py


In [12]:
# Train a SentencePiece model for subword tokenization
!python MT-Preparation/subwording/1-train_unigram.py "/content/machine_translation/nagamese.csv-filtered.ng" "/content/machine_translation/english.csv-filtered.en"


sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=/content/machine_translation/nagamese.csv-filtered.ng --model_prefix=source --vocab_size=50000 --hard_vocab_limit=false --split_digits=true
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /content/machine_translation/nagamese.csv-filtered.ng
  input_format: 
  model_prefix: source
  model_type: UNIGRAM
  vocab_size: 50000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed

In [13]:
!ls

bible_verses.csv  english.csv-filtered.en  nagamese.csv		     source.model  target.model
english.csv	  MT-Preparation	   nagamese.csv-filtered.ng  source.vocab  target.vocab


In [14]:
# Subword the dataset
!python3 MT-Preparation/subwording/2-subword.py source.model target.model "/content/machine_translation/nagamese.csv-filtered.ng" "/content/machine_translation/english.csv-filtered.en"


Source Model: source.model
Target Model: target.model
Source Dataset: /content/machine_translation/nagamese.csv-filtered.ng
Target Dataset: /content/machine_translation/english.csv-filtered.en
Done subwording the source file! Output: /content/machine_translation/nagamese.csv-filtered.ng.subword
Done subwording the target file! Output: /content/machine_translation/english.csv-filtered.en.subword


In [15]:
# Split the dataset into training set, development set, and test set
# Development and test sets should be between 1000 and 5000 segments (here we chose 2000)
!python MT-Preparation/train_dev_split/train_dev_test_split.py 500 500 "/content/machine_translation/nagamese.csv-filtered.ng.subword" "/content/machine_translation/english.csv-filtered.en.subword"


Dataframe shape: (4420, 2)
--- Empty Cells Deleted --> Rows: 4420
--- Wrote Files
Done!
Output files
/content/machine_translation/nagamese.csv-filtered.ng.subword.train
/content/machine_translation/english.csv-filtered.en.subword.train
/content/machine_translation/nagamese.csv-filtered.ng.subword.dev
/content/machine_translation/english.csv-filtered.en.subword.dev
/content/machine_translation/nagamese.csv-filtered.ng.subword.test
/content/machine_translation/english.csv-filtered.en.subword.test


In [16]:
# !ls
!mv /content/machine_translation /content/drive/MyDrive/machine_translation/