In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import json
import numpy as np
import re
import string
import nltk
import pandas as pd
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Colab-only: mount Google Drive at /content/drive. The data, checkpoint and
# output paths used in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the raw test-set JSON from Drive.
# The encoding is pinned because JSON text is UTF-8, whereas open() would
# otherwise fall back to the platform's locale default.
with open('/content/drive/MyDrive/test_data1_final.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Parallel accumulators filled by the extraction loop further down:
# en_test holds the English source sentences, id_test the matching entry ids.
en_test = []
id_test = []

In [None]:
!pip install spacy



In [None]:
import spacy

In [None]:
# Collect the English source sentences of the "Test" split for the
# English-Bengali pair, together with their ids (kept in parallel order).
# The lists are reset here so that re-running this cell does not append the
# same entries a second time.
en_test = []
id_test = []

for lang_pair, lang_data in data.items():
  if lang_pair != "English-Bengali":
    continue
  for ty, entries in lang_data.items():
    print("TYPE is ", ty)
    if ty != "Test":
      continue
    # `entry_id` rather than `id`, so the builtin id() is not shadowed at notebook scope.
    for entry_id, entry_data in entries.items():
      en_test.append(entry_data['source'])
      id_test.append(entry_id)

print(len(id_test))

TYPE is  Test
19672


In [None]:
def process(sentence): # cleaning the sentences
  """Return `sentence` without punctuation and digit characters.

  Punctuation means the characters in `string.punctuation`; digits are
  whatever `str.isdigit()` accepts. Every other character, whitespace
  included, is kept in its original order.
  """
  kept = []
  for ch in sentence:
    if ch in string.punctuation or ch.isdigit():
      continue
    kept.append(ch)
  return ''.join(kept)

In [None]:
# Load spaCy's small English pipeline; work_eng uses its tokenizer.
nlp_en = spacy.load("en_core_web_sm")

In [None]:
# Tokenize the sentences
def work_eng(sentences):
  """Lower-case, clean and tokenize each sentence.

  Every sentence is lower-cased, passed through `process`, and split with
  the `nlp_en` tokenizer. Returns one list of token strings per input
  sentence, in input order. A progress line is printed after every
  10000 sentences.
  """
  tokenized = []
  for count, raw in enumerate(sentences, start=1):
    cleaned = process(raw.lower())
    tokenized.append([tok.text for tok in nlp_en.tokenizer(cleaned)])
    if count % 10000 == 0:
      print(f"DONE {count}")
  return tokenized

In [None]:
# Rebinds en_test from raw strings to lists of tokens. work_eng calls .lower()
# on every item, so this cell is meant to run once, right after extraction.
en_test = work_eng(en_test)

DONE 10000


In [None]:
# Persist the tokenized test set: one row per sentence, one column per token position.
en_test_df = pd.DataFrame(en_test)
en_test_df.to_csv('/content/drive/MyDrive/en_test2.csv', index=False) # saving to a file

In [None]:
# Use the pre-saved tokenized files for the train and validation datasets

def _rows_without_nan(frame):
  """Return the rows of `frame` as lists, skipping cells pandas reports as missing."""
  # NOTE(review): read_csv's default NA parsing may also turn literal tokens
  # such as "null" or "nan" into missing values, which are then dropped here — confirm.
  return [[cell for cell in row if not pd.isna(cell)] for row in frame.values.tolist()]

en_train_df = pd.read_csv('/content/drive/MyDrive/datasets/en_train2.csv')
hn_train_df = pd.read_csv('/content/drive/MyDrive/datasets/hn_train2.csv')
en_val_df = pd.read_csv('/content/drive/MyDrive/datasets/en_val2.csv')

en_train = _rows_without_nan(en_train_df)
hn_train = _rows_without_nan(hn_train_df)
en_val = _rows_without_nan(en_val_df)

  en_train_df = pd.read_csv('/content/drive/MyDrive/datasets/en_train2.csv')
  hn_train_df = pd.read_csv('/content/drive/MyDrive/datasets/hn_train2.csv')
  en_val_df = pd.read_csv('/content/drive/MyDrive/datasets/en_val2.csv')


In [None]:
# Token -> integer id vocabularies: en_dict for the encoder side, hn_dict for the decoder side.
en_dict = {}
hn_dict = {}

# Creating the dictionaries

def add(dict, word):
  """Give `word` the next free id in `dict` unless it already has one."""
  # setdefault only inserts when the key is missing; the id is the size of
  # the mapping at the moment of the call.
  dict.setdefault(word, len(dict))

# The special tokens are registered first, so in both (initially empty)
# vocabularies '<EOS>' gets id 0, '<SOS>' id 1 and '<PAD>' id 2.
extras = ['<EOS>', '<SOS>', '<PAD>']

for word in extras:
  for vocab in (en_dict, hn_dict):
    add(vocab, word)

# Target vocabulary comes from the training targets; the source vocabulary
# from the training sentences followed by the validation sentences.
for vocab, corpora in ((hn_dict, (hn_train,)), (en_dict, (en_train, en_val))):
  for corpus in corpora:
    for sentence in corpus:
      for word in sentence:
        add(vocab, word)

In [None]:
print(len(en_dict))
print(len(hn_dict))

57487
99392


In [None]:
# Fixed length of every encoded sentence, counting the <SOS> and <EOS> markers;
# translate_tensor also uses it to cap the number of decoding steps.
MAX_LEN = 23 # 23 for Bengali, 27 for Hindi

In [None]:
def append(sentence, max_len=None):
  """Frame a token list to a fixed length without modifying the input.

  The result is '<SOS>', then the tokens, then '<PAD>' fillers, then a
  final '<EOS>'. Sentences that are too long are truncated from the end.
  Note that the padding sits *before* '<EOS>'.

  Args:
    sentence: list of tokens. It is left untouched; editing it in place
      would stack extra markers whenever the same data is encoded twice.
    max_len: total length of the result (for max_len >= 1); defaults to
      the notebook-level MAX_LEN.

  Returns:
    A new list holding the framed sentence.
  """
  if max_len is None:
    max_len = MAX_LEN
  # Everything except the closing '<EOS>' must fit into max_len - 1 slots.
  body_len = max(max_len - 1, 0)
  framed = ['<SOS>', *sentence][:body_len]
  framed.extend(['<PAD>'] * (body_len - len(framed)))
  framed.append('<EOS>')
  return framed

In [None]:
def encode(dict, sentence):
  """Frame `sentence` with `append` and map every token to its id in `dict`.

  Tokens missing from `dict` are encoded as 2, which is the id '<PAD>'
  receives when the vocabularies are built in this notebook; there is no
  separate unknown-word id.
  """
  return [dict.get(word, 2) for word in append(sentence)]

In [None]:
def encode_sentences(dict, sentences):
  """Encode every sentence with `encode`, returning the id lists in input order."""
  return [encode(dict, sentence) for sentence in sentences]

In [None]:
# Turn the token lists into fixed-length id sequences. Test-set words that are
# absent from en_dict get the fallback id used by encode().
en_tokentrain = encode_sentences(en_dict, en_train)
hn_tokentrain = encode_sentences(hn_dict, hn_train)
en_tokentest = encode_sentences(en_dict, en_test)

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2024-11-25 19:29:40--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-11-25 19:29:41--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-11-25 19:29:42--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Embedding width: sizes the GloVe-initialised matrix (filled from the 200d file)
# and is also passed to the Encoder and Decoder constructors.
embedding_size = 200

In [None]:
glove_path = "glove.6B.200d.txt"

# word -> vector lookup parsed from the GloVe text file
# (each line is a word followed by its whitespace-separated coefficients).
embeddings_index = {}
with open(glove_path, 'r', encoding='utf-8') as f:
  for line in f:
    fields = line.split()
    embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

vocab_size = len(en_dict)
# Start from small random vectors so that words without a GloVe entry still
# get an embedding row.
embedding_matrix = np.random.uniform(-0.1, 0.1, (vocab_size, embedding_size))

# Overwrite the rows of all vocabulary words that GloVe knows.
for word, idx in en_dict.items():
  pretrained = embeddings_index.get(word)
  if pretrained is not None:
    embedding_matrix[idx] = pretrained

In [None]:
# Encoder LSTM
class Encoder(nn.Module):
  """LSTM encoder on top of a frozen, pretrained embedding table.

  The table is built from the notebook-level `embedding_matrix`;
  `input_size` is accepted but not used by this class.
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    # Pretrained embedding; freeze=True because these weights should not be changed.
    pretrained = torch.FloatTensor(embedding_matrix)
    self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)
    self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers)

  def forward(self, x):
    """Embed the token ids in `x`, run the LSTM and return its final (hidden, cell) state.

    The LSTM is created without `batch_first`, so `x` is expected to be
    time-major, i.e. (sequence length, batch).
    """
    _, (hidden, cell) = self.lstm(self.embedding(x))
    return hidden, cell

In [None]:
class Decoder(nn.Module):
  """Single-step LSTM decoder.

  At every step the embedding of the previous token is concatenated with a
  context vector, which is a tanh-activated linear projection of the top
  layer of the incoming hidden state.
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.embedding = nn.Embedding(input_size, embedding_size)

    # The LSTM consumes [token embedding ; context], hence the widened input size.
    self.lstm = nn.LSTM(embedding_size + hidden_size, hidden_size, num_layers)
    self.attention = nn.Linear(hidden_size, hidden_size)
    self.fun = nn.Linear(hidden_size, input_size)

  def forward(self, x, hidden, cell):
    """Advance the decoder by one step.

    Args:
      x: previous token ids, one per batch element.
      hidden, cell: current LSTM state.

    Returns:
      (predictions, hidden, cell): vocabulary scores per batch element
      and the updated LSTM state.
    """
    # Add the length-1 time axis the LSTM expects.
    step_input = self.embedding(x.unsqueeze(0))

    # hidden[-1] is the state of the top LSTM layer.
    context = torch.tanh(self.attention(hidden[-1])).unsqueeze(0)

    lstm_in = torch.cat((step_input, context), dim=2)
    outputs, (hidden, cell) = self.lstm(lstm_in, (hidden, cell))

    # Drop the time axis again before returning the scores.
    return self.fun(outputs).squeeze(0), hidden, cell

In [None]:
# Optimisation settings (learning_rate feeds both Adam optimizers,
# batch_size the training DataLoader).
num_epochs = 13
learning_rate = 0.001
batch_size = 50
# Vocabulary sizes: the source size is passed to the Encoder, the target size
# sets the Decoder's embedding table and output layer.
inp_size = len(en_dict)
out_size = len(hn_dict)

# LSTM width and depth, shared by encoder and decoder.
hidden_size = 512
num_layers = 2

In [None]:
# Build the encoder and move it to the selected device; the bare name shows its layer summary.
encoder = Encoder(inp_size, embedding_size, hidden_size, num_layers).to(device)
encoder

Encoder(
  (embedding): Embedding(57487, 200)
  (lstm): LSTM(200, 512, num_layers=2)
)

In [None]:
# Build the decoder over the target vocabulary and move it to the selected device;
# the bare name shows its layer summary.
decoder = Decoder(out_size, embedding_size, hidden_size, num_layers).to(device)
decoder

Decoder(
  (embedding): Embedding(99392, 200)
  (lstm): LSTM(200, 512, num_layers=2)
  (fun): Linear(in_features=512, out_features=99392, bias=True)
)

In [None]:
# Loss plus one Adam optimizer per network, both with the same learning rate.
# The optimizers have to exist so their saved state can be restored from the checkpoint.
criterion = nn.CrossEntropyLoss()
enc_optimizer = torch.optim.Adam(encoder.parameters(), lr = learning_rate)
dec_optimizer = torch.optim.Adam(decoder.parameters(), lr = learning_rate)

In [None]:
# Encoded sentences as integer arrays (every row has MAX_LEN ids).
train_x = np.array(en_tokentrain)
train_y = np.array(hn_tokentrain)
test_x = np.array(en_tokentest)

train_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
test_ds = TensorDataset(torch.from_numpy(test_x))

# The test loader yields one sentence at a time and is not shuffled, so the
# translations come out in the same order as id_test.
train_dl = DataLoader(train_ds, shuffle=False, batch_size=batch_size, drop_last=True)
test_dl = DataLoader(test_ds, shuffle=False, batch_size=1, drop_last=True)

In [None]:
# Restore the previously saved models and optimizer states to avoid retraining.
path = '/content/drive/MyDrive/model_checkpoint_bengali_233.pth'
# map_location lets a checkpoint that was saved on a GPU be restored on a
# CPU-only runtime as well (device already falls back to the CPU).
checkpoint = torch.load(path, map_location=device)

encoder.load_state_dict(checkpoint['encoder_state_dict'])
decoder.load_state_dict(checkpoint['decoder_state_dict'])
enc_optimizer.load_state_dict(checkpoint['enc_optimizer_state_dict'])
dec_optimizer.load_state_dict(checkpoint['dec_optimizer_state_dict'])

  checkpoint = torch.load(path)


In [None]:
# Inverse of hn_dict: target-side id -> token, used to turn decoder output back into words.
hn_revdict = {index: token for token, index in hn_dict.items()}

In [None]:
# Ids of the special tokens. They are only ever compared with, or fed to, the
# decoder (target side), so they are read from the target vocabulary. Both
# vocabularies register the special tokens first, so the values are the same.
SOS = hn_dict['<SOS>']
EOS = hn_dict['<EOS>']
PAD = hn_dict['<PAD>']

In [None]:
# Translate function as defined before
def translate_tensor(sentence_tensor):
  """Greedy-decode one encoded source sentence into target-vocabulary ids.

  Args:
    sentence_tensor: token-id tensor of a single sentence in the layout the
      encoder expects (the loops in this notebook pass shape (MAX_LEN, 1)).

  Returns:
    A list of ints that starts with SOS. It ends with EOS if the decoder
    emitted one within MAX_LEN - 1 steps; otherwise decoding stops at that cap.
  """
  input_tensor = sentence_tensor.to(device)
  out = [SOS]

  # Inference only: no_grad avoids building an autograd graph at every step.
  # The optimizers play no role in decoding, so they are not touched here.
  with torch.no_grad():
    hidden, cell = encoder(input_tensor)

    last = torch.tensor([SOS], dtype=torch.long, device=device)
    for _ in range(1, MAX_LEN):
      output, hidden, cell = decoder(last, hidden, cell)
      # Greedy choice; the resulting 1-element tensor is fed back as the next input.
      last = output.argmax(1)
      token = last.item()
      out.append(token)
      if token == EOS:
        break

  return out

In [None]:
def translate_lang(sentence_tensor):
  """Translate one encoded sentence and return it as space-separated target words.

  The SOS, EOS and PAD marker ids are dropped from the decoded sequence;
  every remaining id is looked up in `hn_revdict`.
  """
  special_ids = (EOS, PAD, SOS)
  words = [
    hn_revdict[token_id]
    for token_id in translate_tensor(sentence_tensor)
    if token_id not in special_ids
  ]
  return " ".join(words)

In [None]:
# Spot check: print the translations of the first 11 test sentences.
for idx, batch in zip(range(11), test_dl):
  # Batches come as (1, MAX_LEN); the encoder wants the sequence axis first.
  input_tensor = batch[0].to(device).transpose(0, 1)
  print(translate_lang(input_tensor))

বর্তমান ঘটনা ঘটনা
ভগবান রাম তাঁর নিজের ছেলেকে জিজ্ঞাসা করার চেষ্টা করেছিলেন কিন্তু স্বামী এবং কাপুর হয়ে তাঁর দৃষ্টি থাকার জন্য ।
দিন আগে আর জল দিয়ে লাগিয়ে নিন যে জল আটকে যাওয়া ইত্যাদি বেশি ঠান্ডা হয়ে যায় ৷
যে সে যখন পরিবারের সাথে লড়াই করেন তখন তিনি আবার হাল্কা মেরে তখন তাঁর বাড়িতে ঢুকে পড়ে এবং সে স্পর্শ করে
ন্যুনতম আখ চীনাবাদাম খাওয়া খুব খুব বেশী ।
আমার স্কুলের সাথে ডিনার কি
সালে ভারত থেকে জানা যায় যে ভারতে অনেক হিট রূপে পিছিয়ে ছিলেন না ।
সর্দার প্যাটেলের প্রাক্তন কার্যনির্বাহী পদ্ধতি হলেন হলেন মুখ্যসচিব ।
মজার কারণ যে সারা বিশ্বে কিছু কিছু জায়গা আর এমন এক সুন্দর জায়গা আছে যা আপনি নিজের পরিচয় করতে পারেন ৷
এটি তার সাথে নিজস্ব স্বতন্ত্র ।
এখানে সময়ের সাথে লাগোয়া ব্যবসার জন্য বিখ্যাত ৷


In [None]:
# Run the translation over the whole test set

val_outs = []
print(len(test_dl))
for idx, batch in enumerate(test_dl):
  # Batches come as (1, MAX_LEN); the encoder wants the sequence axis first.
  source = batch[0].to(device).transpose(0, 1)
  val_outs.append(translate_lang(source))

  # Progress line every 1000 sentences.
  if idx % 1000 == 0:
    print(f"Done, {idx}")

19672
Done, 0
Done, 1000
Done, 2000
Done, 3000
Done, 4000
Done, 5000
Done, 6000
Done, 7000
Done, 8000
Done, 9000
Done, 10000
Done, 11000
Done, 12000
Done, 13000
Done, 14000
Done, 15000
Done, 16000
Done, 17000
Done, 18000
Done, 19000


In [None]:
# Submission table: one row per test sentence, pairing each id with its translation.
answer = pd.DataFrame({"ID": id_test, "Translation": val_outs})

In [None]:
answer

Unnamed: 0,ID,Translation
0,177039,বর্তমান ঘটনা ঘটনা
1,177040,ভগবান রাম তাঁর নিজের ছেলেকে জিজ্ঞাসা করার চেষ্...
2,177041,দিন আগে আর জল দিয়ে লাগিয়ে নিন যে জল আটকে যাওয়া...
3,177042,যে সে যখন পরিবারের সাথে লড়াই করেন তখন তিনি আবা...
4,177043,ন্যুনতম আখ চীনাবাদাম খাওয়া খুব খুব বেশী ।
...,...,...
19667,196706,এই জন্য সারা দিন আমাদের ফুসফুসে বেশি গাঁট পড়ে ৷
19668,196707,আজকের থাই মধ্যে প্রায় টি টি দেশ জুড়ে যারা প্র...
19669,196708,এটা হাল্কা গরম জলে হলুদ রেড মিট বা ফলের রস ট্য...
19670,196709,ইউ শেষ হতে আবার অতিরিক্ত দেরী করা উচিত ।


In [None]:
import os

# Create the output folder first, so the save cannot fail on a missing
# directory after the long translation run.
answer_path = "/content/drive/MyDrive/answers v1/answer_test_bengali_1.csv"
os.makedirs(os.path.dirname(answer_path), exist_ok=True)
answer.to_csv(answer_path, index=False)