In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import json
import numpy as np
import re
import string
import nltk
import pandas as pd
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the stored CSV splits from Drive and turn every row into a list of its non-missing cells.

def _rows_without_nan(frame):
  """Return the rows of `frame` as plain lists with every NaN cell removed."""
  return [[cell for cell in record if not pd.isna(cell)] for record in frame.values.tolist()]

en_train_df = pd.read_csv('/content/drive/MyDrive/datasets/en_train2.csv')
hn_train_df = pd.read_csv('/content/drive/MyDrive/datasets/hn_train2.csv')
en_val_df = pd.read_csv('/content/drive/MyDrive/datasets/en_val2.csv')

en_train = _rows_without_nan(en_train_df)
hn_train = _rows_without_nan(hn_train_df)
en_val = _rows_without_nan(en_val_df)

  en_train_df = pd.read_csv('/content/drive/MyDrive/datasets/en_train2.csv')
  hn_train_df = pd.read_csv('/content/drive/MyDrive/datasets/hn_train2.csv')
  en_val_df = pd.read_csv('/content/drive/MyDrive/datasets/en_val2.csv')


In [None]:
# Number of sentences in each split, one per line.
print(len(en_train), len(hn_train), len(en_val), sep="\n")

68849
68849
9836


In [None]:
# First source sentence and its target-side counterpart, one per line.
print(en_train[0], hn_train[0], sep="\n")

['do', 'not', 'forget', 'to', 'visit', 'the', 'point', 'where', 'the', 'narmada', 'flowing', 'through', 'the', 'marble', 'rocks', 'interchanges', 'its', 'calmness', 'and', 'serenity', 'into', 'insouciance']
['এই', 'জায়গাগুলো', 'দেখতে', 'ভুলো', 'না', 'যেখানে', 'নর্মদা', 'নদী', 'মার্বেল', 'পাথরের', 'পাহাড়ের', 'মধ্য', 'দিয়ে', 'প্রবাহিত', 'হচ্ছে', 'এবং', 'নিজের', 'শান্তি', 'ও', 'সৌন্দর্যকে', 'অনাসক্তিতে', 'পরিণত', 'করছে', '।']


In [None]:
# Token -> integer id vocabularies: en_dict for the source side, hn_dict for the target side.
en_dict = {}
hn_dict = {}

# Creating the dictionaries of the training datasets

def add(dict, word):
  """Register `word` in the vocabulary `dict` under the next free index.

  Words already present keep their existing index. Returns None.
  """
  # len(dict) is evaluated before the lookup, so a new word gets index == current size.
  dict.setdefault(word, len(dict))
  return

# Special tokens are registered first, so they receive ids 0, 1, 2 in both vocabularies.
extras = ['<EOS>', '<SOS>', '<PAD>']

for vocab in (en_dict, hn_dict):
  for token in extras:
    add(vocab, token)

# Then every training token, in order of first appearance, per language.
for corpus, vocab in ((en_train, en_dict), (hn_train, hn_dict)):
  for sentence in corpus:
    for word in sentence:
      add(vocab, word)

# for sentence in en_val:
#   for word in sentence:
#     add(en_dict, word)

In [None]:
# Register every validation token in the source vocabulary as well, so each of them has an id.
for word in (token for sentence in en_val for token in sentence):
  add(en_dict, word)

In [None]:
print(len(en_dict))

57487


In [None]:
# Fixed sequence length used when framing sentences and when decoding.
MAX_LEN = 27

In [None]:
# appending SOS, EOS, and PAD to equal length
# appending SOS, EOS, and PAD to equal length
def append(sentence, max_len=None):
  """Return a framed, fixed-length copy of `sentence`.

  The result has exactly `max_len` tokens laid out as
  ``<SOS>, tokens..., <PAD>..., <EOS>``: the sentence is truncated or padded
  so that ``<EOS>`` is always the last element (i.e. it comes after any padding).

  Args:
    sentence: iterable of tokens. It is NOT modified; the previous version
      inserted/popped on the caller's list, so the raw data was corrupted and
      encoding the same sentence twice produced a doubled ``<SOS>``.
    max_len: total output length; defaults to the module-level ``MAX_LEN``.

  Returns:
    A new list of length `max_len`.

  Raises:
    ValueError: if `max_len` is smaller than 1 (no room for ``<EOS>``).
  """
  if max_len is None:
    max_len = MAX_LEN
  if max_len < 1:
    raise ValueError("max_len must be at least 1")

  body_len = max_len - 1  # everything except the trailing <EOS>
  framed = (['<SOS>'] + list(sentence))[:body_len]
  framed += ['<PAD>'] * (body_len - len(framed))
  framed.append('<EOS>')
  return framed

In [None]:
# encoding sentences into sequences
def encode(dict, sentence):
  """Frame `sentence` with append() and map each resulting token to its id in `dict`.

  Raises KeyError when a token (including the special markers) is missing from `dict`.
  """
  framed = append(sentence)
  return [dict[token] for token in framed]

In [None]:
def encode_sentences(dict, sentences):
  """Encode every sentence in `sentences` with encode(); returns a list of id lists."""
  return [encode(dict, sentence) for sentence in sentences]

In [None]:
# Turn each split into lists of token ids using the vocabulary of its language.
en_tokentrain = encode_sentences(en_dict, en_train)
hn_tokentrain = encode_sentences(hn_dict, hn_train)
en_tokenval = encode_sentences(en_dict, en_val)

In [None]:
# First encoded source / target sequence, one per line.
print(en_tokentrain[0], hn_tokentrain[0], sep="\n")

[1, 3, 4, 5, 6, 7, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0]
[1, 3, 4, 5, 6, 7, 8, 9, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0]


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2024-11-23 08:42:30--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-11-23 08:42:30--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-11-23 08:42:31--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Width of the word-embedding vectors used by the models.
embedding_size = 200

In [None]:
# Build the source-side embedding matrix from the 200-dimensional GloVe vectors.
glove_path = "glove.6B.200d.txt"

# word -> float32 vector, parsed from "<word> <v1> <v2> ..." lines.
embeddings_index = {}
with open(glove_path, 'r', encoding='utf-8') as glove_file:
  for raw_line in glove_file:
    fields = raw_line.split()
    embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Every row starts as small uniform noise; rows whose word has a GloVe vector are overwritten.
vocab_size = len(en_dict)
embedding_matrix = np.random.uniform(-0.1, 0.1, (vocab_size, embedding_size))

for token, row in en_dict.items():
  pretrained = embeddings_index.get(token)
  if pretrained is not None:
    embedding_matrix[row] = pretrained

In [None]:
# Encoder LSTM
class Encoder(nn.Module):
  """LSTM encoder over frozen, pretrained word embeddings.

  Args:
    input_size: source vocabulary size (kept for interface compatibility; the
      embedding table size is taken from the pretrained matrix itself).
    embedding_size: width of the embedding vectors fed to the LSTM.
    hidden_size: LSTM hidden-state width.
    num_layers: number of stacked LSTM layers.
    pretrained_embeddings: optional (vocab, embedding_size) array or tensor.
      When omitted, the module-level ``embedding_matrix`` is used, which is what
      the original class always did implicitly.

  Raises:
    ValueError: if the embedding matrix is not 2-D with `embedding_size` columns.
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, pretrained_embeddings=None):
    super(Encoder, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    if pretrained_embeddings is None:
      pretrained_embeddings = embedding_matrix  # notebook-level default
    weights = torch.as_tensor(pretrained_embeddings, dtype=torch.float32)
    if weights.dim() != 2 or weights.shape[1] != embedding_size:
      raise ValueError(
          f"expected embeddings of shape (vocab, {embedding_size}), got {tuple(weights.shape)}")

    # Pretrained embedding; freeze=True because these vectors should not be changed by training.
    self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)
    self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers)

  def forward(self, x):
    """Encode a batch of token ids and return the final LSTM state.

    The LSTM is created without ``batch_first``, so `x` is expected time-major,
    i.e. shaped (seq_len, batch).

    Returns:
      (hidden, cell), each shaped (num_layers, batch, hidden_size).
    """
    embedded = self.embedding(x)
    # Only the final state is passed on; the per-step outputs are not used.
    _, (hidden, cell) = self.lstm(embedded)
    return hidden, cell

In [None]:
# Decoder LSTM with Attention
class Decoder(nn.Module):
  """Single-step LSTM decoder whose input is augmented with a context vector.

  The context is ``tanh(Linear(hidden[-1]))``, i.e. it is computed from the top
  layer of the incoming hidden state only; no per-step encoder outputs are used.
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers):
    super().__init__()
    self.hidden_size, self.num_layers = hidden_size, num_layers

    # Learned target-side embedding (GloVe only covers English, so nothing pretrained here).
    self.embedding = nn.Embedding(input_size, embedding_size)

    # The LSTM consumes [embedding ; context], hence embedding_size + hidden_size inputs.
    self.lstm = nn.LSTM(embedding_size + hidden_size, hidden_size, num_layers)
    # Projection that produces the context vector from the top-layer hidden state.
    self.attention = nn.Linear(hidden_size, hidden_size)
    # Projection from the LSTM output to one score per vocabulary entry.
    self.fun = nn.Linear(hidden_size, input_size)

  def forward(self, x, hidden, cell):
    """Run one decoding step.

    `x` holds the previous token id for each batch element; it gets a leading
    time axis of length 1 before embedding. Returns ``(predictions, hidden, cell)``
    where `predictions` has the time axis squeezed away again.
    """
    token_embedding = self.embedding(x.unsqueeze(0))

    # Context vector from the top-layer hidden state, with a time axis to match the embedding.
    context = torch.tanh(self.attention(hidden[-1])).unsqueeze(0)

    # Concatenate along the feature axis.
    lstm_input = torch.cat((token_embedding, context), dim=2)

    step_output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
    # Map to vocabulary scores and drop the length-1 time axis.
    return self.fun(step_output).squeeze(0), hidden, cell

In [None]:
# Hyperparameters
num_epochs = 13          # passes over the training data
learning_rate = 0.001    # step size handed to the optimizers
batch_size = 50          # sentence pairs per training batch
inp_size = len(en_dict)  # source vocabulary size
out_size = len(hn_dict)  # target vocabulary size (number of output classes)

hidden_size = 512        # LSTM hidden-state width, shared by encoder and decoder
num_layers = 2           # stacked LSTM layers, shared by encoder and decoder

In [None]:
# Build the encoder on the selected device; the bare name below displays its layer summary.
encoder = Encoder(inp_size, embedding_size, hidden_size, num_layers).to(device)
encoder

Encoder(
  (embedding): Embedding(61101, 200)
  (lstm): LSTM(200, 512, num_layers=2)
)

In [None]:
# Build the decoder on the selected device; the bare name below displays its layer summary.
decoder = Decoder(out_size, embedding_size, hidden_size, num_layers).to(device)
decoder

Decoder(
  (embedding): Embedding(74785, 200)
  (lstm): LSTM(200, 512, num_layers=2)
  (fun): Linear(in_features=512, out_features=74785, bias=True)
)

In [None]:
# Defining the loss criterion and optimizers.
# CrossEntropyLoss is created without ignore_index, so every target position it is given is scored.
criterion = nn.CrossEntropyLoss()
# One Adam optimizer per network, both using the same learning rate.
enc_optimizer = torch.optim.Adam(encoder.parameters(), lr = learning_rate)
dec_optimizer = torch.optim.Adam(decoder.parameters(), lr = learning_rate)

In [None]:
# Batching the datasets
train_x = np.array(en_tokentrain)
train_y = np.array(hn_tokentrain)
test_x = np.array(en_tokenval)

train_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
test_ds = TensorDataset(torch.from_numpy(test_x))

# Training batches are reshuffled every epoch so the optimizer does not see the corpus in the
# same fixed file order each time (the loader was previously created with shuffle=False).
# drop_last=True keeps every training batch at exactly batch_size rows.
train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size, drop_last=True)
# Evaluation stays unshuffled, one sentence per batch, so outputs keep the input order.
test_dl = DataLoader(test_ds, shuffle=False, batch_size=1, drop_last=True)

In [None]:
# Special-token ids. They are fed to / compared with the decoder, which works in the target
# vocabulary, so they are read from hn_dict (previously en_dict; both vocabularies register
# the special tokens first, so the ids are the same).
SOS = hn_dict['<SOS>']
EOS = hn_dict['<EOS>']
PAD = hn_dict['<PAD>']

# Training loop

for epoch in range(num_epochs):
  losses = []
  for idx, batch in enumerate(train_dl):
    # The loader yields (batch, seq_len); the LSTMs are time-major, so transpose to (seq_len, batch).
    input_tensor = batch[0].to(device).transpose(0, 1)
    target_tensor = batch[1].to(device).transpose(0, 1)

    enc_optimizer.zero_grad()
    dec_optimizer.zero_grad() # resetting the gradients

    seq_len = target_tensor.shape[0]

    hidden, cell = encoder(input_tensor) # calling the encoder to get the initial hidden state

    # Teacher forcing: at step t the decoder is fed the true token t-1 and must predict token t.
    step_logits = []
    for t in range(1, seq_len):
      output, hidden, cell = decoder(target_tensor[t - 1], hidden, cell)
      step_logits.append(output)
    outputs = torch.stack(step_logits) # (seq_len - 1, batch, vocab)

    # Score positions 1..seq_len-1 only. Position 0 is <SOS>, which the decoder never predicts;
    # previously it was scored against an all-zero logit row, which added a constant
    # log(vocab) / seq_len to every reported loss without contributing any gradient.
    loss = criterion(outputs.reshape(-1, outputs.shape[2]), target_tensor[1:].reshape(-1))

    loss.backward()
    losses.append(loss.item())

    # Clip the gradient norm of each network so updates cannot become too large.
    torch.nn.utils.clip_grad_norm_(encoder.parameters(), max_norm=1)
    torch.nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=1)

    enc_optimizer.step()
    dec_optimizer.step() # updating the tuneable parameters

    # Report the running mean loss of the current epoch every 400 batches.
    if (idx % 400 == 0):
      val = sum(losses) / len(losses)
      print(f"DONE, {idx}, {val}")
  print(f"Epoch {epoch}" )

DONE, 0, 11.222258567810059
DONE, 400, 5.0741376002827785
DONE, 800, 4.760934595460451
DONE, 1200, 4.580705287752302
DONE, 1600, 4.460132755539255
Epoch 0
DONE, 0, 3.7315168380737305
DONE, 400, 3.818614262297861
DONE, 800, 3.7386180911022477
DONE, 1200, 3.6723610747366724
DONE, 1600, 3.626066800134767
Epoch 1
DONE, 0, 3.259782552719116
DONE, 400, 3.299537281740336
DONE, 800, 3.2290496046325834
DONE, 1200, 3.1747143778773173
DONE, 1600, 3.135569767532015
Epoch 2
DONE, 0, 2.8238754272460938
DONE, 400, 2.8794480214392455
DONE, 800, 2.8183450874466724
DONE, 1200, 2.7708697452036963
DONE, 1600, 2.737418176530675
Epoch 3
DONE, 0, 2.4850897789001465
DONE, 400, 2.5368535429462233
DONE, 800, 2.488795018969999
DONE, 1200, 2.4535987906213803
DONE, 1600, 2.4287478272279004
Epoch 4
DONE, 0, 2.208441734313965
DONE, 400, 2.2778837701626253
DONE, 800, 2.239657881108116
DONE, 1200, 2.2123864021626836
DONE, 1600, 2.1926273448552136
Epoch 5
DONE, 0, 1.9877806901931763
DONE, 400, 2.073401280174826
DONE, 8

In [None]:
# Reverse dictionary (id -> token) used to turn predicted id sequences back into target-language words.
hn_revdict = {v: k for k, v in hn_dict.items()}

In [None]:
# translating a sentence
def translate_tensor(sentence_tensor):
  input_tensor = sentence_tensor.to(device)

  enc_optimizer.zero_grad()
  dec_optimizer.zero_grad()

  len = MAX_LEN

  hidden, cell = encoder(input_tensor)

  out = [SOS]

  for t in range(1, len):
    last = torch.LongTensor([out[-1]]).to(device)
    output, hidden, cell = decoder(last, hidden, cell)
    best = output.argmax(1) # this is the most likely token
    out.append(best.to('cpu').item())
    if (best.item() == EOS): # if predicted end of sentence, break
      break

    last = best

  return out

In [None]:
def translate_lang(sentence_tensor):
  """Translate an encoded sentence and return the output as a space-separated string.

  The ids produced by translate_tensor() are mapped back to words through
  hn_revdict; the EOS, PAD and SOS markers are left out of the text.
  """
  specials = (EOS, PAD, SOS)
  words = [
      hn_revdict[token_id]
      for token_id in translate_tensor(sentence_tensor)
      if token_id not in specials
  ]
  return " ".join(words)

In [None]:
# Small sample of outputs to check: translate the first 11 batches of the evaluation loader.
for idx, batch in zip(range(11), test_dl):
  # (batch, seq_len) -> (seq_len, batch), the layout the encoder is called with.
  input_tensor = batch[0].to(device).transpose(0, 1)
  print(translate_lang(input_tensor))

बस इस पर मेरे साथ एक नया रास्ता मिलेगा ।
शराब और अनावश्यक डाइट देना ।
जौ में इसकी खेती मुख्यतः सिंचाई के लिए भी जानी जाती है क्योंकि यह भी एक महत्वपूर्ण फसल है और यह भी है कि जस्ता
इस चूर्ण के चलते तीन दिन पोस्टइन्क्यूबेशन में से अधिक सोया भोजन को तैयार किया जाता है ।
संसार का पौधा संसार का हराभरा एक समुद्री भूमि में फैला हुआ है ।
तो यह शीर्ष वर्ग है लेकिन कुछ भी लोकल वेरिएबल में हैं ।
कोलोरेक्टल कैंसर फेफड़े स्तन कैंसर हृदय रोग सुन्न व रीढ़ की हड्डी में कैंसर का खतरा बहुत बढ़ जाता है ।
तो अगर यहां समाप्त हो तो फिर हमें यहां दो या तीन बार करें ।
बस सड़क के माध्यम से घिरा है जो सड़क द्वारा बनाया जाता है ।
महाबली को पेश किया गया था और उस स्थान पर उसके नाम के ऊपर पहुँच गए थे और वह इस शहर के नाम पर बनाया
यह दुनिया के सबसे पुराने और प्रसिद्ध नृत्य चलचित्र वास्तव में लोक नृत्य के सबसे लोकप्रिय पक्ष में से एक है ।


In [None]:
# Show the first 11 entries of en_val for comparison with the sample translations.
for i in range(11):
  print(en_val[i])

['<SOS>', 'somebody', 'on', 'this', 'side', 'what', 'will', 'be', 'my', 'goaltest', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<EOS>']
['<SOS>', 'avoid', 'alcohol', 'and', 'illicit', 'drugs', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<EOS>']
['<SOS>', 'barley', 'is', 'also', 'used', 'for', 'malt', 'production', 'which', 'is', 'principally', 'used', 'in', 'brewing', 'industry', 'and', 'proving', 'itself', 'as', 'a', 'good', 'source', 'of', 'better', 'rural', 'livelihood', '<EOS>']
['<SOS>', 'rana', 'daggubati', 'shed', ' ', 'kilograms', 'for', 'this', 'film', 'by', 'eating', 'vegetarian', 'food', 'for', 'six', 'weeks', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<EOS>']
['<SOS>', 'cradle', 'mountainlake'

In [None]:
# Running the model on the whole dataset; translations are collected in loader order.
val_outs = []
print(len(test_dl))
for idx, batch in enumerate(test_dl):
  # (batch, seq_len) -> (seq_len, batch), the layout the encoder is called with.
  input_tensor = batch[0].to(device).transpose(0, 1)
  val_outs.append(translate_lang(input_tensor))

  # Progress marker every 1000 sentences.
  if not idx % 1000:
    print(f"Done, {idx}")

11543
Done, 0
Done, 1000
Done, 2000
Done, 3000
Done, 4000
Done, 5000
Done, 6000
Done, 7000
Done, 8000
Done, 9000
Done, 10000
Done, 11000


In [None]:
# Finding the ids of the validation data
id_val = []
# The file holds non-ASCII text, so the encoding is stated explicitly instead of relying on
# the platform default.
with open('/content/drive/MyDrive/Data ML Comp 2/val_data1.json', 'r', encoding='utf-8') as file:
  data = json.load(file)

for lang_pair, lang_data in data.items():
  if lang_pair != "English-Hindi":
    continue
  for ty, entries in lang_data.items():
    print("TYPE is ", ty)
    if ty == "Train":
      continue # training entries carry no ids we need here
    # Keep the entry ids in file order; iterating the keys avoids shadowing the builtin `id`.
    id_val.extend(entries.keys())

print(len(id_val))

TYPE is  Validation
11543


In [None]:
# Submission table: one row per validation entry, pairing its id with the model's translation.
answer = pd.DataFrame({"ID": id_val, "Translation": val_outs})

In [None]:
# Display the submission table.
answer

Unnamed: 0,ID,Translation
0,505511,बस इस पर मेरे साथ एक नया रास्ता मिलेगा ।
1,505512,शराब और अनावश्यक डाइट देना ।
2,505513,जौ में इसकी खेती मुख्यतः सिंचाई के लिए भी जानी...
3,505514,इस चूर्ण के चलते तीन दिन पोस्टइन्क्यूबेशन में ...
4,505515,संसार का पौधा संसार का हराभरा एक समुद्री भूमि ...
...,...,...
11538,517049,भीम अपने पति के साथ उनके पुत्र बने थे और दो दि...
11539,517050,यह किस्म ज्यादातर घास के मैदान में लगाई जा सकत...
11540,517051,मुझे हरियाणा में आंध्र प्रदेश में सबसे पास का ...
11541,517052,में सेंट लुइस की लगभग आधी फुट ऊंचा दर में फैले...


In [None]:
# Saving the answer to a CSV file on Drive; index=False leaves the row index out of the file.
answer.to_csv("/content/drive/MyDrive/answers v1/answersH3.csv", index=False) # saving answer to file

In [None]:
# Saving the model to a file to reuse: weights of both networks plus both optimizer states.
checkpoint_path = '/content/drive/MyDrive/model_checkpoint_hindi_271.pth'

checkpoint = {
    'encoder_state_dict': encoder.state_dict(),
    'decoder_state_dict': decoder.state_dict(),
    'enc_optimizer_state_dict': enc_optimizer.state_dict(),
    'dec_optimizer_state_dict': dec_optimizer.state_dict(),
}
torch.save(checkpoint, checkpoint_path)