# **NLP From Scratch**: Translation with a Sequence to Sequence Network and Attention
  + In this project we will be teaching a neural network to translate from French to English.

    -  [KEY: > input, = target, < output]
      
       ####  > il est en train de peindre un tableau .  
       ####  = he is painting a picture .
       ####  < he is painting a picture . 

In [36]:
from __future__ import  unicode_literals, print_function, division

from io import open
import unicodedata
import re
import random 

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Edit with Google Colab,, put images and explanations

In [37]:
S = 0 #start of sentence
E = 1 #end of sentnce

class Lang:
    def __init__(self, langName):
        self.langName = langName
        self.wtoi = {} #word to indx
        self.wordToCount = {}
        self.itow = {0:'<S>', 1:'<E'} # indx to word
        self.nwords = 2
    
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)
    
    def addWord(self, word):
        if word not in self.wtoi:
            self.wtoi[word] = self.nwords
            self.wordToCount[word] = 1
            self.itow[self.nwords] = word
            self.nwords += 1  # Count S and E
        
        else:
            self.wordToCount[word] += 1
    




# The files are all in Unicode, to simplify we will turn Unicode characters to ASCII, make everything lowercase, and trim most punctuation.

In [38]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters

def normalizStr(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r'([.!?])', r" \1", s)
    s = re.sub(r'[^a-zA-Z!?]+', r' ', s)
    return s.strip()


#### To read the data file we will split the file into lines, and then split lines into pairs. The files are all English → Other Language, so if we want to translate from Other Language → English I added the reverse flag to reverse the pairs.

In [39]:

def loadData(filePath, reverse=False):
    pairs = []
    with open(filePath, 'r', encoding='utf-8') as f:
        for line in f:
            eng, fre = line.strip().split('\t') #english french

            if reverse:
                pairs.append([fre, eng])
            
            else:
                pairs.append([eng, fre])

    return pairs    

##### Since there are a lot of example sentences and we want to train something quickly, we’ll trim the data set to only relatively short and simple sentences. Here the maximum length is 10 words (that includes ending punctuation) and we’re filtering to sentences that translate to the form “I am” or “He is” etc. (accounting for apostrophes replaced earlier).

In [40]:
MAX_LEN = 10

engPrefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filtrPair(p):
    return len(p[0].split(' ')) < MAX_LEN and len(p[1].split(' ')) < MAX_LEN and p[1].startswith(engPrefixes)

def filtrPairs(pairs):
    return [pair for pair in pairs if filtrPair(pair)]


# The full process for preparing the data is:
   
   + Read text file and split into lines, split lines into pairs

   + Normalize text, filter by length and content

   + Make word lists from sentences in pairs

In [41]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizStr(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [42]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))  # Check if pairs were loaded correctly

    pairs = filtrPairs(pairs)
    print("Trimmed to %s sentence pairs after filtering" % len(pairs))  # Check after filtering
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.langName, input_lang.nwords)
    print(output_lang.langName, output_lang.nwords)
    if not pairs:
        print("Warning: No pairs found after filtering. Check data or filtering criteria.")
    
    # The rest of your code for counting words etc.
    return input_lang, output_lang, pairs


In [43]:
inputLang, outputLang, pairs = prepareData('eng', 'fra', reverse=True)
print(random.choice(pairs))


Reading lines...
Read 135842 sentence pairs
Trimmed to 11445 sentence pairs after filtering
Counting words...
Counted words:
fra 4601
eng 2991
['je suis fidele', 'i m faithful']


In [44]:
class EncoderRNN(nn.Module):
    def __init__(self, nInput, nHidden, dropoutP=0.1):
        super(EncoderRNN, self).__init__()
        self.nHidden = nHidden

        self.emb = nn.Embedding(nInput, nHidden)
        self.gru = nn.GRU(nHidden, nHidden, batch_first=True)
        self.dropout = nn.Dropout(dropoutP)

    
    def forward(self, x):
        emb = self.dropout(self.emb(x))
        x, hidden = self.gru(emb)
        return x, hidden
    

In [45]:
class DecoderRNN(nn.Module):
    def __init__(self, nHidden, nOutput):
        super(DecoderRNN, self).__init__()
        self.emb = nn.Embedding(nOutput, nHidden)
        self.gru = nn.GRU(nHidden, nHidden, batch_first=True)
        self.out = nn.Linear(nHidden, nOutput)
    
    def forward(self, encOutput, encHidden, targtTensor=None):
        batchSiz = encOutput.size(0)
        decodInput = torch.empty(batchSiz, 1, dtype=torch.long, device=device).fill_(S)
        
        decodHidden = encHidden
        decodOutputs = []

        for i in range(MAX_LEN):
            decodOutput, decodHidden = self.forwardStep(decodInput, decodHidden)
            decodOutputs.append(decodOutput)

            if targtTensor is not None:
                decodInput = targtTensor[:, i].unsqueeze(1)
            
            else:
                _, topi = decodOutput.topk(i)
                decodInput = topi.squeeze(-1).detach()
        
        decodOutputs = torch.cat(decodOutputs, dim=1)
        decodOutputs = F.log_softmax(decodOutputs, dim=1)
        
        return decodOutputs, decodHidden, None 

    def forwardStep(self, x, hidden):
        x = self.emb(x)
        x = F.relu(x)
        x, hidden = self.gru(x, hidden)
        x = self.out(x)
        return x, hidden
    



In [None]:
def train(inputTensor, targtTensor, enc, decod, encOptim, decodOptim, criterion, maxLen=MAX_LEN, teacherForceRatio=0.5):
    encOptim.zero_grad()
    decodOptim.zero_grad()

    loss = 0
        
    inputTensor = inputTensor.to(device)
    targtTensor = targtTensor.to(device)

    encHidden = enc.init_hidden(batch_size=inputTensor.size(0))
    encOutput, encHidden = enc(inputTensor, encHidden)

    decodHidden = encHidden
    decodInput = torch.tensor([[S]] * inputTensor.size(0), device=device)

    useTeacherForcing = True if torch.randn(1).item() < teacherForceRatio else False

    if useTeacherForcing:
        for i in range(targtTensor.size(1)):
            decodOutput, decodHidden = decod(decodInput, decodHidden)
            loss += criterion(decodOutput, targtTensor[:, i])
            decodInput = targtTensor[:, i].unsqueeze(1)
    
    else:
        for i in range(targtTensor.size(1)):
            decodOutput, decodHidden = decod(decodInput, decodHidden)
            loss += criterion(decodOutput, targtTensor[:, 1])

            _, topi = decodInput.topk(1)
            decodOutput = topi.squeeze(-1).detach()
    
    loss.backward()

    encOptim.step()
    decodOptim.step()

    return loss.item() / targtTensor.size(1)





In [49]:
inputSiz = inputLang.nwords
outputSiz = outputLang.nwords

hiddenSiz = 256
teacherForcingRate = 0.5
maxSteps = 1000
lr = 0.01

# Initialize models, optimizers, and loss function
enc = EncoderRNN(inputSiz, hiddenSiz).to(device)
decod = DecoderRNN(hiddenSiz, outputSiz).to(device)

encOptim = optim.SGD(enc.parameters(), lr=lr)
decodOptim = optim.SGD(decod.parameters(), lr=lr)
criterion = nn.NLLLoss() # Negative Log Likelihood Loss

for i in range(maxSteps):
    inputTensor, targtTensor = random.choice(pairs)
    loss = train(inputTensor, targtTensor, enc, decod, encOptim, decodOptim, criterion)

    if i % 100 == 0:
        print(f"Epoch {i}/{maxSteps}, Loss: {loss:.4f}")
    break


AttributeError: 'str' object has no attribute 'to'