<a href="https://colab.research.google.com/github/Deepshika-286/language-translator/blob/main/translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas
import re
import string
import pickle
from pickle import load,dump
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle
from unicodedata import normalize
from tensorflow.keras.utils import pad_sequences, to_categorical, plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, RepeatVector, TimeDistributed, Dense






In [None]:
# load doc into memory
def load_doc(filename):
 # open the file as read only
 file = open(filename, mode='rt', encoding='utf-8')
 # read all text
 text = file.read()
 # close the file
 file.close()
 return text

In [None]:
# split a loaded document into sentences
def to_pairs(doc):
 lines = doc.strip().split('\n')
 pairs = [line.split('\t') for line in  lines]
 return pairs

In [None]:
# clean a list of lines
def clean_pairs(lines):
  cleaned = list()
 # prepare regex for char filtering
  re_print = re.compile('[^%s]' % re.escape(string.printable))
 # prepare translation table for removing punctuation
  table = str.maketrans('', '', string.punctuation)
  for pair in lines:
    clean_pair = list()
    for line in pair:
 # normalize unicode characters
      line = normalize('NFD', line).encode('ascii', 'ignore')
      line = line.decode('UTF-8')
 # tokenize on white space
      line = line.split()
 # convert to lowercase
      line = [word.lower() for word in line]
 # remove punctuation from each token
      line = [word.translate(table) for word in line]
 # remove non-printable chars form each token
      line = [re_print.sub('', w) for w in line]
 # remove tokens with numbers in them
      line = [word for word in line if word.isalpha()]
 # store as string
      clean_pair.append(' '.join(line))
      cleaned.append(clean_pair)
  return array(cleaned)

In [None]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
 dump(sentences, open(filename, 'wb'))
 print('Saved: %s' % filename)

# load dataset
filename = 'deu.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-german.pkl')
# spot check
for i in range(200):
 print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-german.pkl
[hi] => [hallo]
[hi] => [hallo]
[hi] => [gru gott]
[hi] => [gru gott]
[run] => [lauf]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[wow] => [donnerwetter]
[fire] => [feuer]
[fire] => [feuer]
[help] => [hilfe]
[help] => [hilfe]
[help] => [zu hulf]
[help] => [zu hulf]
[stop] => [stopp]
[stop] => [stopp]
[wait] => [warte]
[wait] => [warte]
[hello] => [hallo]
[hello] => [hallo]
[i try] => [ich probiere es]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[i won] => [ich habe gewonnen]
[smile] => [lacheln]
[smile] => [lacheln]
[cheers] => [zum wohl]
[cheers] => [zum wohl]
[freeze] => [keine bewegung]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[freeze] => [stehenbleiben]
[got it] => [verstanden]
[got it] => [verstanden]
[got it] => [einverstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran] => [er rannte]
[he ran] => [er lief]


In [None]:
# load a clean dataset
def load_clean_sentences(filename):
 return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
 dump(sentences, open(filename, 'wb'))
 print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-german.pkl')

# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [None]:
# fit a tokenizer
def create_tokenizer(lines):
 tokenizer = Tokenizer()
 tokenizer.fit_on_texts(lines)
 return tokenizer

In [None]:
# max sentence length
def max_length(lines):
 return max(len(line.split()) for line in lines)

In [None]:
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

English Vocabulary Size: 1466
English Max Length: 5
German Vocabulary Size: 2384
German Max Length: 7


In [None]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
 # integer encode sequences
 X = tokenizer.texts_to_sequences(lines)
 # pad sequences with 0 values
 X = pad_sequences(X, maxlen=length, padding='post')
 return X

In [None]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
 ylist = list()
 for sequence in sequences:
  encoded = to_categorical(sequence, num_classes=vocab_size)
  ylist.append(encoded)
 y = array(ylist)
 y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
 return y

In [None]:
# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

In [None]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
 model = Sequential()
 model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
 model.add(LSTM(n_units))
 model.add(RepeatVector(tar_timesteps))
 model.add(LSTM(n_units, return_sequences=True))
 model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
 return model

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])
# summarize defined model
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 7, 256)            610304    
                                                                 
 lstm_2 (LSTM)               (None, 256)               525312    
                                                                 
 repeat_vector_1 (RepeatVec  (None, 5, 256)            0         
 tor)                                                            
                                                                 
 lstm_3 (LSTM)               (None, 5, 256)            525312    
                                                                 
 time_distributed_1 (TimeDi  (None, 5, 1466)           376762    
 stributed)                                                      
                                                                 
Total params: 2037690 (7.77 MB)
Trainable params: 2037

In [None]:
filename = 'model.h5'
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), verbose=2)

Epoch 1/30


141/141 - 13s - loss: 3.9086 - accuracy: 0.4673 - val_loss: 3.2060 - val_accuracy: 0.4794 - 13s/epoch - 96ms/step
Epoch 2/30
141/141 - 6s - loss: 3.0532 - accuracy: 0.4967 - val_loss: 2.9942 - val_accuracy: 0.5026 - 6s/epoch - 43ms/step
Epoch 3/30
141/141 - 7s - loss: 2.8880 - accuracy: 0.5128 - val_loss: 2.8699 - val_accuracy: 0.5138 - 7s/epoch - 47ms/step
Epoch 4/30
141/141 - 7s - loss: 2.7180 - accuracy: 0.5238 - val_loss: 2.7143 - val_accuracy: 0.5234 - 7s/epoch - 46ms/step
Epoch 5/30
141/141 - 7s - loss: 2.5484 - accuracy: 0.5412 - val_loss: 2.5934 - val_accuracy: 0.5400 - 7s/epoch - 47ms/step
Epoch 6/30
141/141 - 7s - loss: 2.3835 - accuracy: 0.5640 - val_loss: 2.4166 - val_accuracy: 0.5700 - 7s/epoch - 46ms/step
Epoch 7/30
141/141 - 7s - loss: 2.2037 - accuracy: 0.5907 - val_loss: 2.2608 - val_accuracy: 0.5868 - 7s/epoch - 47ms/step
Epoch 8/30
141/141 - 6s - loss: 2.0121 - accuracy: 0.6168 - val_loss: 2.0828 - val_accuracy: 0.6212 - 6s/epoch - 46ms/step
Epoch 9/30
1

<keras.src.callbacks.History at 0x1f33020eaf0>

In [None]:
# map an integer to a word
def word_for_id(integer, tokenizer):
 for word, index in tokenizer.word_index.items():
  if index == integer:
    return word
 return None

In [None]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
 prediction = model.predict(source, verbose=0)[0]
 integers = [argmax(vector) for vector in prediction]
 target = list()
 for i in integers:
  word = word_for_id(i, tokenizer)
  if word is None:
    break
  target.append(word)
 return ' '.join(target)
# evaluate the skill of the model


In [None]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
 actual, predicted = list(), list()
 for i, source in enumerate(sources):
 # translate encoded source text
  source = source.reshape((1, source.shape[0]))
  translation = predict_sequence(model, eng_tokenizer, source)
  raw_target, raw_src = raw_dataset[i]
  if i <= 10:
    print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
  else:
    break;
  actual.append([raw_target.split()])
  predicted.append(translation.split())

In [None]:
-evaluate_model(model, eng_tokenizer, trainX, train)

src=[ich arbeite viel],    target=[i work a lot],     predicted=[i know a lot]
src=[er wird gehasst],    target=[he is hated],     predicted=[he is hated]
src=[sie haben gewonnen],    target=[they won],     predicted=[youve won]
src=[sie sind nicht nett],    target=[youre unkind],     predicted=[youre unkind]
src=[ich habe sie gerettet],    target=[i saved you],     predicted=[i saved you]
src=[bleiben sie ruhig],    target=[stay calm],     predicted=[stay calm]
src=[ich habe es verdient],    target=[i deserved it],     predicted=[i deserved it]
src=[tom druckt sich vage aus],    target=[tom is vague],     predicted=[tom is vague]
src=[geh wieder rein],    target=[go back inside],     predicted=[go back inside]
src=[ich liebe dich],    target=[i love you],     predicted=[i love you]
src=[du bist launisch],    target=[youre moody],     predicted=[youre moody]


In [None]:
evaluate_model(model, eng_tokenizer, testX, test)

src=[bis dann], target=[see you then], predicted=[see you then]
src=[sie sind frei], target=[theyre free], predicted=[theyre free]
src=[es ist zu schwierig], target=[its too hard], predicted=[its too hard]
src=[tom hat es versucht], target=[tom tried], predicted=[tom tried]
src=[sie mogeln], target=[they cheat], predicted=[they escaped]
src=[sie sind reich], target=[you are rich], predicted=[you are rich]
src=[ist tom krank], target=[is tom sick], predicted=[is tom sick]
src=[die liebe bleibt], target=[love lasts], predicted=[love lasts]
src=[er konnte es machen], target=[he could do it], predicted=[he could do it]
src=[es ist zu hei], target=[it is too hot], predicted=[its too hot]
src=[wir werden es versuchen], target=[well try], predicted=[well see]
