<a href="https://colab.research.google.com/github/Aniket23160/Language-Translator-German-English-/blob/main/Language_translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

# load doc into memory
def load_doc(filename):
	"""Read an entire UTF-8 text file into memory.

	Args:
		filename: Path of the text file to read.

	Returns:
		The full contents of the file as a single string.

	Raises:
		OSError: If the file cannot be opened.
		UnicodeDecodeError: If the contents are not valid UTF-8.
	"""
	# the context manager closes the handle even when read() raises
	with open(filename, mode='rt', encoding='utf-8') as file:
		return file.read()

# split a loaded document into sentences
def to_pairs(doc):
	"""Break a tab-separated document into one list of fields per line.

	Leading and trailing whitespace of the whole document is stripped first;
	every remaining line is then split on tab characters.

	Args:
		doc: The document text.

	Returns:
		A list with one entry per line, each entry being the list of
		tab-separated fields on that line.
	"""
	return [row.split('\t') for row in doc.strip().split('\n')]

# clean a list of lines
def clean_pairs(lines):
	"""Normalise every sentence in a collection of sentence pairs.

	Each sentence is reduced to ASCII (accents decomposed, everything else
	non-ASCII dropped), split on whitespace, lower-cased, stripped of
	punctuation and non-printable characters, and filtered down to purely
	alphabetic tokens before being re-joined with single spaces.

	Args:
		lines: Iterable of pairs, each pair an iterable of sentence strings.

	Returns:
		A numpy array built from the list of cleaned pairs.
	"""
	# matches any character that is not in string.printable
	non_printable = re.compile('[^%s]' % re.escape(string.printable))
	# translation table that deletes every punctuation character
	strip_punct = str.maketrans('', '', string.punctuation)

	def clean_sentence(sentence):
		# decompose accented characters, then drop whatever is not ASCII
		ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
		tokens = []
		for raw in ascii_text.split():
			token = non_printable.sub('', raw.lower().translate(strip_punct))
			# keep alphabetic tokens only (drops numbers and emptied tokens)
			if token.isalpha():
				tokens.append(token)
		return ' '.join(tokens)

	return array([[clean_sentence(sentence) for sentence in pair] for pair in lines])

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	"""Pickle ``sentences`` to ``filename`` and print a confirmation line.

	Args:
		sentences: The object to serialise.
		filename: Destination path; the file is opened in binary write mode,
			so an existing file is overwritten.

	Raises:
		OSError: If the file cannot be opened for writing.
	"""
	# the context manager flushes and closes the handle, even if dump() raises
	with open(filename, 'wb') as handle:
		dump(sentences, handle)
	print('Saved: %s' % filename)

# load dataset
filename = '/content/drive/My Drive/Language Translator/deu.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
# NOTE: the result gets its own name so the clean_pairs() function is not
# overwritten by its return value and stays callable afterwards
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, '/content/drive/My Drive/Language Translator/english-german.pkl')
# spot check the first pairs (at most 100, fewer when the dataset is smaller)
for i in range(min(100, len(cleaned_pairs))):
	print('[%s] => [%s]' % (cleaned_pairs[i,0], cleaned_pairs[i,1]))

Saved: /content/drive/My Drive/Language Translator/english-german.pkl
[go] => [geh]
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]
[wait] => [warte]
[begin] => [fang an]
[go on] => [mach weiter]
[hello] => [hallo]
[hurry] => [beeil dich]
[hurry] => [schnell]
[i ran] => [ich rannte]
[i see] => [ich verstehe]
[i see] => [aha]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[relax] => [entspann dich]
[shoot] => [feuer]
[shoot] => [schie]
[smile] => [lacheln]
[attack] => [angriff]
[attack] => [attacke]
[cheers] => [zum wohl]
[eat it] => [iss es]
[eat up] => [iss auf]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[got it] => [verstanden]
[got it] => [aha]
[got it] => [ich habs]
[got it] => [kapiert]
[got it] => [verstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran]

In [None]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
	"""Load a pickled dataset from ``filename``.

	Args:
		filename: Path of the pickle file to read.

	Returns:
		Whatever object was pickled into the file.

	Raises:
		OSError: If the file cannot be opened.
	"""
	# NOTE: unpickling can execute arbitrary code; only load trusted files
	with open(filename, 'rb') as handle:
		return load(handle)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	"""Pickle ``sentences`` to ``filename`` and print a confirmation line.

	Args:
		sentences: The object to serialise.
		filename: Destination path; the file is opened in binary write mode,
			so an existing file is overwritten.

	Raises:
		OSError: If the file cannot be opened for writing.
	"""
	# the context manager flushes and closes the handle, even if dump() raises
	with open(filename, 'wb') as handle:
		dump(sentences, handle)
	print('Saved: %s' % filename)

# load the cleaned sentence pairs from the pickle file
raw_dataset = load_clean_sentences('/content/drive/My Drive/Language Translator/english-german.pkl')

# reduce dataset size: keep only the first n_sentences rows
# (assumes raw_dataset is a 2-D array of [english, german] rows - TODO confirm)
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle of the rows, in place (unseeded, so the split differs per run)
shuffle(dataset)
# split into train/test: first 9000 rows for training, the rest for testing
train, test = dataset[:9000], dataset[9000:]
# save the combined set and both splits as separate pickle files
save_clean_data(dataset, '/content/drive/My Drive/Language Translator/english-german-both.pkl')
save_clean_data(train, '/content/drive/My Drive/Language Translator/english-german-train.pkl')
save_clean_data(test, '/content/drive/My Drive/Language Translator/english-german-test.pkl')

Saved: /content/drive/My Drive/Language Translator/english-german-both.pkl
Saved: /content/drive/My Drive/Language Translator/english-german-train.pkl
Saved: /content/drive/My Drive/Language Translator/english-german-test.pkl


In [None]:
import nltk
from keras.preprocessing.text import Tokenizer

In [None]:
import nltk
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# load a clean dataset
def load_clean_sentences(filename):
	"""Load a pickled dataset from ``filename``.

	Args:
		filename: Path of the pickle file to read.

	Returns:
		Whatever object was pickled into the file.

	Raises:
		OSError: If the file cannot be opened.
	"""
	# NOTE: unpickling can execute arbitrary code; only load trusted files
	with open(filename, 'rb') as handle:
		return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
	"""Build a Tokenizer and fit it on the given texts.

	Args:
		lines: The texts handed to ``Tokenizer.fit_on_texts``.

	Returns:
		The fitted Tokenizer instance.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

# max sentence length
def max_length(lines):
	"""Return the largest whitespace-token count among ``lines``.

	Args:
		lines: Iterable of sentence strings.

	Returns:
		The number of whitespace-separated tokens in the longest sentence,
		or 0 when ``lines`` is empty.
	"""
	# default=0 avoids the ValueError max() raises on an empty iterable
	return max((len(line.split()) for line in lines), default=0)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Integer-encode texts and pad them to a common length.

	Args:
		tokenizer: Object providing ``texts_to_sequences``.
		length: Value passed to ``pad_sequences`` as ``maxlen``.
		lines: The texts to encode.

	Returns:
		The result of ``pad_sequences`` with ``padding='post'``.
	"""
	# map every text to its list of integer word ids
	encoded = tokenizer.texts_to_sequences(lines)
	# pad at the end of each sequence, up to `length` entries
	return pad_sequences(encoded, maxlen=length, padding='post')

# one hot encode target sequence
def encode_output(sequences, vocab_size):
	"""One-hot encode every integer sequence of the target side.

	Args:
		sequences: 2-D array of integer ids; its first two ``shape`` entries
			are used to shape the result.
		vocab_size: Number of classes for the one-hot encoding.

	Returns:
		An array reshaped to (sequences.shape[0], sequences.shape[1], vocab_size).
	"""
	one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
	n_samples, n_steps = sequences.shape[0], sequences.shape[1]
	return one_hot.reshape(n_samples, n_steps, vocab_size)

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	"""Assemble the (uncompiled) sequence-to-sequence translation model.

	The stack is: embedding over the source vocabulary (zero-masked), an
	LSTM that yields a single vector, that vector repeated once per target
	timestep, a sequence-returning LSTM, and a per-timestep softmax over the
	target vocabulary.

	Args:
		src_vocab: Size of the source vocabulary (embedding input dimension).
		tar_vocab: Size of the target vocabulary (softmax width).
		src_timesteps: Length of the source sequences.
		tar_timesteps: Length of the target sequences.
		n_units: Embedding width and number of units in both LSTM layers.

	Returns:
		The Sequential model.
	"""
	return Sequential([
		Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
		LSTM(n_units),
		RepeatVector(tar_timesteps),
		LSTM(n_units, return_sequences=True),
		TimeDistributed(Dense(tar_vocab, activation='softmax')),
	])

# load datasets
dataset = load_clean_sentences('/content/drive/My Drive/Language Translator/english-german-both.pkl')
train = load_clean_sentences('/content/drive/My Drive/Language Translator/english-german-train.pkl')
test = load_clean_sentences('/content/drive/My Drive/Language Translator/english-german-test.pkl')

# prepare english tokenizer (column 0 of the dataset)
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer (column 1 of the dataset)
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

# prepare training data: german is the model input, english the one-hot target
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data the same way
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# summarize defined model
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line to the output)
model.summary()
plot_model(model, to_file='/content/drive/My Drive/Language Translator/model.png', show_shapes=True)
# fit model, keeping only the weights with the lowest validation loss on disk
filename = '/content/drive/My Drive/Language Translator/model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

English Vocabulary Size: 2214
English Max Length: 5
German Vocabulary Size: 3526
German Max Length: 9
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 9, 256)            902656    
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 5, 2214)           568998    
Total params: 2,522,278
Trainable params: 2,522,278
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 9000 samples, validate on 1000 samples
Epoch 1/30
 - 26s - loss: 4.1494 - val_loss: 3.3696

Epoch 00001: val_loss improved from inf to 3.36957, saving model to /content/drive/My Drive/Language Translator/model.h5
Epoch 2/30
 - 25s - loss: 3.2307 - val_loss: 3.2336

Epoch 00002: val_loss improved from 3.36957 to 3.23362, saving model to /content/drive/My Drive/Language Translator/model.h5
Epoch 3/30
 - 25s - loss: 3.1002 - val_loss: 3.1763

Epoch 00003: val_loss improved from 3.23362 to 3.17635, saving model to /content/drive/My Drive/Language Translator/model.h5
Epoch 4/30
 - 25s - loss: 2.9728 - val_loss: 3.0357

Epoch 00004: val_loss improved from 3.17635 to 3.03574, saving model to /content/drive/My Drive/Language Translator/model.h5
Epoch 5/30
 - 25s - loss: 2.7893 - val_loss: 2.8877

Epoch 00005: val_loss improved from 3.03574 to 2.88766, saving model to /content/drive/My Drive/Language Translator/model.h5
Epoch 6/30
 - 25s - loss: 2.6144 - val_loss: 2.7712

Epoch 00006: 

<keras.callbacks.callbacks.History at 0x7f55f1097eb8>

In [None]:

...
# reload the combined dataset and the train/test splits from their pickles
dataset, train, test = [
	load_clean_sentences(path)
	for path in (
		'/content/drive/My Drive/Language Translator/english-german-both.pkl',
		'/content/drive/My Drive/Language Translator/english-german-train.pkl',
		'/content/drive/My Drive/Language Translator/english-german-test.pkl',
	)
]
# english side (column 0): tokenizer, longest sentence, vocabulary size
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_length = max_length(dataset[:, 0])
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
# german side (column 1): tokenizer, longest sentence, vocabulary size
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_length = max_length(dataset[:, 1])
ger_vocab_size = 1 + len(ger_tokenizer.word_index)
# encode the german sentences of both splits as padded model inputs
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])

In [None]:
# load the model stored at model.h5 (the path the training cell's
# ModelCheckpoint callback writes to)
from keras.models import load_model
model = load_model('/content/drive/My Drive/Language Translator/model.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [None]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
import ast

# load a clean dataset
def load_clean_sentences(filename):
	"""Load a pickled dataset from ``filename``.

	Args:
		filename: Path of the pickle file to read.

	Returns:
		Whatever object was pickled into the file.

	Raises:
		OSError: If the file cannot be opened.
	"""
	# NOTE: unpickling can execute arbitrary code; only load trusted files
	with open(filename, 'rb') as handle:
		return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
	"""Build a Tokenizer and fit it on the given texts.

	Args:
		lines: The texts handed to ``Tokenizer.fit_on_texts``.

	Returns:
		The fitted Tokenizer instance.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

# max sentence length
def max_length(lines):
	"""Return the largest whitespace-token count among ``lines``.

	Args:
		lines: Iterable of sentence strings.

	Returns:
		The number of whitespace-separated tokens in the longest sentence,
		or 0 when ``lines`` is empty.
	"""
	# default=0 avoids the ValueError max() raises on an empty iterable
	return max((len(line.split()) for line in lines), default=0)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Integer-encode texts and pad them to a common length.

	Args:
		tokenizer: Object providing ``texts_to_sequences``.
		length: Value passed to ``pad_sequences`` as ``maxlen``.
		lines: The texts to encode.

	Returns:
		The result of ``pad_sequences`` with ``padding='post'``.
	"""
	# map every text to its list of integer word ids
	encoded = tokenizer.texts_to_sequences(lines)
	# pad at the end of each sequence, up to `length` entries
	return pad_sequences(encoded, maxlen=length, padding='post')

# map an integer to a word
def word_for_id(integer, tokenizer):
	"""Look up the word whose index equals ``integer``.

	Args:
		integer: The index to search for.
		tokenizer: Object exposing a ``word_index`` mapping of word -> index.

	Returns:
		The first word in ``tokenizer.word_index`` mapped to ``integer``,
		or None when no word has that index.
	"""
	matches = (word for word, index in tokenizer.word_index.items() if index == integer)
	return next(matches, None)

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	"""Translate one encoded source sequence into a target sentence.

	Args:
		model: Object whose ``predict(source, verbose=0)`` returns a batch of
			per-timestep score vectors; only the first batch item is used.
		tokenizer: Object exposing a ``word_index`` mapping of word -> integer id.
		source: Encoded input passed straight through to ``model.predict``.

	Returns:
		The predicted words joined by single spaces. Decoding stops at the
		first predicted id that has no word in ``tokenizer.word_index``.
	"""
	prediction = model.predict(source, verbose=0)[0]
	# invert the vocabulary once instead of scanning it for every timestep;
	# setdefault keeps the first word seen for an id, as a linear scan would
	index_to_word = {}
	for word, index in tokenizer.word_index.items():
		index_to_word.setdefault(index, word)
	target = list()
	for vector in prediction:
		word = index_to_word.get(int(argmax(vector)))
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# # evaluate the skill of the model
# def evaluate_model(model, tokenizer, sources, raw_dataset):
# 	# print("a\n")
# 	actual, predicted = list(), list()
    
# 	for i, source in enumerate(sources):
# 		# print("a\n")
# 		# translate encoded source text
# 		source = source.reshape((1, source.shape[0]))
    
# 		translation = predict_sequence(model, eng_tokenizer, source)
# 		raw_target = raw_dataset[i]
# 		raw_src = raw_dataset[i]
	  
# 		if i < 10:
# 			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
# 		actual.append([raw_target])
# 		predicted.append(translation.split())
# 	# calculate BLEU score
# 	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
# 	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
# 	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
# 	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# # load datasets
# dataset = load_clean_sentences('/content/drive/My Drive/Language Translator/english-german-both.pkl')
# train = load_clean_sentences('/content/drive/My Drive/Language Translator/english-german-train.pkl')
# test = load_clean_sentences('/content/drive/My Drive/Language Translator/english-german-test.pkl')
# # prepare english tokenizer
# eng_tokenizer = create_tokenizer(dataset[:, 0])
# eng_vocab_size = len(eng_tokenizer.word_index) + 1
# eng_length = max_length(dataset[:, 0])
# # prepare german tokenizer
# ger_tokenizer = create_tokenizer(dataset[:, 1])
# ger_vocab_size = len(ger_tokenizer.word_index) + 1
# ger_length = max_length(dataset[:, 1])
# # prepare data
# trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
# testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# # load model
# model = load_model('/content/drive/My Drive/Language Translator/model.h5')
# # test on some training sequences
# print('train')
# evaluate_model(model, eng_tokenizer, trainX, train)
# # # test on some test sequences
# print('test')
# evaluate_model(model, eng_tokenizer, testX, test)

In [None]:

from numpy import argmax

# NOTE(review): hard-coded; matches the "German Max Length: 9" printed by the
# training cell - keep in sync if the dataset changes
ger_length = 9
testt = ['wir rufen dich an']
testxt = encode_sequences(ger_tokenizer, ger_length, testt)
print(testxt)
# Sequential.predict_classes() no longer exists in current Keras/TensorFlow;
# the argmax over the last (vocabulary) axis of predict() gives the same ids
preds = argmax(model.predict(testxt, verbose=0), axis=-1)


[[  8 354  20  30   0   0   0   0   0]]


In [None]:
def get_word(n, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``n``.

    Args:
        n: The index to search for.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first matching word, or None when no word has index ``n``.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == n)
    return next(matches, None)

preds_text = []
for row in preds:
    # look up every predicted id once; ids without a word come back as None
    words = [get_word(idx, eng_tokenizer) for idx in row]
    temp = []
    for pos, word in enumerate(words):
        repeated = pos > 0 and word == words[pos - 1]
        # unknown ids and immediate repeats of the previous prediction are
        # emitted as empty strings, everything else as the word itself
        temp.append('' if (word is None or repeated) else word)
    preds_text.append(' '.join(temp))
print(testt)
print(preds_text)
	    

['wir rufen dich an']
['well call you  ']


In [None]:

from numpy import argmax

# NOTE(review): hard-coded; matches the "German Max Length: 9" printed by the
# training cell - keep in sync if the dataset changes
ger_length = 9
# read one German sentence from the user
testt = [input()]
testxt = encode_sequences(ger_tokenizer, ger_length, testt)

# Sequential.predict_classes() no longer exists in current Keras/TensorFlow;
# the argmax over the last (vocabulary) axis of predict() gives the same ids
preds = argmax(model.predict(testxt, verbose=0), axis=-1)

def get_word(n, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``n``.

    Args:
        n: The index to search for.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first matching word, or None when no word has index ``n``.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == n)
    return next(matches, None)

preds_text = []
for row in preds:
    # look up every predicted id once; ids without a word come back as None
    words = [get_word(idx, eng_tokenizer) for idx in row]
    temp = []
    for pos, word in enumerate(words):
        repeated = pos > 0 and word == words[pos - 1]
        # unknown ids and immediate repeats of the previous prediction are
        # emitted as empty strings, everything else as the word itself
        temp.append('' if (word is None or repeated) else word)
    preds_text.append(' '.join(temp))
print("Your sentence: ", testt[0])
print("Translated Sentence: ",preds_text[0])
	    


wir rufen dich an
Your sentence:  wir rufen dich an
Translated Sentence:  well call you  
