In [1]:
# Put kaggle.json (presumably the Kaggle API credentials file — it must already
# be in the working directory) where the kaggle CLI looks for it, then restrict
# it to owner read/write only.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [80]:
# Fetch the kaushal2896/english-to-german dataset archive with the Kaggle CLI.
# As the recorded output shows, an existing local copy is kept unless --force is given.
!kaggle datasets download -d kaushal2896/english-to-german

Dataset URL: https://www.kaggle.com/datasets/kaushal2896/english-to-german
License(s): unknown
english-to-german.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
from zipfile import ZipFile
# The archive is referenced relative to the working directory (where the
# download cell is run) instead of the Colab-only absolute /content path, so
# the notebook also works outside Colab.
path="english-to-german.zip"
# extractall() without arguments unpacks into the working directory; the next
# cell opens deu.txt from there.
with ZipFile(path,'r') as archive:
  archive.extractall()

In [4]:
# Load the raw corpus, one entry per line. The encoding is given explicitly:
# the German text contains non-ASCII characters (e.g. "Grüß"), and without it
# open() falls back to the platform default, which is not UTF-8 everywhere.
with open('deu.txt','r',encoding='utf-8') as deu_file:
  files=deu_file.read().split('\n')

In [5]:
# Break every raw line into its tab-separated fields, replacing the contents of
# the existing list object in place.
files[:]=[raw_line.split('\t') for raw_line in files]

In [6]:
data=[]

In [7]:
files[0][1]

'Geh.'

In [8]:
# Keep only the first 221533 rows.
# NOTE(review): presumably this exists to drop the trailing entry that
# split('\n') yields for a newline-terminated file (it would have no second
# field and break the files[i][1] access below) — confirm. The count is tied to
# this exact dataset version; filtering on len(row) would be more robust.
files=files[:221533]

In [9]:
# Collect [english, german] pairs from the first two fields of every row.
# The list is rebuilt from scratch here rather than appended to a list created
# in another cell, so re-running this cell cannot duplicate the rows.
data=[[row[0],row[1]] for row in files]

In [10]:
data[0]

['Go.', 'Geh.']

In [11]:
import pandas as pd

In [12]:
# Two-column frame of sentence pairs: first field in 'english', second in 'german'.
table=pd.DataFrame(data,columns=['english','german'])

In [13]:
table.head()

Unnamed: 0,english,german
0,Go.,Geh.
1,Hi.,Hallo!
2,Hi.,Grüß Gott!
3,Run!,Lauf!
4,Run.,Lauf!


In [14]:
len(table)

221533

In [15]:
# Cap the corpus at 250000 pairs. The recorded outputs show len(table) == 221533
# both before and after this cell, so with this dataset the cap changes nothing.
table=table.head(250000)

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [17]:
len(table)

221533

In [18]:
# Wrap every German target in '<start> ... <end>' so the decoder has explicit
# begin/stop tokens. Rows that already carry both markers are left untouched,
# which keeps the cell idempotent: re-running it no longer produces
# '<start> <start> ... <end> <end>'.
table['german']=table['german'].apply(
  lambda x: x if x.startswith('<start> ') and x.endswith(' <end>') else '<start> '+x+' <end>'
)

In [19]:
table.head()

Unnamed: 0,english,german
0,Go.,<start> Geh. <end>
1,Hi.,<start> Hallo! <end>
2,Hi.,<start> Grüß Gott! <end>
3,Run!,<start> Lauf! <end>
4,Run.,<start> Lauf! <end>


In [20]:
# Characters stripped before tokenizing. '<' and '>' are deliberately not in the
# set, so the '<start>' / '<end>' markers survive as whole tokens.
token_filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~'
# One tokenizer per language, both using the same filter set.
eng_token=Tokenizer(filters=token_filters)
ger_token=Tokenizer(filters=token_filters)

In [21]:
# Build each language's vocabulary from its own column of the table.
eng_token.fit_on_texts(table['english'])
ger_token.fit_on_texts(table['german'])

In [22]:
# Convert every sentence into a list of integer token ids using the fitted vocabularies.
eng_seq=eng_token.texts_to_sequences(table['english'])
ger_seq=ger_token.texts_to_sequences(table['german'])

In [23]:
# Longest encoded sentence per language; these become the padded widths.
max_eng=max(map(len,eng_seq))
max_ger=max(map(len,ger_seq))

In [24]:
# Pad every sequence at the end ('post') up to the longest sentence of its
# language, giving rectangular arrays (shapes shown in the next cell's output).
eng_pad_seq=pad_sequences(eng_seq,maxlen=max_eng,padding='post')
ger_pad_seq=pad_sequences(ger_seq,maxlen=max_ger,padding='post')

In [25]:
eng_pad_seq.shape,ger_pad_seq.shape

((221533, 101), (221533, 77))

In [26]:
# Encoder input: the padded English token ids, used as-is.
X1=eng_pad_seq

In [27]:
# Decoder input: the German ids without their last timestep, with a trailing
# feature axis of size 1 added.
X2=ger_pad_seq[:,:-1,None]

In [28]:
# Training target: the German ids shifted one step ahead of the decoder input
# (first timestep dropped), with a trailing axis of size 1 added.
y=ger_pad_seq[:,1:,None]

In [29]:
X1.shape,X2.shape,y.shape

((221533, 101), (221533, 76, 1), (221533, 76, 1))

In [30]:
from keras.layers import LSTM,Embedding,Input,Dense
from keras import Model

In [31]:
# --- Encoder: English token ids -> final LSTM state --------------------------
encoder_input=Input(shape=(None,))
# Vocabulary size is len(word_index)+1 to leave room for index 0, the padding
# value, which mask_zero=True tells downstream layers to ignore.
encoder_embedding=Embedding(len(eng_token.word_index)+1,200,mask_zero=True)
encoder_embed=encoder_embedding(encoder_input)
encoder_lstm=LSTM(256,return_state=True)
# The per-step output is discarded; only the final hidden and cell states are
# handed to the decoder.
_,enc_h,enc_c=encoder_lstm(encoder_embed)
states=[enc_h,enc_c]

# --- Decoder: previous German token -> distribution over the next token ------
# NOTE(review): each decoder timestep is a single raw token id (feature size 1)
# fed straight into the LSTM; there is no embedding or masking on this side.
decoder_input=Input(shape=(None,1))
decoder_lstm=LSTM(256,return_state=True,return_sequences=True)
# Start decoding from the encoder's final state; the returned states are only
# needed at inference time and are ignored here.
value,_,_=decoder_lstm(decoder_input,initial_state=states)
decoder_dense=Dense(len(ger_token.word_index)+1,activation='softmax')
decoder_output=decoder_dense(value)

# Training model: (english ids, shifted german ids) -> per-step token probabilities.
model=Model([encoder_input,decoder_input],decoder_output)

# Keyword arguments on purpose: the position of `metrics` in compile() is not
# the same in every Keras release, so passing it positionally can silently bind
# ['accuracy'] to a different parameter.
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [32]:
# Train for 5 epochs, holding out 10% of the pairs for validation.
# NOTE(review): Keras documents validation_split as taking the held-out samples
# from the end of the arrays; the table preview suggests the corpus is ordered
# from short to long sentences, so the validation set is presumably the longest
# ones — confirm, and consider shuffling beforehand.
model.fit([X1,X2],y,epochs=5,validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7bf900721900>

In [33]:
# Inference-time encoder: maps English token ids to the [hidden, cell] state
# pair, reusing the layers (and therefore the weights) of the training graph.
encoder_model=Model(encoder_input,states)

In [34]:
# Inference-time decoder. Unlike the training graph, the initial LSTM state is
# an explicit model input, so decoding can be driven one step at a time by
# feeding the returned state back in.
# 256 must match the number of units of decoder_lstm.
input_states=[Input(shape=(256,)),Input(shape=(256,))]

# Despite the name, `logits` holds the LSTM output sequence; `out` is the
# softmax distribution produced by the shared decoder_dense layer.
logits,h2,c2=decoder_lstm(decoder_input,initial_state=input_states)
out=decoder_dense(logits)
new_states=[h2,c2]

# Inputs: [previous token, hidden, cell] -> outputs: [token probabilities, hidden, cell].
decoder_model=Model([decoder_input]+input_states,[out]+new_states)

In [35]:
import numpy as np

In [53]:
def translator(sentence):
  """Translate an English sentence to German by greedy decoding.

  The sentence is tokenized and padded like the training data, encoded into an
  LSTM state, and the decoder is then run one step at a time, always feeding
  back the most probable token, until it emits '<end>' (or the padding index)
  or max_ger tokens have been produced.

  Args:
    sentence: English text as a single string.

  Returns:
    The predicted German tokens joined by single spaces, without the
    '<start>'/'<end>' markers.
  """
  # Encode the source exactly as during training (same tokenizer, same width).
  source=eng_token.texts_to_sequences([sentence])
  source=pad_sequences(source,maxlen=max_eng,padding='post')
  # list(...) so the states can be concatenated with [token] below regardless of
  # whether predict hands back a list or a tuple. verbose=0 silences the
  # progress output that predict would otherwise print.
  sent_states=list(encoder_model.predict(source,verbose=0))

  # The decoder consumes one raw token id per step, shaped (batch, time, 1).
  token=np.zeros((1,1,1))
  token[0,0,0]=ger_token.word_index["<start>"]

  translate=[]
  # max_ger bounds the number of decoding steps so a model that never predicts
  # '<end>' cannot loop forever.
  for _ in range(max_ger):
    trans,hid,cell=decoder_model.predict([token]+sent_states,verbose=0)
    ob_idx=int(np.argmax(trans[0,0,:]))
    # Index 0 is the padding value and has no vocabulary entry; a plain
    # index_word[ob_idx] lookup would raise KeyError, so treat it like '<end>'.
    ob_word=ger_token.index_word.get(ob_idx)
    if ob_word is None or ob_word=='<end>':
      break
    translate.append(ob_word)
    # Feed the prediction and the updated state back in for the next step.
    token[0,0,0]=ob_idx
    sent_states=[hid,cell]
  # The stop token is never appended, so nothing has to be sliced off here
  # (slicing unconditionally lost a real word when the length cap was hit).
  return " ".join(translate)

In [55]:
table.head(100000)

Unnamed: 0,english,german
0,Go.,<start> Geh. <end>
1,Hi.,<start> Hallo! <end>
2,Hi.,<start> Grüß Gott! <end>
3,Run!,<start> Lauf! <end>
4,Run.,<start> Lauf! <end>
...,...,...
99995,This isn't going to be good.,<start> Das wird nicht gut sein. <end>
99996,This job is too much for me.,<start> Diese Arbeit ist zu viel für mich. <end>
99997,This job is too much for me.,<start> Diese Arbeit überfordert mich. <end>
99998,This job is too much for me.,<start> Diese Aufgabe überfordert mich. <end>


In [39]:
import pickle

In [40]:
# Persist the inference models and both tokenizers. Each file is opened in a
# `with` block so the handle is flushed and closed right after the dump instead
# of being left open until garbage collection.
# NOTE(review): pickling Keras models depends on the installed Keras version;
# model.save()/load_model is the documented persistence route — consider it.
for artifact,artifact_path in (
  (encoder_model,'enger_encoder.pkl'),
  (decoder_model,'enger_decoder.pkl'),
  (eng_token,'enger_engtok.pkl'),
  (ger_token,'enger_gertok.pkl'),
):
  with open(artifact_path,'wb') as artifact_file:
    pickle.dump(artifact,artifact_file)

In [64]:
# NOTE: pickle stores a function by reference (module and name), not its code
# or the globals it uses. Loading this file only works in a process where a
# `translator` function is already defined under the same module path, and the
# models/tokenizers it reads must be restored separately.
with open('enger_translator.pkl','wb') as translator_file:
  pickle.dump(translator,translator_file)

In [56]:
translator('Go')



'geh'

In [57]:
translator('run')



'lauf'

In [58]:
translator('Hi')



'hallo'

In [59]:
translator('Tom was crying.')



'tom weinte'

In [60]:
translator('Tom was fuming.')



'tom war wütend'

In [61]:
translator("This isn't going to be good")



'das wird nicht gut sein'

In [62]:
translator("This knife doesn't cut well.")



'dieses messer schneidet nicht gut'

In [63]:
translator("how are you?")



'wie geht es'

In [70]:
translator('What is your name?')



'wie heißt ihr name'

In [71]:
translator('Where are you from?')



'woher kommst du'

In [72]:
translator('How old are you?')



'wie alt bist du'

In [73]:
translator("Where can I find a good restaurant?")



'wo kann ich ein gutes restaurant restaurant'

In [74]:
translator("Today is a beautiful day.")



'heute ist ein schöner tag'

In [75]:
translator("What time is it?")



'wie spät ist es'

In [76]:
translator("I need directions to the nearest train station.")



'ich brauche an der nächsten des bahnhof bahnhof'

In [77]:
translator("Can you help me with this?")



'kannst du mir dabei helfen'

In [78]:
translator("I love learning languages.")



'ich lerne gerne sprachen'

In [81]:
translator("What is your favorite food?")



'was ist dein lieblingsessen'

In [82]:
translator('Could you please pass me the salt?')



'könnten du mir bitte das salz reichen'

In [83]:
translator('Are you coming to the party tonight?')



'kommst du heute abend zu der nacht'

In [85]:
translator("Can you teach me how to cook?")



'kannst du mir das kochen beibringen'