In [1]:
import numpy as np
import pandas as pd
import tensorflow
import keras
from keras.models import Model
from keras.layers import Input, LSTM, Dense,TimeDistributed,Embedding,Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from string import digits
import re
import string

In [3]:
df = pd.read_csv("hindi_english_parallel.csv", on_bad_lines='skip')
df.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [4]:
df.shape

(128609, 2)

In [5]:
df.isna().sum()

hindi      3
english    1
dtype: int64

In [6]:
df.dropna(inplace=True)

In [7]:
df.isna().sum()

hindi      0
english    0
dtype: int64

### Preprocessing

In [8]:
# lowercasing
for col in df.columns:
    df[col] = df[col].apply(lambda x: str(x))
    df[col] = df[col].apply(lambda x: x.lower())
print("data converted to lowercase")
print("======================================")

# removing quotes
df['english'] = df['english'].apply(lambda x: re.sub("'", '', x))
df['hindi'] = df['hindi'].apply(lambda x: re.sub("'", '', x))
print("data is now free of any qoutes")
print("sample:", df["english"][0])
print("======================================")

# removing all the special characters
punc = set(string.punctuation)
punc.remove('?')
punc.remove("'")
print("fixing punctuation...")
print(punc)
df['english'] = df['english'].apply(lambda x: ''.join(y for y in x if y not in punc))
df['hindi'] = df['hindi'].apply(lambda x: ''.join(y for y in x if y not in punc))
print("all the special characters have been removed!")
print("======================================")

# removing digits
digits = "1234567890"
remove_digits = str.maketrans('','',digits)
print("removing digits...")
print(remove_digits)
df['english'] = df['english'].apply(lambda x: x.translate(remove_digits))
df['hindi'] = df['hindi'].apply(lambda x: x.translate(remove_digits))
df['hindi'] = df['hindi'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))
print("all the digits have been removed!")
print("======================================")

# removing extra spaces
df['english'] = df['english'].apply(lambda x: x.strip())
df['hindi'] = df['hindi'].apply(lambda x: x.strip())
df['english'] = df['english'].apply(lambda x: re.sub(" +", " ", x))
df['hindi'] = df['hindi'].apply(lambda x: re.sub(" +", " ", x))
print("extra spaces have been removed!")
print("======================================")

print("clean data:")
df.head(51)

data converted to lowercase
data is now free of any qoutes
sample: give your application an accessibility workout
fixing punctuation...
{'[', '<', '_', '{', '&', ':', '$', '`', '}', '*', ']', '+', '%', '=', '-', '/', '#', '>', '?', ')', '!', '(', ',', '\\', '.', '@', "'", '|', '~', '^', '"', ';'}
all the special characters have been removed!
removing digits...
{49: None, 50: None, 51: None, 52: None, 53: None, 54: None, 55: None, 56: None, 57: None, 48: None}
all the digits have been removed!
extra spaces have been removed!
clean data:


Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,accerciser accessibility explorer
2,निचले पटल के लिए डिफोल्ट प्लगइन खाका,the default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका,the default plugin layout for the top panel
4,उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष...,a list of plugins that are disabled by default
5,अवधि को हाइलाइट रकें,highlight duration
6,पहुंचनीय आसंधि नोड को चुनते समय हाइलाइट बक्से ...,the duration of the highlight box when selecti...
7,सीमांत बोर्डर के रंग को हाइलाइट करें,highlight border color
8,हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता।,the color and opacity of the highlight border
9,भराई के रंग को हाइलाइट करें,highlight fill color


### Adding start and end tokens to target sequences

In [9]:
df['hindi'] = df['hindi'].apply(lambda x : 'START_ '+ x + ' _END')
df.head()

Unnamed: 0,hindi,english
0,START_ अपने अनुप्रयोग को पहुंचनीयता व्यायाम का...,give your application an accessibility workout
1,START_ एक्सेर्साइसर पहुंचनीयता अन्वेषक _END,accerciser accessibility explorer
2,START_ निचले पटल के लिए डिफोल्ट प्लगइन खाका _END,the default plugin layout for the bottom panel
3,START_ ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका _END,the default plugin layout for the top panel
4,START_ उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप...,a list of plugins that are disabled by default


### Generating dictionaries

In [10]:
### Get English and Hindi Vocabulary
all_eng_words = set()
for eng in df['english']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_hin_words = set()
for hin in df['hindi']:
    for word in hin.split():
        if word not in all_hin_words:
            all_hin_words.add(word)

In [11]:
all_eng_words

{'libreville',
 'reaching',
 'eagerly',
 'completing',
 'toc',
 'non',
 'authorise',
 'sharing',
 'keyring',
 'program',
 'distribution',
 'data',
 'location',
 'lunch',
 'addresses',
 'redeal',
 'des',
 'spouse',
 'emitted',
 'expanded',
 'accessible',
 'highlight',
 'mage',
 'grant',
 'chagos',
 'pushing',
 'gjs',
 'genisoimage',
 'oberon',
 'autowrap',
 'tag…',
 'optimizations',
 '“ftp”',
 'once',
 'add…',
 'ypertext',
 'stored',
 'tree',
 'improve',
 'opaque',
 'exceeds',
 'nauru',
 'pixmap',
 'blinking',
 'sticky',
 'selecting',
 'dot',
 'unlocked',
 'udp',
 'prompted',
 'claim',
 'group…',
 'viewer',
 'loudmouth',
 'css',
 'licensed',
 'username',
 'attributes',
 'prefer',
 'novosibirsk',
 'revise',
 'cookies',
 'initial',
 'making',
 'fetching',
 'asking',
 'underlined',
 'early',
 'block',
 'db',
 'cast',
 'oaf',
 'similarly',
 'jacks',
 'whiteness',
 'variable…',
 'guayaquil',
 'resize',
 'unlink',
 'view',
 'highlights',
 'hotkey',
 'develop',
 'inch',
 'lassification',
 'cal

In [12]:
all_hin_words

{'बैठकnew',
 'पाएंगे',
 'toc',
 'चौदह',
 'स्केन',
 'भेंजेः',
 'समरूपता',
 'फ़ाइलनिर्देशिका',
 'patchdiff',
 'अभिव्यक्त',
 'लाएँः',
 'अमेरिकामिक्वेलन',
 'छोड़ना',
 'program',
 'अवयव',
 'राज्यः',
 'distribution',
 'चुनेंdisc',
 'सुविधा',
 'data',
 'अफ्रीकालुसाका',
 'location',
 'डेटा',
 'मिस्ट',
 'रोडेट',
 'सीमांकक',
 'बटनsu',
 'बार्डर',
 'मौज़ूद',
 'ध्वनि',
 'सदा',
 'समझ',
 'अफ्रीकालागोस',
 'पन्ने',
 'सताने',
 'विलुप्त',
 'परीक्षण',
 'कीजिए',
 'नेटवर्किंग',
 'पश्चिमी',
 'लाने',
 'राजा',
 'सर्वर',
 'मार्ग',
 'genisoimage',
 'आधारों',
 'oberon',
 '“ftp”',
 'पाएगा',
 'एशियाकाबुल',
 'सृजन',
 'अग्रसारित',
 'डेवलेपर',
 'मित्रlevel',
 'जायेगे',
 'हाँ',
 'सदस्योंः',
 'इमेलshow',
 'अछर',
 'applicablenot',
 'pixmap',
 'निवेदितः',
 'मूलाधार',
 'बातचीत',
 'udp',
 'ट्रैकिंग',
 'विन्यासः',
 'एटलांटिकसेंट',
 'ही',
 'पैकेजः',
 'मॉडल',
 'बोगोफिल्टर',
 'xmaccyrillic',
 'css',
 'licensed',
 'उदाहरणuser',
 'एस्लेरियाट',
 'पड़ना',
 'नियमानुसार',
 'डेमो',
 'अबnone',
 'डालने',
 'वाई',
 'बेमेल',
 'रिसाव',
 'जै

### Adding features to data

In [15]:
df['length_eng'] = df['english'].apply(lambda x:len(x.split(" ")))
df['length_hin'] = df['hindi'].apply(lambda x:len(x.split(" ")))

In [16]:
df.head()

Unnamed: 0,hindi,english,length_eng,length_hin
0,START_ अपने अनुप्रयोग को पहुंचनीयता व्यायाम का...,give your application an accessibility workout,6,10
1,START_ एक्सेर्साइसर पहुंचनीयता अन्वेषक _END,accerciser accessibility explorer,3,5
2,START_ निचले पटल के लिए डिफोल्ट प्लगइन खाका _END,the default plugin layout for the bottom panel,8,9
3,START_ ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका _END,the default plugin layout for the top panel,8,9
4,START_ उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप...,a list of plugins that are disabled by default,9,14


In [17]:
df.describe()

Unnamed: 0,length_eng,length_hin
count,128605.0,128605.0
mean,4.540679,7.054477
std,5.277197,5.814052
min,1.0,3.0
25%,2.0,4.0
50%,3.0,5.0
75%,6.0,8.0
max,134.0,117.0


In [18]:
print(df[df['length_eng']<=20].shape)
print(df.shape)

(126231, 4)
(128605, 4)


In [19]:
## Max length is 291 and 215 for eng and hindi and avg is 2,4.
df = df[df['length_eng']<=20]
df = df[df['length_hin']<=20]

In [20]:
max_length_src=max(df['length_hin'])
max_length_tar=max(df['length_eng'])
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hin_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hin_words)

In [21]:
num_decoder_tokens

8552

In [22]:
num_decoder_tokens += 1
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [23]:
input_token_index

{'a': 1,
 'aaiun': 2,
 'ababa': 3,
 'abbreviations': 4,
 'abbrevweekdayname': 5,
 'abc': 6,
 'abcdefghijk': 7,
 'abidjan': 8,
 'ability': 9,
 'able': 10,
 'abnormal': 11,
 'abort': 12,
 'aborted': 13,
 'aborting': 14,
 'about': 15,
 'about…': 16,
 'above': 17,
 'absolute': 18,
 'abstraction': 19,
 'ac': 20,
 'acce': 21,
 'accelerated': 22,
 'acceleration': 23,
 'accelerator': 24,
 'accelerator…': 25,
 'accentuated': 26,
 'accept': 27,
 'acceptable': 28,
 'accepted': 29,
 'accepting': 30,
 'accerciser': 31,
 'access': 32,
 'accessed': 33,
 'accessibility': 34,
 'accessible': 35,
 'accessibles': 36,
 'accessing': 37,
 'accidents': 38,
 'acco': 39,
 'according': 40,
 'accordion': 41,
 'account': 42,
 'accountapos': 43,
 'accounts': 44,
 'accra': 45,
 'accuracy': 46,
 'accurate': 47,
 'ace': 48,
 'aces': 49,
 'across': 50,
 'act': 51,
 'acti': 52,
 'acting': 53,
 'action': 54,
 'actionable': 55,
 'actions': 56,
 'actionscript': 57,
 'activatable': 58,
 'activate': 59,
 'activated': 60,
 'a

In [24]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())
reverse_input_char_index

{1: 'a',
 2: 'aaiun',
 3: 'ababa',
 4: 'abbreviations',
 5: 'abbrevweekdayname',
 6: 'abc',
 7: 'abcdefghijk',
 8: 'abidjan',
 9: 'ability',
 10: 'able',
 11: 'abnormal',
 12: 'abort',
 13: 'aborted',
 14: 'aborting',
 15: 'about',
 16: 'about…',
 17: 'above',
 18: 'absolute',
 19: 'abstraction',
 20: 'ac',
 21: 'acce',
 22: 'accelerated',
 23: 'acceleration',
 24: 'accelerator',
 25: 'accelerator…',
 26: 'accentuated',
 27: 'accept',
 28: 'acceptable',
 29: 'accepted',
 30: 'accepting',
 31: 'accerciser',
 32: 'access',
 33: 'accessed',
 34: 'accessibility',
 35: 'accessible',
 36: 'accessibles',
 37: 'accessing',
 38: 'accidents',
 39: 'acco',
 40: 'according',
 41: 'accordion',
 42: 'account',
 43: 'accountapos',
 44: 'accounts',
 45: 'accra',
 46: 'accuracy',
 47: 'accurate',
 48: 'ace',
 49: 'aces',
 50: 'across',
 51: 'act',
 52: 'acti',
 53: 'acting',
 54: 'action',
 55: 'actionable',
 56: 'actions',
 57: 'actionscript',
 58: 'activatable',
 59: 'activate',
 60: 'activated',
 61

### Splitting data

In [25]:
X = df['english']
y = df['hindi']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)
print("data has been split!")

data has been split!


In [26]:
encoder_input_data = np.zeros((2, max_length_src),dtype='float32')
decoder_input_data = np.zeros((2, max_length_tar),dtype='float32')
decoder_target_data = np.zeros((2, max_length_tar, num_decoder_tokens),dtype='float32')

In [27]:
def generate_batch(X = X_train, y = y_train, batch_size = 64):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [28]:
latent_dim = 300
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens+1, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [29]:
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens+1, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [30]:
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])


In [31]:
model.summary()
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 64
epochs = 32


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 300)            1726800   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 300)            2566200   ['input_2[0][0]']             
                                                                                              

In [32]:
a, b = next(generate_batch())

In [33]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Define callback functions
early_stopping = EarlyStopping(monitor='val_loss', patience=6, verbose=1, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1, min_lr=1e-7)

model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples/batch_size,
                    epochs=32,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples/batch_size,
                    callbacks=[early_stopping, reduce_lr])

  model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),


Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 18: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 22: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 29: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 30/32
Epoch 31/32
Epoch 32/32


<keras.src.callbacks.History at 0x7dc44ab7a2c0>

In [None]:
# model.save('eng-to-hindi.h5')

In [99]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1

In [44]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [70]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

### Prediction

In [108]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
print(input_seq)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

[[3156.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.]]
Input English sentence: modified
Actual Hindi Translation:  रूपांतरक 
Predicted Hindi Translation:  परिवर्धितः 


In [103]:
inp = "next important message"
input_text = inp.lower()
input_seq = [input_token_index[word] for word in input_text.split()]
input_seq = np.array(input_seq)
input_seq = input_seq.reshape(1, -1, 1)
decoded_sent = decode_sequence(input_seq)
print(decoded_sent[:-4])

[3301, 2391, 3075]
[3301 2391 3075]
[[[3301]
  [2391]
  [3075]]]
 अगला महत्वपूर्ण संदेश i 


In [110]:
inp = "unread messages"
input_text = inp.lower()
input_seq = [input_token_index[word] for word in input_text.split()]
print(input_seq)
input_seq = np.array(input_seq)
print(input_seq)
input_seq = input_seq.reshape(1, -1, 1)
print(input_seq)
decoded_sent = decode_sequence(input_seq)
print(decoded_sent[:-4])

[5325, 3076]
[5325 3076]
[[[5325]
  [3076]]]
 अपठित संदेश 


In [106]:
inp = "give your application"
input_text = inp.lower()
input_seq = [input_token_index[word] for word in input_text.split()]

# Pad the input_seq to length 20 with zeros
max_length = 20
if len(input_seq) < max_length:
    input_seq = input_seq + [0] * (max_length - len(input_seq))

print(input_seq)
input_seq = np.array(input_seq, dtype=float).reshape(1, 20)
print(input_seq)

decoded_sent = decode_sequence(input_seq)
print(decoded_sent[:-5])


[2087, 5711, 241, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[[2087. 5711.  241.    0.    0.    0.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.]]
 अपने अनुप्रयोग को कान्फिगर करो


In [112]:
# Save the encoder model
encoder_model.save("encoder_model.keras")

# Save the decoder model
decoder_model.save("decoder_model.keras")


In [114]:
import pickle

file_name = "input_token_index.pkl"

# Use pickle.dump() to save the dictionary to the file
with open(file_name, "wb") as file:
    pickle.dump(input_token_index, file)

In [115]:
file_name = "target_token_index.pkl"

# Use pickle.dump() to save the dictionary to the file
with open(file_name, "wb") as file:
    pickle.dump(target_token_index, file)

file_name = "reverse_target_char_index.pkl"

# Use pickle.dump() to save the dictionary to the file
with open(file_name, "wb") as file:
    pickle.dump(reverse_target_char_index, file)