In [1]:
import gensim
import pandas as pd
import numpy as np
import pickle

In [2]:
# Load the training corpus; later cells read its `eng` and `hin` columns.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Collect every distinct whitespace-separated token in the English column.
# Tokens are kept exactly as written (no lower-casing or punctuation stripping).
eng_unique = {
    token
    for line in df.eng
    for token in line.split()
}

In [4]:
len(eng_unique) # display the number of distinct English tokens

3707

In [5]:
# find the unique hindi words
hin_unique = set()
for sentence in df.hin:
    for word in sentence.split():
        hin_unique.add(word)

In [6]:
len(hin_unique) # display the number of distinct Hindi tokens

4954

In [7]:
# Load pretrained English word2vec vectors from a text-format file.
# `limit` caps how many entries are read, so only the first 200000 are kept.
model = gensim.models.KeyedVectors.load_word2vec_format(
    './word2vec/model.txt',
    binary=False,
    limit=200000,
)

In [8]:
# Build the English lookup table: word -> (token index, embedding vector).
# Words absent from the pretrained vocabulary still receive an index but are
# given an all-zero vector; eng_not_found counts how many such words there are.
eng_embedding = {}
eng_not_found = 0
# Enumerate in sorted order: iterating a set of strings directly yields a
# different order on each interpreter run (string hash randomisation), which
# would make the token indices written to the pickle irreproducible.
for indx, word in enumerate(sorted(eng_unique)):
    if word in model:
        eng_embedding[word] = indx, model[word]
    else:
        # Size the placeholder from the loaded model instead of hard-coding 300.
        # NOTE(review): np.zeros defaults to float64; confirm whether downstream
        # code expects the same dtype as the pretrained vectors.
        eng_embedding[word] = indx, np.zeros(model.vector_size)
        eng_not_found += 1
print(eng_not_found)

1382


In [9]:
# Load pretrained Hindi fastText vectors from a text-format .vec file.
# `limit` caps how many entries are read, so only the first 200000 are kept.
hindi_model = gensim.models.KeyedVectors.load_word2vec_format(
    'cc.hi.300.vec',
    binary=False,
    limit=200000,
)

In [10]:
# Build the Hindi lookup table: word -> (token index, embedding vector).
# Words absent from the pretrained vocabulary still receive an index but are
# given an all-zero vector; hin_not_found counts how many such words there are.
hin_embedding = {}
hin_not_found = 0
# Enumerate in sorted order: iterating a set of strings directly yields a
# different order on each interpreter run (string hash randomisation), which
# would make the token indices written to the pickle irreproducible.
for indx, word in enumerate(sorted(hin_unique)):
    if word in hindi_model:
        hin_embedding[word] = indx, hindi_model[word]
    else:
        # Size the placeholder from the loaded model instead of hard-coding 300.
        # NOTE(review): np.zeros defaults to float64; confirm whether downstream
        # code expects the same dtype as the pretrained vectors.
        hin_embedding[word] = indx, np.zeros(hindi_model.vector_size)
        hin_not_found += 1
print(hin_not_found)

1385


In [11]:
# Persist both lookup tables (word -> (index, vector)) as pickle files,
# English first, then Hindi.
for pkl_path, table in (
    ('english_model.pkl', eng_embedding),
    ('hindi_model.pkl', hin_embedding),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(table, f)