In [39]:
import datasets
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
import torchtext
from torchtext.data.utils import get_tokenizer
from nltk.util import ngrams
import random
import inltk
# from inltk.inltk import setup
# from inltk.inltk import tokenize as tokenize_hi
from indicnlp.tokenize import indic_tokenize

In [40]:
opus = datasets.load_dataset('opus100', 'en-hi', split='train', cache_dir='../data/OPUS/')

Found cached dataset opus100 (/home/shreya/Shreya/College/Sem6/NLP/Code-mix/temp/../data/OPUS/opus100/en-hi/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)


In [169]:
print("Size of dataset: ", len(opus))

Size of dataset:  534319


In [172]:
print(f"Entry in OPUS: {opus[0]}")

Entry in OPUS: {'translation': {'en': 'Other, Private Use', 'hi': 'अन्य, निज़ी उपयोग'}}


In [130]:
class MixDataset(Dataset):
    def __init__(self, opus_data) :
        self.tokenizer_en = get_tokenizer('basic_english')
        # self.tokenizer_hi = get_tokenizer('indic_tokenize', language='hi')
        self.opus_data = opus_data
 
    
    def __len__(self):
        return len(self.opus_data)
    
    
    def __getitem__(self, idx):
        en, hi =  self.opus_data[idx]['translation']['en'], self.opus_data[idx]['translation']['hi']
        en = self.tokenizer_en(en)
        hi = indic_tokenize.trivial_tokenize_indic(hi)
        bi_grams_en = ngrams(en, 2)
        bi_grams_hi = ngrams(hi, 2)
        bi_en, bi_hi = [], []
        for w1, w2 in bi_grams_en :
            bi_en.append(w1 + '_' + w2)
        for w1, w2 in bi_grams_hi :
            bi_hi.append(w1 + '_' + w2)
        ret = en + hi + bi_en + bi_hi
        print(ret)
        random.shuffle(ret)
        print(ret)
        return ret
        
        

In [135]:
ds = MixDataset(opus)
dl = DataLoader(ds, batch_size=1, shuffle=True)

In [57]:
# text = 'अन्य, निज़ी उपयोग'
# indic_tokenize.trivial_tokenize_indic(text)

# st = "Other, Private Use"
# tokenizer = get_tokenizer('basic_english')
# st_tok = tokenizer(st)
# bigrams = ngrams(st_tok, 2)
# list(bigrams)

In [69]:
from gensim.models import Word2Vec, KeyedVectors

In [59]:
# txt = [['अन्य', 'निज़ी', 'उपयोग'], ['Other', 'Private', 'Use']]
model = Word2Vec(sentences=dl, vector_size=100, window=5, min_count=1, workers=8)

In [62]:
model.train(dl, total_examples=len(opus), epochs=10)

(282331604, 301995180)

In [63]:
word_vectors = model.wv
word_vectors.save("../models/word2vec.wordvectors")

In [165]:
def load_wvs(path='../models/word2vec.wordvectors'):
    return KeyedVectors.load(path, mmap='r')

In [139]:

word_vectors.index_to_key[:10]

[('.',),
 (',',),
 ('the',),
 ('(',),
 (')',),
 ('है',),
 ('और',),
 ('and',),
 ('के',),
 ('to',)]

In [144]:
print(word_vectors.index_to_key[7])
word_vectors.get_vector(word_vectors.index_to_key[7])

('and',)


array([ 0.41233677,  0.1988115 ,  0.6786996 , -0.180497  ,  0.82508934,
        0.7571229 , -1.4232038 , -1.0387919 , -0.20736161,  0.24093974,
        0.14588594, -1.1100832 , -1.5298404 , -0.39773425,  0.48344457,
       -0.01060129,  0.706726  ,  0.7970445 ,  0.25171965, -0.57115126,
       -0.1276712 , -0.34812385, -0.41411856, -0.07656819, -0.6181094 ,
        0.83155215,  0.10770226, -0.45943722,  0.6202324 ,  0.4555967 ,
        0.91622734,  0.9224759 , -1.0649984 , -0.02858173,  0.3213145 ,
       -0.32913533, -0.48386213, -0.43237954,  0.89069587, -0.2109292 ,
        1.2732015 ,  0.423429  , -0.26262724,  0.49468172, -0.45935556,
        0.14522399,  0.14959668,  0.4862668 , -1.5445775 ,  0.96105987,
       -0.15001087, -0.29731387,  0.58272845,  0.68863237, -0.56848514,
        0.64961565,  0.2711853 , -0.13970982,  0.10464289,  1.2039775 ,
        0.83498794, -1.077584  ,  0.08991605,  0.11904134,  0.07783502,
       -0.44455525, -1.334287  ,  0.6559434 , -0.9875546 , -0.39

In [151]:
from sklearn.metrics.pairwise import cosine_similarity

simi = cosine_similarity([word_vectors.get_vector(word_vectors.index_to_key[7])], [word_vectors.get_vector(word_vectors.index_to_key[6])])[0, 0]
print(f'Similarity between {word_vectors.index_to_key[7]} and {word_vectors.index_to_key[6]} is {simi}')

simi2 = cosine_similarity([word_vectors.get_vector(word_vectors.index_to_key[6])], [word_vectors.get_vector(word_vectors.index_to_key[547])])[0, 0]
print(f'Similarity between {word_vectors.index_to_key[6]} and {word_vectors.index_to_key[547]} is {simi2}')


Similarity between ('and',) and ('और',) is 0.9570193886756897
Similarity between ('और',) and ('fire',) is 0.2974220812320709


In [164]:
def most_similar(word, topn=10):
    return word_vectors.most_similar(word, topn=topn)

print("10 most similar words to '" + word_vectors.index_to_key[6][0] + "' :")
most_similar(word_vectors.get_vector(word_vectors.index_to_key[6]))

10 most similar words to 'और' :


[(('और',), 1.0),
 ((',_and',), 0.9642396569252014),
 (('and',), 0.9570194482803345),
 (('._and',), 0.8048928380012512),
 (('है_और',), 0.7780988812446594),
 (('and_the',), 0.7698213458061218),
 ((')_और',), 0.6910834908485413),
 (('and_to',), 0.682357907295227),
 (('who',), 0.6722629070281982),
 (('भी',), 0.6722161173820496)]

In [167]:
print("10 most similar words to '" + word_vectors.index_to_key[10][0] + "' :")
most_similar(word_vectors.get_vector(word_vectors.index_to_key[10]))

10 most similar words to 'of' :


[(('of',), 1.0),
 (('of_the',), 0.9338359236717224),
 (('the',), 0.879870593547821),
 (('की',), 0.7434549927711487),
 (('से',), 0.7245144844055176),
 (('का',), 0.7084817886352539),
 (('में_से',), 0.6923059821128845),
 (('के',), 0.6758306622505188),
 (('._the',), 0.6551926732063293),
 (('and',), 0.6516838073730469)]

In [168]:
def gen_cmsentence(en_sentence, word_vectors, threshold=0.3) :
    tokenizer_en = get_tokenizer('basic_english')
    sent_en =  tokenizer_en(en_sentence)
    sent_cm = []
    for i in range(len(sent_en)) :
        sent_en[i] = most_similar(sent_en[i])[0][0]
        replace = random.random() < threshold
        if not replace :
            sent_cm.append(sent_en[i])
        else: 
            sent_cm.append(most_similar(sent_en[i])[1][0])
        