In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud

import re
import sklearn
import os

import nlpaug.augmenter.word as naw
import nlpaug.model.word_stats as nmw

In [2]:
def read_file(file_dir, encoding=None):
    """Read a text file and return its lines without the trailing newline.

    Parameters
    ----------
    file_dir : str or path-like
        Path of the file to read (a file path, despite the parameter name).
    encoding : str, optional
        Text encoding forwarded to ``open``. ``None`` (the default) keeps
        Python's platform-default encoding.

    Returns
    -------
    list of str
        One entry per line, in file order. An empty file gives ``[]``.
    """
    with open(file_dir, 'r', encoding=encoding) as f:
        # Only '\n' is stripped, so any other leading/trailing whitespace
        # on a line is preserved.
        return [line.strip('\n') for line in f]

def read_array(file_dir):
    """Parse a file of space-separated numbers into a NumPy array.

    Parameters
    ----------
    file_dir : str or path-like
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from one parsed array per line of the file.
        NOTE(review): lines with differing token counts would give ragged
        rows; the visible code does not guard against that.
    """
    # Named `rows` rather than `str` so the builtin is not shadowed.
    rows = []
    with open(file_dir, 'r') as f:
        for line in f:
            tokens = line.strip('\n').split(' ')
            # Each token is handed to genfromtxt as its own input line, so
            # one file line becomes one 1-D array of parsed values.
            rows.append(np.genfromtxt(np.array(tokens)))
    return np.array(rows)

def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(text)

In [3]:
def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, vector)`` pair.

    ``word`` is returned unchanged. Every positional value after it except
    the last is converted to a float64 array; the last value is always
    discarded (presumably a trailing newline token left over from the
    split -- confirm against the embedding file format).
    """
    components = arr[:-1]
    vector = np.asarray(components, dtype='float64')
    return word, vector

def load_embedding(file):
    """Load a text embedding file into a ``{word: vector}`` dict.

    Each line longer than 100 characters is split on single spaces and
    passed to ``get_coefs``; shorter lines are skipped (presumably to drop
    a header line -- confirm against the file format).

    Parameters
    ----------
    file : str or path-like
        Path of the embedding file.

    Returns
    -------
    dict
        Mapping built from the ``(word, vector)`` pairs returned by
        ``get_coefs``. Later duplicates of a word overwrite earlier ones.
    """
    # The context manager closes the handle once the dict is built; the
    # generator must be consumed inside the `with` block.
    with open(file) as f:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in f if len(o) > 100)
    return embeddings_index


def make_embedding_matrix(embedding, tokenizer, len_voc):
    """Build a ``(len_voc, embed_size)`` matrix aligned with tokenizer indices.

    Every row starts as a draw from a normal distribution whose mean and
    standard deviation are computed over all values in ``embedding``. Row
    ``i`` is then overwritten with the embedding of the word whose
    tokenizer index is ``i``, when that word has an entry in ``embedding``.

    Parameters
    ----------
    embedding : dict
        Mapping of word to a 1-D vector; all vectors must share one length.
    tokenizer : object
        Anything exposing a ``word_index`` mapping of word to integer index.
    len_voc : int
        Number of rows in the result; words with index >= ``len_voc`` are
        ignored.

    Returns
    -------
    numpy.ndarray
        The embedding matrix. Rows without a matching word keep their
        random initialisation, so the result is not deterministic unless
        NumPy's global random state is seeded.
    """
    # np.stack needs a real sequence; a dict view is rejected by current
    # NumPy (and only warned about on older releases), hence list().
    all_embs = np.stack(list(embedding.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]
    word_index = tokenizer.word_index
    embedding_matrix = np.random.normal(emb_mean, emb_std, (len_voc, embed_size))

    for word, i in word_index.items():
        if i >= len_voc:
            continue
        embedding_vector = embedding.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [4]:
def make_tokenizer(texts, len_voc):
    """Create a Keras ``Tokenizer`` limited to ``len_voc`` words and fit it.

    The tokenizer is constructed with ``num_words=len_voc``, fitted on
    ``texts`` via ``fit_on_texts`` and returned.
    """
    from keras.preprocessing.text import Tokenizer
    tokenizer = Tokenizer(num_words=len_voc)
    tokenizer.fit_on_texts(texts)
    return tokenizer

In [5]:
# Word -> vector mapping loaded from disk (a dict, despite the `_array` name).
emb_array = load_embedding('data/bora.emb.vec')
# Vocabulary size is taken to be the number of loaded embedding entries.
len_voc = len(emb_array)

# One training sentence per line of the file.
train_x = read_file("data/bora.train")
# Tokenizer fitted on the training lines, capped at `len_voc` words.
tokenizer = make_tokenizer(train_x, len_voc)

# `len_voc`-row matrix: row i holds the embedding of the word with tokenizer
# index i when one exists, otherwise a random draw.
embed_mat = make_embedding_matrix(emb_array, tokenizer, len_voc)

Using TensorFlow backend.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
from sklearn.neighbors import NearestNeighbors

# Number of neighbours kept per word, and the upper row bound for the
# exploratory query below.
synonyms_number = 5
word_number = 2000

# One extra neighbour is requested -- presumably because each query row is
# expected to come back as its own nearest neighbour (TODO confirm).
# `nn` is reused by the later cells, so this cell must run before them.
nn = NearestNeighbors(n_neighbors=synonyms_number + 1).fit(embed_mat)
# [1] keeps the second element of the returned pair (the neighbour indices)
# for rows 1 .. word_number - 1 of `embed_mat`.
neighbours_mat = nn.kneighbors(embed_mat[1:word_number])[1]
# Keyed by the first returned index, with the remaining indices as values.
# NOTE(review): this assumes the first neighbour of every query row is the
# row itself; ties or duplicate rows could violate that.
# NOTE(review): `synonyms` is reassigned in a later cell from `neighbours_target`.
synonyms = {x[0]: x[1:] for x in neighbours_mat}

In [8]:
# Pair returned by kneighbors (distances and neighbour indices) for rows
# 1 .. word_number - 1 of `embed_mat`.
# NOTE(review): `neighbors_matrix` is not read by any other visible cell.
neighbors_matrix = nn.kneighbors(X = embed_mat[1:word_number], return_distance=True)

In [12]:
# Reverse lookup of the tokenizer vocabulary: integer index -> word.
# Index 0 is mapped to the empty string up front.
index_word = {0: ''}
for word, position in tokenizer.word_index.items():
    index_word[position] = word

In [7]:
# Query the fitted model with every row of `embed_mat`; the returned pair is
# unpacked as (distances, neighbour indices), one row per query row.
all_distance, all_neighbors = nn.kneighbors(X = embed_mat, return_distance=True)

finish


In [42]:
# Select the row indices whose words are eligible for synonym replacement:
# the mean distance to the neighbours (first column skipped -- presumably the
# self-match, TODO confirm) must be below 40, and the word itself must be
# longer than two characters.
# NOTE(review): 40 is a hand-picked cutoff tied to this embedding's scale.
index_target = []
for i in range(all_distance.shape[0]):
    # Rows without a tokenizer word have no entry in `index_word`; defaulting
    # to '' excludes them instead of raising TypeError on len(None).
    if (all_distance[i][1:].mean() < 40 and len(index_word.get(i, '')) > 2):
        index_target.append(i)

In [47]:
# Neighbour indices (element [1] of the returned pair) for only the rows
# selected in `index_target`.
neighbours_target = nn.kneighbors(embed_mat[index_target])[1]

finish


In [48]:
# Map the first index of every neighbour row to the remaining indices of
# that row. NOTE(review): assumes the first index is the query word itself.
synonyms = {row[0]: row[1:] for row in neighbours_target}

In [53]:
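# Sentence augmentation: randomly swap words for their embedding-space neighbours (the result is written to disk in a later cell)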

def modify_sentence(sentence, synonyms_words, p=0.5):
    """Randomly replace tokens of ``sentence`` with one of their synonyms.

    Parameters
    ----------
    sentence : list of str
        Tokens of the sentence. The list is modified in place.
    synonyms_words : dict
        Mapping of token to a list of candidate replacements.
    p : float, default 0.5
        Threshold for the per-token uniform draw: a token is considered for
        replacement only when the draw exceeds ``p``, so larger values keep
        more of the original words.

    Returns
    -------
    list
        The same ``sentence`` list that was passed in, after replacement.
        Tokens that have no entry in ``synonyms_words``, or whose candidate
        list is empty, are left unchanged.
    """
    for i, token in enumerate(sentence):
        if np.random.random() > p:
            syns = synonyms_words.get(token)
            # np.random.choice raises ValueError on an empty sequence, so
            # words whose candidate list is empty are skipped as well.
            if syns is not None and len(syns) > 0:
                sentence[i] = np.random.choice(syns)
    return sentence

In [76]:
# Translate the index-based `synonyms` table into a word -> [words] table.
# Only the first `synonyms_number - 1` neighbours of each word are looked up
# (NOTE(review): confirm that dropping the last neighbour is intended), and
# neighbour words of two characters or fewer are discarded.
synonyms_words = {}
for x in index_target:
    try:
        neighbour_words = [index_word[synonyms[x][k]] for k in range(synonyms_number - 1)]
        synonyms_words[index_word[x]] = [w for w in neighbour_words if len(w) > 2]
    except KeyError:
        # Words with a missing `synonyms` or `index_word` entry are skipped.
        pass

In [99]:
# Append one augmented copy of every training sentence to the output file.
# The file is opened in append mode, so each run of this cell adds another
# copy rather than overwriting earlier ones.
with open('output/bora_embed.txt', 'a') as f:
    for x in train_x:
        tokens = x.split(' ')
        augmented = modify_sentence(tokens, synonyms_words, p=0)
        f.write(' '.join(augmented) + '\n')

finish
