### Imports

In [1]:
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from itertools import combinations
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bert_embedding import BertEmbedding
from allennlp.commands.elmo import ElmoEmbedder
from sklearn.model_selection import train_test_split

from transformers import *
import torch
import keras

import imp, gzip
import pickle, nltk
import gensim
import multiprocessing
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils as my_utils

Using TensorFlow backend.


### Definitions

In [2]:
def get_edges(i):
    t = np.where(i>0)[0]
    comb = combinations(t, 2)
    embeds = {j:[] for j in t}

    for p, q in comb:
        if word_similarity[p][q]:
            embeds[p] += [q]
            embeds[q] += [p]
    return embeds

### Config

In [3]:
dataset_name = "amazon_home_20000"

min_df = 5
max_df = .5
max_features = 50000
cutoffs = [0.3, 0.6]

n_cores = 40

### Start

In [4]:
dataset = pd.read_pickle("datasets/" + dataset_name + "_dataset")

In [5]:
vectorizer = CountVectorizer(analyzer="word",tokenizer=None,preprocessor=None,
                             stop_words="english", max_features=max_features,
                             max_df=max_df, min_df=min_df)

In [6]:
wordOccurenceMatrix = vectorizer.fit_transform(dataset.text.tolist()).toarray()


In [7]:
barren = np.where(wordOccurenceMatrix.sum(1)<=1)[0]

In [8]:
barren

array([10465, 15945])

In [9]:
words = vectorizer.get_feature_names()

# Embeddings

## Bert Embedding & Attention

In [10]:
embedding_name = 'bert'

In [11]:
pretrained_weights = 'bert-base-uncased'

In [12]:
model = BertModel.from_pretrained(pretrained_weights, output_hidden_states=True, output_attentions=True)

In [13]:
tokenizer = BertTokenizer(vocab_file='resources/archive/bertvocab_' + dataset_name + '.txt', never_split=True, do_basic_tokenize=False)

In [14]:
tokenized_text = [tokenizer.tokenize(i) for i in dataset.text]

In [15]:
%%time
temp = []
for i in tqdm(tokenized_text):
    t = [j for j in i if j in words]
    temp.append(t)
    
tokenized_text = temp

100%|██████████| 20000/20000 [00:30<00:00, 656.53it/s]


CPU times: user 24.9 s, sys: 134 ms, total: 25.1 s
Wall time: 30.5 s


In [16]:
indexed_tokens = [tokenizer.convert_tokens_to_ids(i) for i in tokenized_text]

In [17]:
input_ids = keras.preprocessing.sequence.pad_sequences(indexed_tokens, padding='post', dtype='long', maxlen=max([len(i) for i in indexed_tokens]))

In [18]:
input_ids = torch.tensor(input_ids)

In [19]:
input_ids = torch.split(input_ids, 1000, dim=0)

In [20]:
len(input_ids)

20

In [21]:
pad_length = [len(i) for i in indexed_tokens]

In [22]:
idx = 0
similar_words_bert = []
similar_words_bert_attention = []

In [None]:
for batch in input_ids:

    all_embeddings, _, _, all_attentions = model(batch)
    for one_attentions in all_attentions[0].detach().numpy():

#         one_side_edges = np.argmax(one_attentions[9], axis=1) #taking 9 layer of attention
        embeds = {words.index(j):[] for j in tokenized_text[idx]}
        
        for one_side_edges in np.argmax(one_attentions, axis=2): #taking all layers top confident value
            for j, i in enumerate(one_side_edges[:pad_length[idx]]):
                if i < pad_length[idx]:
                    embeds[words.index(tokenized_text[idx][i])] += [words.index(tokenized_text[idx][j])]
            
            for k, a in embeds.items():
                b = list(set(a))
                if k in b:
                    b.remove(k)
                embeds[k] = b
            
            similar_words_bert_attention.append(embeds)
        idx += 1
    print(idx)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000


In [None]:
pickle_out = open("resources/"+ dataset_name +"_" + 'bert_attention_all'+ ".pickle","wb")
pickle.dump(similar_words_bert_attention, pickle_out)
pickle_out.close()

In [None]:
# for batch in tqdm(input_ids):

#     all_embeddings, _, _, all_attentions = model(batch)
#     idx_copy = deepcopy(idx)
    
#     print(idx)
#     for one_embedding in all_embeddings.detach().numpy():
#         word_embeddings = one_embedding[:pad_length[idx]]
#         word_similarity = cosine_similarity(word_embeddings)
#         remove = np.where(word_similarity == 1.000) # to remove self words coupling

#         for i, j in zip(remove[0], remove[1]):
#             word_similarity[i][j] = 0
#             word_similarity[j][i] = 0

#         word_similarity = word_similarity > cutoff
#         word_similarity = word_similarity.astype(int)
#         np.fill_diagonal(word_similarity, 0)

#         inds = np.where(word_similarity==1)
#         embeds = {words.index(j):[] for j in tokenized_text[idx]}

#         for i, j in zip(inds[0], inds[1]):
#             embeds[words.index(tokenized_text[idx][i])] += [words.index(tokenized_text[idx][j])]
#         similar_words_bert.append(embeds)
    
#     idx = deepcopy(idx_copy)
    
#     print(idx)
    
#     for one_attentions in all_attentions[0].detach().numpy():

#         one_side_edges = np.argmax(one_attentions[9], axis=1) #taking 9 layer of attention
#         embeds = {words.index(j):[] for j in tokenized_text[idx]}

#         for j, i in enumerate(one_side_edges[:pad_length[idx]]):
#             if i < pad_length[idx]:
#                 embeds[words.index(tokenized_text[idx][i])] += [words.index(tokenized_text[idx][j])]
#         similar_words_bert_attention.append(embeds)
#         idx += 1
#     print(idx)

In [None]:
# pickle_out = open("resources/"+ dataset_name + "_" + str(n_docs) +"_" + 'bert' + "_" + str(cutoff) + ".pickle","wb")
# pickle.dump(similar_words_bert, pickle_out)
# pickle_out.close()

In [None]:
# pickle_out = open("resources/"+ dataset_name + "_" + str(n_docs) +"_" + 'bert_attention'+ ".pickle","wb")
# pickle.dump(similar_words_bert_attention, pickle_out)
# pickle_out.close()

### Appendix

In [None]:
# elmo = ElmoEmbedder()

In [None]:
        ### POS
#         pp = np.array([i[1] for i in nltk.pos_tag(words)])
#         pp[pp=='JJ'] = 1
#         pp[pp=='JJR'] = 1
#         pp[pp=='JJS'] = 1
#         pp[pp=='NN'] = 1
#         pp[pp=='NNS'] = 1
#         pp[pp=='NNP'] = 1
#         pp[pp=='NNPS'] = 1
#         pp[pp!='1'] = 0
#         pp = pp.astype(int)

#         wordOccuranceMatrixBinary[:, np.where(pp!=1)[0]] = 0


In [None]:
# wordOccuranceMatrixBinary[0].sum()

# np.sum(wordOccuranceMatrixBinary)

# Counter(np.array([i[1] for i in nltk.pos_tag(words)]))

# pp.sum()

# np.where(pp!=1)[0].shape

In [None]:
# %%time
# for embedding_name in ['bert', 'elmo']:
#     for cutoff in cutoffs:
#         print(embedding_name, cutoff)
#         pool = multiprocessing.Pool(n_cores)
#         similar_words = pool.map(get_edges_transformers, dataset.text.tolist())
#         pool.close()
#         pickle_out = open("resources/"+ dataset_name + "_" + str(n_docs) + "_" + embedding_name + "_" + str(cutoff) + ".pickle","wb")
#         pickle.dump(similar_words, pickle_out)
#         pickle_out.close()

In [None]:
#     pd = pd.apply(lambda x: convert_numbers(x))

In [None]:
# def process_df(df):
#     df['text'] = preprocess(df['reviewText'])
    
# #     pool = multiprocessing.Pool(n_cores)
# #     df['cleaned'] = pool.map(process_l, df['text'].tolist())
# #     pool.close()
    
# #     df['text'] = df['cleaned'].apply(lambda x: " ".join(x))
#     return df

In [None]:
# p = [item for sublist in dataset['cleaned'].tolist() for item in sublist]

In [None]:
# sorted(Counter(p))

In [None]:
# def process_l(s):
#     return [i.lemma_ for i in sp(s) if i.lemma_ not in '-PRON-']

In [None]:
# l = dataset['text'].tolist()

In [None]:
# pool = multiprocessing.Pool(n_cores)
# processed_l = pool.map(process_l, l)
# pool.close()

In [None]:
# joblib.dump(sampler, "resources/sampler_20iter_0.5_1")

In [None]:
# pickle_out = open("resources/amazon_muiscal_glove_0.4.pickle","wb")
# pickle.dump(similar_words, pickle_out)
# pickle_out.close()