# Load libraries

We are using Pytorch for the extracting word embeddings part. We leverage transformers - the pytorch interface for BERT by Hugging Face.

If you're running this code on Google Colab, you will have to install transformers library each time you reconnect.


In [1]:
# !pip install pytorch-pretrained-bert
!pip install transformers
import torch
#from pytorch_pretrained_bert import BertModel, BertForMaskedLM
from transformers import BertModel, BertTokenizer

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)
import os

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/cd/38/c9527aa055241c66c4d785381eaf6f80a28c224cae97daa1f8b183b5fabb/transformers-2.9.0-py3-none-any.whl (635kB)
[K     |▌                               | 10kB 18.2MB/s eta 0:00:01[K     |█                               | 20kB 2.2MB/s eta 0:00:01[K     |█▌                              | 30kB 2.9MB/s eta 0:00:01[K     |██                              | 40kB 3.2MB/s eta 0:00:01[K     |██▋                             | 51kB 2.6MB/s eta 0:00:01[K     |███                             | 61kB 2.9MB/s eta 0:00:01[K     |███▋                            | 71kB 3.1MB/s eta 0:00:01[K     |████▏                           | 81kB 3.4MB/s eta 0:00:01[K     |████▋                           | 92kB 3.6MB/s eta 0:00:01[K     |█████▏                          | 102kB 3.5MB/s eta 0:00:01[K     |█████▊                          | 112kB 3.5MB/s eta 0:00:01[K     |██████▏                         | 122kB 3.5M

# Load the model


First, we mount the Google drive so that we can access the model stored in Google Drive directory. 

In [2]:
# Mount Google Drive to this Notebook instance.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Setting path variables:

We pre-trained the model and saved the tf checkpoint. Now we are going to use pytorch. Pytorch can not load a model stored with tf checkpoint.

We have to first convert the tf checkpoint to pytorch model.

my_dir is the Google Drive directory where you have stored your tf checkpoint with config file and vocab.txt.
bert_model_ckpt is the checkpoint
bert_model_config_file is the config file

Set BUILD_PYTORCH_MODEL = False if you have pytorch model stored in your Google Drive directory

We then import the pretrained BERT model, and a tokenizer from Google drive directory.

In [0]:
# Main Directory for the project
my_dir = "___" 

# Setting path to save Bert model
bert_model_dir = os.path.join(my_dir, '<folder_where_bert_model_is>')
bert_model_ckpt = 'xxx.ckpt'
bert_model_config_file = 'config.json'        # Should be always config.json.

# Directory where all the file are kept for processing.
text_file_path = os.path.join(my_dir, 'Sample_Data_2')

# Path where ouput can be stored.
save_dir = os.path.join(my_dir, "Outputs")
# os.mkdir(save_dir)      # if the directory doesn't exist.
output_file_name = "output_300_cluster_200_iter.csv"

In [0]:
import os
import shutil

BUILD_PYTORCH_MODEL = False #@param {type:"boolean"}

if BUILD_PYTORCH_MODEL:
  
  cur_dir = os.getcwd()
  os.mkdir('PyTorch_Model_Build')
  os.chdir('PyTorch_Model_Build')
  os.system('git clone https://github.com/huggingface/transformers')
  os.chdir("transformers")

  cmd = "transformers-cli convert --model_type bert --tf_checkpoint '{}' --config '{}' --pytorch_dump_output '{}'".format(
      os.path.join(bert_model_dir, bert_model_ckpt), 
      os.path.join(bert_model_dir, bert_model_config_file),
      os.path.join(bert_model_dir, 'pytorch_model.bin')
      )
  out_print = os.system(cmd)
  print(out_print)
  os.chdir(cur_dir)
  shutil.rmtree('PyTorch_Model_Build')

# Load a trained model and vocabulary that you have fine-tuned
model = BertModel.from_pretrained(bert_model_dir)
tokenizer = BertTokenizer.from_pretrained(bert_model_dir)

BERT tokenizer stores a dictonary that maps the tokens in vocab.txt to the line number for those words in vocab.txt. This dictionary can be accessed by calling tokenizer.vocab

In tokenizer.vocab, key are the tokens and the values are their line number

We store a dictionary where keys are the line numbers and the values are the tokens at those line numbers.

In [0]:
vocab_dict = dict([(value, key) for key, value in tokenizer.vocab.items()]) 

We store all the wiki words from vocab.txt in a set

In [6]:
wiki_vocab_words = set()
for word in tokenizer.get_vocab():
  if word.startswith("wiki") and word is not "wikipedia":
    wiki_vocab_words.add(word.lower())

print(wiki_vocab_words)

{'wikiaccountancyzsqe', 'wikihomeworkezbsfb', 'wikiemailqlea', 'wikistemfieldsefelbbe', 'wikiworkexperienceiiifloe', 'wikifascismiiosf', 'wikiwikiezasi', 'wikisakezasoo', 'wikilaptopiqasaf', 'wikisocialmediasaqllfz', 'wikisocialscienceszblai', 'wikisexismzlibs', 'wikinonprofitorganizationlzfal', 'wikielectricalengineeringqsei', 'wikiwhitelistiazebq', 'wikiundergraduateeducationzilzaq', 'wikiprofessoriblaoo', 'wikidoctorofphilosophyalls', 'wikigraphicsinterchangeformatizloz', 'wikiunitedstatesefeflso', 'wikidudeeesleq', 'wikiempathyeozeiq', 'wikidirectimagefunctoreozzlff', 'wikicomputerscienceseze', 'wikicapitalismsfib', 'wikirussiazseqi', 'wikiunemploymenteilfi', 'wikitenureeifqsi', 'wikiaccountabilityibiqle', 'wikiblogeebfs', 'wikifeminismiiias', 'wikiwikipediasofelef', 'wikiliberalartsiablf', 'wikipoliticalsciencezfeaa', 'wikirhetoriczsffl', 'wikiivyleagueifqls', 'wikipermalinkalaqzb', 'wikistandardizedtestzzsqli', 'wikilolzaaelq', 'wikijournalismisqza', 'wikisyllabusiqllzas', 'wikis

#Retrieve Embeddings


In [7]:
# !pip install nltk         # install required if package not present
import nltk
nltk.download('punkt')      # Download required for utilizing punkt tokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Function to identify the wiki words in a sentence.
* We have to identify if the wikiword is present in the dictionary or not.
* If the wikiword is present in the dictionary, then it is processed to remove brackets and lower case.
* If the wiki words is not present then it is broken into its original english words and a mapping is maintained.

In [0]:
import re
def de_wikify_non_selected(sent, tokenizer_vocab):

  dewikified_words = {}
  wiki_vocab_map = {}
  wiki_words_from_sent = re.findall(r"[wW][iI][kK][iI]_\w+", sent)   # words : [wiki_life_of_pie_cdde, wiki_facebook_ppl]
  # print(wiki_words_from_sent)

  for word in wiki_words_from_sent:                     # mod_words : [wikilifeofpiecdde, wikifacebookppl]
    
    modified_wiki_word = word.replace("_", "")

    if modified_wiki_word.lower() in tokenizer_vocab:
      new_wiki_word = modified_wiki_word.lower()            # modify wikification
      wiki_vocab_map[new_wiki_word] = word

    else:
      word_r = re.sub("_+", "_", word)
      new_wiki_word = " ".join(word_r.split("_")[1:-1])  # de-wikify - wiki__life_of_pie__ccdde -> life of pie
      dewikified_words[word] = new_wiki_word.split()   # Save tokens {"wiki_life_of_pie_ccdde" : ['life', 'of','pie']}
    
    sent = sent.replace(word, new_wiki_word)       

  return sent, dewikified_words, wiki_vocab_map, len(wiki_words_from_sent)


*   Code to sample test function : de_wikify_non_selected(sent, wiki_vocab_words)

In [9]:
sent = "gt go get a  Wiki__Liberal_arts__iablf  degree n ht s i imgur"
print(de_wikify_non_selected(sent, wiki_vocab_words))
re.findall(r"wiki_\w+", sent.lower())

('gt go get a  wikiliberalartsiablf  degree n ht s i imgur', {}, {'wikiliberalartsiablf': 'Wiki__Liberal_arts__iablf'}, 1)


['wiki__liberal_arts__iablf']

BERT expects input in specific format. [CLS] token marks the start of the sentence and [SEP] token marks the end of the sentence. For example, say we want to pass ‘This is my test sentence.’ As an input to get the word embeddings. The expected input in this case will be [CLS] This is my test sentence. [SEP]

We passed one sentence at a time to the BERT model. The BERT tokenizer tries to convert the input sentence into different tokens. Initially, the input sentence is split on whitespaces giving the tokenizer all the words in the sentence. The aim of the tokenizer is to keep the all the words intact. Hence, the BERT tokenizer will try to map every word it finds in input to some token defined in vocab.txt. If it doesn’t find a token to map for any word, it will break down the word into smaller tokens. For example, vocab.txt doesn’t contain the word ‘embeddings’. Hence, the tokenizer splits the word embeddings into four different tokens [‘em’, ‘##bed’, ‘##ding’, ‘##s’] so that it retains some contextual meaning of the word.

BERT is trained on sentence pairs and it expects sentence pairs as an input. We usually distinguish between sentences in sentence pairs using 1s and 0s. In our case, we will always pass one sentence at a time. Hence, we create a vector of length same as the length of sentence tokens having all 1s. We pass this vector as segment IDs.

We get full set of hidden states as an output when we pass token IDs and segment IDs to the BERT model. This object has four dimensions, in the following order:
- The layer number (12 layers)
- The batch number (1 sentence)
- The token number (number of tokens in our sentence)
- The hidden units (768 features)

We further get rid of the batches dimension as batch number is always 1.
Now, every token is represented by 12 layers and every layer has 768 hidden units. We add up last four layers to create word vector for any particular token. Word vector of every word is a now an array of 768 items. In other words, we have every token represented in a 768-dimentional space.

For all the important lines the comments are added in the cell below.

In [0]:
def get_sentence_word_vetors(sentence, tokenizer):

  input_ids = []
  sent_word_vectors = []
  encoded_dict = tokenizer.encode_plus(
                  sentence,                      # Sentence to encode.
                  add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                  max_length = 512,           # Pad & truncate all sentences.
                  pad_to_max_length = True,
                  return_attention_mask = True,   # Construct attn. masks.
                  return_tensors = 'pt',     # Return pytorch tensors.
              )

  input_ids.append(encoded_dict['input_ids'])
  tokens_tensor = torch.cat(input_ids, dim=0)


  # Mark each of the 512 tokens as belonging to sentence "1".
  segments_ids = [1] * 512



  # Convert inputs to PyTorch tensors
  #tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensors = torch.tensor([segments_ids])

  # Put the model in "evaluation" mode, meaning feed-forward operation.
  model.eval()

  # Predict hidden states features for each layer
  with torch.no_grad():
      encoded_layers, _ = model(tokens_tensor, segments_tensors)
    
  # Concatenate the tensors for all layers. We use `stack` here to
  # create a new dimension in the tensor.
  #token_embeddings = torch.stack(encoded_layers, dim=0)
  token_embeddings = encoded_layers
  # token_embeddings.size()

  # Remove dimension 1, the "batches".
  token_embeddings = torch.squeeze(token_embeddings, dim=1)

  # Swap dimensions 0 and 1.
  token_embeddings = token_embeddings.permute(1,0,2)


  # Stores the token vectors, with shape [512 x 768]
  token_vecs_sum = []
  # For each token in the sentence...
  for token in token_embeddings:

      # Sum the vectors from the last four layers.
      sum_vec = torch.sum(token[-4:], dim=0)
      
      # Use `sum_vec` to represent `token`.
      token_vecs_sum.append(sum_vec)

  # print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

  for i in range(512):
    tok = (((encoded_dict['input_ids'])[0])[i]).item()
    
    vocab_word = vocab_dict[tok] 
    word_vector = token_vecs_sum[i]
    sent_word_vectors.append([vocab_word, word_vector]) # [["i", vec1], ["love", vec2], ["wikifacebook", vec3]]

  return sent_word_vectors

Processing Sentences which are in the form of an array: 
Simply removing the brackets and each string in one enclosed bracket is treated as a separate sentence


In [0]:
def process_arrays(data):
  return [sent.replace("[", "").replace("]","").strip() for sent in data.split("][")]

A quick sample test to check the word embedding on words with different meanings in different context. 

In [12]:
import numpy as np
sent = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."

sent_word_vecs = get_sentence_word_vetors(sent, tokenizer)

# for i, word in enumerate(sent_word_vecs):
#   print(i, word[0])

bank1 = sent_word_vecs[6][1].data.numpy()
bank2 = sent_word_vecs[10][1].data.numpy()
bank3 = sent_word_vecs[24][1].data.numpy()

def cos_sim(a,b):
  return np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))

print(cos_sim(bank1, bank2))

print(cos_sim(bank1, bank3))
print(cos_sim(bank2, bank3))

0.8307625
0.7632531
0.75604004


Function for processing data :


*   All the text files which need to be processed are expected to be kept in one directory.
*   Path of the directory - txt_file_path
*   Each file data is extracted and sentences are separated:
>*  For normal text we are using nltk tokeniser.
>*  For text files containing brackets [], each string in enclosed brackets is a sentence.

*   Embeddings for all the words in a sentence are extracted, however only the wikiwords embeddings are stored from each document.
*   If sentence has no wikiword then it is not processed at all.







In [13]:
import glob
from nltk.tokenize import sent_tokenize
import re


# Running embedding extraction on all the files kept in - text_file_path
files = glob.glob(text_file_path + os.sep + '/*.txt')
doc_num = 0

selected_word_vectors = []

for name in files:
  name_after_split = name.split('/')
  doc_name = name_after_split[len(name_after_split) -1]       # Maintaining document name
  doc_num += 1

  print("Running document : ", doc_name)  

  with open(name, encoding="utf-8") as f:
    
    count = 0
    for text in f:
      
      if re.search("\]\s*\[", text):                    # Check if file contains brackets
        sents = process_arrays(text)
      else:
        sents = sent_tokenize(text)                     # or normal sentences
      
      dewikified_words = {}
      
      for line in sents:
        
        print(line)
        
        mod_line, dewikified_words, wiki_vocab_map, wiki_count = de_wikify_non_selected(line, wiki_vocab_words)

        if wiki_count == 0:
          continue
        
        print("Processing line : ", mod_line)

        sent_word_vetors = get_sentence_word_vetors(mod_line, tokenizer)


        # After extracting word embeddings, we are storing only the wikiwords embeddings

        for word, vec in sent_word_vetors:
          if word.lower() in wiki_vocab_words:
            selected_word_vectors.append([doc_name, wiki_vocab_map[word.lower()], vec])

        # Reconstructing the embeddings for the wikiwords which were not among the top 100 selected.
        # Averaging the Embeddings of the individual english words belonging to a wikiword.
        # If the individual english word is also broken in bert tokenisation then the embedding is not stored.
        for key in dewikified_words:

          print(key)

          dewiki_vector = []
          dewiki_index = -1
          for word, vec in sent_word_vetors:

            if word.lower() == dewikified_words[key][dewiki_index+1].lower():   # {"wiki_life_of_pie_ccdde" : ['life', 'of','pie']}
              dewiki_vector.append(vec)
              dewiki_index += 1
            else:
              dewiki_index = -1
              dewiki_vector = []

            if dewiki_index >= len(dewikified_words[key]) - 1:
              avg_vec = torch.mean(torch.stack(dewiki_vector), dim=0)
              selected_word_vectors.append([doc_name, key, avg_vec])
              dewiki_index = -1
              dewiki_vector = []


Running document :  politics-all-subreddits-2017-12-2700.txt
butler county just swung by points towards jones granted it is a small county but still
what i am noticing is that even in counties moore is winning hes consistently down percent or more from the election if the urban centers have strong turn outs that could be enough
no doubt there but it is feasible hes basically running against the least palatable republican unless they reanimated adolf hitler and its still a close race
ba dum tish
too many people who call themselves religious miss the point of it they consider church and  Wiki__Righteousness__zqofel  to be the price you have to pay to be saved thats completely missing the point i may be catholic but im not draconian jewish muslim  Wiki__Atheism__iszflsfz  it doesnt matter to me my belief is that if you are a good and just person then god will not turn his back on you i mean if im being honest you might spend some time in  Wiki__Purgatory__ifleleio  for not recognizing the

# Cluster Words 1 
Implementing clustering using distance as cosine similarity

In [15]:
from nltk.cluster import KMeansClusterer
import nltk
import numpy as np 
from copy import deepcopy
from sklearn import cluster
from sklearn import metrics

all_tensors = []
all_words = []
for it in selected_word_vectors:
	all_words.append(deepcopy(it[1]))
	all_tensors.append(deepcopy(it[2]))

NO_OF_ITERATIONS = 15
NUM_OF_CLUSTERS = 2

kclusterer = KMeansClusterer(NUM_OF_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=NO_OF_ITERATIONS)
assigned_clusters = kclusterer.cluster(all_tensors, assign_clusters=True)


new_list = []
for x in range(NUM_OF_CLUSTERS):
  new_list.append([])

for index, word in enumerate(all_words):    
    new_list[assigned_clusters[index]].append(str(word))
	
for i,l in enumerate(new_list):
  print("Cluster {}:\n".format(i))
  print(l,"\n")

Cluster 0:

['Wiki__Photo_shoot__ioeqbbal', 'Wiki__Wikipedia__sofelef', 'Wiki__Marxism__iqofose', 'Wiki__Fascism__iiosf', 'Wiki__Ideology__lsfbq', 'Wiki__Fascism__iiosf', 'Wiki__Capitalism__sfib', 'Wiki__Fascism__iiosf', 'Wiki__Capitalism__sfib', 'Wiki__Fascism__iiosf', 'Wiki__Capitalism__sfib', 'Wiki__Capitalism__sfib', 'Wiki__Capitalism__sfib', 'Wiki__Capitalism__sfib', 'Wiki__Socialism__zbafl', 'Wiki__Democracy__lqsq', 'Wiki__Democracy__lqsq', 'Wiki__Democracy__lqsq', 'Wiki__Nationalism__zilfa', 'Wiki__Nationalism__zilfa', 'Wiki__Propaganda__zezoe', 'Wiki__Propaganda__zezoe', 'Wiki__Representative_democracy__szeia', 'Wiki__Direct_democracy__qsaib', 'Wiki__Sexual_minority__esazbsf', 'Wiki__Learning_disability__ioqlooaz', 'Wiki__Executive_system__laefls', 'Wiki__Silicon_Valley__zbqlb'] 

Cluster 1:

['Wiki__Idiot__isoof', 'Wiki__Facebook__lszqela', 'Wiki__Barack_Obama__sefebb', 'Wiki__Shit__eazzi', 'Wiki__Fuck__sslslzz', 'Wiki__Sarcasm__zqsaql', 'Wiki__Russia__zseqi', 'Wiki__Russia__z

# Cluster the words - 2
Implementing clustering using eucledian distance.


In [16]:
from sklearn.cluster import KMeans
from sklearn.neighbors import KDTree
from copy import deepcopy


NO_OF_ITERATIONS = 20
NUM_OF_CLUSTERS = 5

all_tensors = []
all_words = []
for it in selected_word_vectors:
  all_words.append(it[1])
  all_tensors.append(it[2].data.numpy())


def clustering_on_wordvecs(word_vectors, num_clusters):
    # Initalize a k-means object and use it to extract centroids
    kmeans_clustering = KMeans(n_clusters = num_clusters, init='k-means++', n_init=NO_OF_ITERATIONS);
    idx = kmeans_clustering.fit_predict(word_vectors);
    
    return kmeans_clustering.cluster_centers_, idx;

import time
start = time.time();
centers, clusters = clustering_on_wordvecs(all_tensors, NUM_OF_CLUSTERS);
print('Total time: ' + str((time.time() - start)) + ' secs')

start = time.time();
centroid_map = dict(zip(all_words, clusters));
print('Total time: ' + str((time.time() - start)) + ' secs')

rev_map = {}
for key in centroid_map:
  if centroid_map[key] not in rev_map:
    rev_map[centroid_map[key]] = [key]
  else:
    rev_map[centroid_map[key]].append(key)

final_csv_data = []
for i, word_vec in enumerate(selected_word_vectors):
  final_csv_data.append([word_vec[0], word_vec[1], clusters[i]])


for key in rev_map:
  print(rev_map[key])

Total time: 0.22172927856445312 secs
Total time: 0.0001373291015625 secs
['Wiki__Idiot__isoof', 'Wiki__Facebook__lszqela', 'Wiki__Barack_Obama__sefebb', 'Wiki__Russia__zseqi', 'Wiki__Wikipedia__sofelef', 'Wiki__Wiki__ezasi', 'Wiki__United_States__efeflso', 'Wiki__California__sfol', 'Wiki__Unemployment__eilfi', 'Wiki__Reddit__eazqoos', 'Wiki__Tuition__zfqezo', 'Wiki__Scholarship__iaoeel', 'Wiki__Tenure__eifqsi', 'Wiki__Undergraduate_education__zilzaq', 'Wiki__Interdisciplinarity__iszoi', 'Wiki__Social_sciences__zblai', 'Wiki__Working_time__ssziba', 'Wiki__Social_media__saqllfz']
['Wiki__Photo_shoot__ioeqbbal', 'Wiki__Scientific_method__zbaee', 'Wiki__Educational_toy__iboiblb', 'Wiki__Executive_system__laefls', 'Wiki__Silicon_Valley__zbqlb', 'Wiki__Real_point__aoaaqeq', 'Wiki__Single_parent__zzbioq', 'Wiki__Rights_issue__ibieeif', 'Wiki__Wage_slavery__selob', 'Wiki__Consumer_organization__qeofzo', 'Wiki__Charter_school__eeibqe', 'Wiki__Sales__issbqa', 'Wiki__Index_fund__ioboiz', 'Wiki__B

# Develop CSV


Exporting the result output to a csv.


In [0]:
# CSV 1 row : doc_name, word, cluster_id
import csv

with open(os.path.join(save_dir, output_file_name), "w") as f:
    writer = csv.writer(f)
    writer.writerows(final_csv_data)