<a href="https://colab.research.google.com/github/AlirezaPNouri/BERTEmbedding/blob/main/BertEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is an optimized version of BERT which uses the BERT embedding instead of Bert classification.

In [5]:
!pip install transformers



In [6]:
import requests
import numpy as np
import pandas as pd
import string

In [55]:
MAX_DATA = 100
MAX_SENTENCE_LENGTH = 500
MIN_SENTENCE_LENGTH = 200
doc_list =[item for item in range(MAX_DATA)]
jump = 3 # jump size for gradient decent
MAX_FEATURE_SIZE = 20 # max size of the gradient descent vector
range_n_clusters = [2, 3, 4, 5, 6] #number of different clustering to compare the performance of the model over different clustering

In [8]:
# download the dataset as a zip file from the git repo
print('Downloading dataset...')

# The URL for the dataset zip file.
url = 'https://raw.githubusercontent.com/AlirezaPNouri/BERTEmbedding/main/5KArticles.csv'
res = requests.get(url, allow_redirects=True)
with open('small_dataset_NYT.csv','wb') as file:
    file.write(res.content)
print('Download is done!')

Downloading dataset...
Download is done!


In [9]:
# Load the dataset into a pandas dataframe.
df = pd.read_csv("/content/small_dataset_NYT.csv", header= None, skiprows=1)
print('The original size of dataset is {}'.format(df.shape))
df = df.dropna(how='any', axis=0)

# Report the number of sentences.
print('Number of training sentences extracted from dataset is {:,}\n'.format(df.shape[0]))
# Display 10 random rows from the data.
# print(df.sample(10))
df = df[:][[1,2]]
df.columns = ['id', 'content']
df['text_length'] = df['content'].apply(lambda x : len(x.split(' ')))
df = df[df['text_length']>= MIN_SENTENCE_LENGTH]
df['content'] = df['content'].apply(lambda x : ' '.join(x.split(' ')[:MAX_SENTENCE_LENGTH]))
df = df[0:MAX_DATA][['content']]
print('dataset shape is {}'.format(df.shape))
print('Columns are : {}'.format([name for name in df.columns]))
print('The dimension of the dataset is {}'.format(df.shape))

The original size of dataset is (5001, 3)
Number of training sentences extracted from dataset is 5,001

dataset shape is (100, 1)
Columns are : ['content']
The dimension of the dataset is (100, 1)


In [None]:
#run Bert over documents to generate embedding for each documeny and tokens
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')
# input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
# outputs = model(input_ids)
# last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

# print(tokenizer.tokenize("Hello, my dog is cute"))
# print(last_hidden_states.shape) # the first and the last ones are the cls and sep
Embedding_dict = {}
for id, doc in zip(range(MAX_DATA),df['content']):
  Embedding_dict[id] = model(tf.constant(tokenizer.encode(doc))[None, :])[0]
  print('Document {} is done!'.format(id))
# print(Embedding_dict)

In [None]:
# create a dictionary out of the embedding for each document and all tokens in the document
# the structure of doc_word_embedding:  {doc0: {'cls': embedding_vec, 'ali': embedding_vec... 'sep': embedding_vec}, doc1: {}} --> {doc#: {word: embedding_vec}}

doc_word_embedding = {}
for doc in range(MAX_DATA):
  tmp_list = dict()
  temp_list = ()
  temp_list = np.array(Embedding_dict[doc][0])
  tmp_list['cls']= temp_list[0]
  for word, embedding in zip(tokenizer.tokenize(df['content'].iloc[doc]), temp_list[1:-1]):
    #filter some unuseful tokens
    if len(word)>2 and word not in string.punctuation:
      tmp_list[word]=embedding
  tmp_list['sep']= temp_list[-1]
  doc_word_embedding[doc] = tmp_list
  print('doc {} is done!'.format(doc))


In [18]:
#Function to find similarities
from sklearn.metrics.pairwise import cosine_similarity
def find_similar(embedding_dict):
  score_dic = dict()
  word_list = list(embedding_dict.keys())[1:-1] #ignore 'cls' and 'sep' tokens
  for word in word_list:
    temp_list = list()
    for neighbor in word_list:
      temp_list.append(cosine_similarity([ embedding_dict[word]], [embedding_dict[neighbor] ])[0][0])
    score_dic[word] = sorted(temp_list, reverse=True)[1:]# ignore the cosine similarity between each word and itself
  return score_dic

In [None]:
# create the neighbor similarity vector for each word in each document
# the structure of it is {doc0: {token1: similarity distribution, token2: similarity distribution}, doc1: ...}. be careful the size of each neighbor vector is one less than the number of tokens in each document
word_neighbor_similarity = dict()
for doc in doc_word_embedding.keys():
  word_neighbor_similarity[doc] = find_similar(doc_word_embedding[doc])
  print('doc {} is done'.format(doc))


In [45]:
print(word_neighbor_similarity[3]['despite'])

[0.64051676, 0.58452755, 0.56511635, 0.5548535, 0.54290926, 0.5135425, 0.51309526, 0.5109051, 0.5035542, 0.49692708, 0.49628136, 0.47123325, 0.4648232, 0.4627166, 0.45677203, 0.45496714, 0.4527735, 0.45257795, 0.44931555, 0.4294933, 0.42622155, 0.42570502, 0.42535168, 0.4253087, 0.42384106, 0.4233408, 0.42186263, 0.4157315, 0.414689, 0.41300115, 0.41061467, 0.40746665, 0.39894292, 0.39807484, 0.39604592, 0.39546376, 0.3857022, 0.38423222, 0.38123444, 0.38118744, 0.38025457, 0.3736773, 0.37283686, 0.37184757, 0.3718049, 0.37087893, 0.37052423, 0.36995998, 0.3692636, 0.36876047, 0.366902, 0.3646148, 0.36326277, 0.36170065, 0.36019766, 0.3591764, 0.35829365, 0.35526678, 0.3551512, 0.35510546, 0.35455137, 0.35448137, 0.35408294, 0.35392505, 0.35386336, 0.35359597, 0.35270667, 0.35210943, 0.34991646, 0.34986964, 0.34955424, 0.34895927, 0.34672856, 0.34666568, 0.34580514, 0.34531197, 0.34316522, 0.3418697, 0.34046656, 0.34012195, 0.34009957, 0.33893055, 0.3388222, 0.3372243, 0.33662128, 0.33

In [None]:
from transformers.utils.dummy_tf_objects import TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST
##Calculate the gradient descent of the similarity distribution
## MAX_FEATURE_SIZE will cut the rest of the features
gradient_descent_neighbors = dict()
for doc in word_neighbor_similarity.keys():
  temp_dict = dict()

  for word, embedding in word_neighbor_similarity[doc].items():
    vec_size = len(embedding)
    vec_size -= vec_size%jump
    embedding = embedding[0:vec_size]
    new_list = [embedding[x:x+jump] for x in range(0, vec_size, jump)]
    tmp_list = list()
    for ele in new_list:
      m1,b1 = np.polyfit(np.arange(0, jump), ele, 1)
      tmp_list.append(m1)
    temp_dict[word] = tmp_list
  gradient_descent_neighbors[doc] = temp_dict
  print('Document {} is done!'.format(doc))

  

In [42]:
from numpy import median, array, exp
#sigmoid function
def sigmoid(vec_):
  z = array(vec_)
  z = z/np.linalg.norm(z)
  g = 1 / (1 + exp(-z))
  return g

In [50]:
for doc in gradient_descent_neighbors.keys():
  for word, gradient_vec in gradient_descent_neighbors[doc].items():
    gradient_descent_neighbors[doc][word] =sigmoid(gradient_descent_neighbors[doc][word][0:MAX_FEATURE_SIZE])# just take top features and apply sigmoid function

In [56]:
from sklearn.cluster import KMeans
# clustering the tokens in each document. It creates differnt number of clustering
all_clusters = dict()
all_centroid = dict()
for doc, featurs in gradient_descent_neighbors.items():
  clusters_per_doc = dict()
  centroids_per_doc = dict()
  for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=10, max_iter=100,)
    clusters_per_doc[n_clusters] = clusterer.fit_predict(list(featurs.values()))
    centroids_per_doc[n_clusters] = clusterer.cluster_centers_
  all_clusters[doc] = clusters_per_doc
  all_centroid[doc] = centroids_per_doc
  print('Document {} is done!'.format(doc))
     

In [78]:
#create a total clusters sorted dictionary
## the structure of total_docs_clusters  is {doc0: {number_of_clusters { cluster_number: {token0: closeness_to_centroid, token1: closeness_to_centroid ...} }}}
total_docs_clusters = dict()
for doc in all_clusters.keys():
  temp_cluster = all_clusters[doc]
  dic_for_clustering = dict()
  for n_cluster in range_n_clusters:
    temp_dict = dict()
    for la in range(n_cluster):
      temp_list = dict()
      for label_, value_ in zip(temp_cluster[n_cluster],gradient_descent_neighbors[doc].items()):
        if label_ == la:
          temp_list[value_[0]] = np.sum(np.square(value_[1] - all_centroid[doc][n_cluster][la]))
      temp_dict[la]=temp_list
    dic_for_clustering[n_cluster] = temp_dict
  total_docs_clusters[doc] = dic_for_clustering
  
    
      

In [77]:
# import pickle

# pickle.dump(df, open("df.p","wb"))
# pickle.dump(Embedding_dict, open("Embedding_dict.p", "wb"))
# pickle.dump(doc_word_embedding, open("doc_word_embedding.p","wb"))
# pickle.dump(word_neighbor_similarity, open("word_neighbor_similarity.p","wb"))
# pickle.dump(gradient_descent_neighbors, open("gradient_descent_neighbors.p","wb"))
# pickle.dump(total_docs_clusters, open("total_docs_clusters.p","wb"))
# pickle.dump(all_clusters, open("all_clusters.p","wb"))
# pickle.dump(all_centroid, open("all_centroid.p","wb"))

# df = pickle.load(open("df.p", "rb"))
# Embedding_dict = pickle.load(open("Embedding_dict.p", "rb"))
# doc_word_embedding = pickle.load(open("doc_word_embedding.p", "rb"))
# word_neighbor_similarity = pickle.load(open("word_neighbor_similarity.p", "rb"))
# total_docs_clusters = pickle.load(open("total_docs_clusters.p", "rb"))
# all_clusters = pickle.load(open("all_clusters.p", "rb"))
# all_centroid = pickle.load(open("all_centroid.p", "rb"))

In [76]:
print(total_docs_clusters[0])

{2: {0: {'hurrying': 0.014334454281205427, 'through': 0.024271209334057547, 'the': 0.016466063089275618, 'veterans': 0.01057808828276694, 'guard': 0.010406854711459945, 'new': 0.014575321405436081, 'yelled': 0.00922522563525655, 'warm': 0.00998651367828293, 'huh': 0.014299272795611271, 'grin': 0.01196944871267222, 'room': 0.01257380783044098, 'wild': 0.005893532163674162, 'philadelphia': 0.02356714565081841, 'eagles': 0.008064060730212059, 'cold': 0.008152779266458863, 'qualified': 0.008441635526562834, 'but': 0.023569344884982423, 'until': 0.010136276509536921, 'time': 0.00875520217791407, 'teams': 0.01526858105802071, 'minnesota': 0.016405918346635427, 'metro': 0.011629377955285014, '##dome': 0.024294921778461667, 'washington': 0.0074125138276196325, 'and': 0.034586827581737624, 'green': 0.016517300514080895, 'being': 0.010200571798814395, 'fired': 0.01799521918403016, 'going': 0.005585028947087353, 'right': 0.02918924862791166, 'john': 0.015623067128077624, 'this': 0.007162647127223