# Embedding, Clustering and Classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from transformers import BertTokenizer, BertModel, BertForTokenClassification


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import logging
logging.basicConfig(level=logging.INFO)

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading: 100%|██████████| 226k/226k [00:01<00:00, 220kB/s]  
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 9.23kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 200kB/s]


In [4]:
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

model.eval()

Downloading: 100%|██████████| 420M/420M [04:18<00:00, 1.70MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [6]:
#Loading the data including merged and cleaned data
uva = pd.read_csv("data/uva_keywords_2022-06-04-16-57-48_266864.csv")
uva.head()

Unnamed: 0,University,Abbreviation,Department,Course title,Unit,Professor,Objective,Prerequisite,Required Skills,Outcome,...,merged_clean,keywords_1_1,keywords_1_2,keywords_1_3,keywords_2_1,keywords_2_2,keywords_2_3,keywords_3_1,keywords_3_2,keywords_3_3
0,Universiteit van Amsterdam,UvA,AUC,A Golden Age? History and Heritage of the Dutc...,6 EC,,The students will learn about historical devel...,,,,...,called golden age corresponds roughly 17th cen...,"[('colonialism', 0.3153), ('literature', 0.304...","[('dutch history', 0.4774), ('17th century', 0...","[('17th century important', 0.537), ('netherla...",[],"[('dutch history', 0.4774), ('17th century', 0...","[('17th century important', 0.537), ('netherla...",[],[],"[('17th century important', 0.537), ('netherla..."
1,Universiteit van Amsterdam,UvA,AUC,Academic Writing Skills,6 EC,,Students will:\n\n• demonstrate digital litera...,,,,...,academic writing skill introduction academic s...,"[('literacy', 0.4732), ('academic', 0.4421), (...","[('literacy academic', 0.5888), ('skill academ...","[('skill academic reading', 0.6083), ('literac...",[],"[('literacy academic', 0.5888), ('skill academ...","[('skill academic reading', 0.6083), ('literac...",[],[],"[('skill academic reading', 0.6083), ('academi..."
2,Universiteit van Amsterdam,UvA,AUC,Adaptation Studies,6 EC,,Students will be able to understand and analyz...,All of the following:\n60 EC\nIntroduction to ...,,,...,although originality work literature art often...,"[('literature', 0.3591), ('cultural', 0.3162),...","[('study adaptation', 0.5093), ('adaptation st...","[('consider adaptation cultural', 0.5469), ('s...",[],"[('study adaptation', 0.5093), ('adaptation st...","[('consider adaptation cultural', 0.5469), ('s...",[],[],"[('consider adaptation cultural', 0.5469), ('s..."
3,Universiteit van Amsterdam,UvA,AUC,Addiction,6 EC,,• The student understands basic pharmacologica...,One of the following:\nBrain and Cognition\nMe...,,,...,goal course gain insight etiology neurobiology...,"[('addiction', 0.4446), ('addict', 0.4035), ('...","[('addiction student', 0.5854), ('drug addicti...","[('study drug addiction', 0.6908), ('addiction...",[],"[('addiction student', 0.5854), ('drug addicti...","[('study drug addiction', 0.6908), ('addiction...",[],[],"[('study drug addiction', 0.6908), ('addiction..."
4,Universiteit van Amsterdam,UvA,AUC,Advanced Creative Writing,6 EC,,1) The student builds off her/his prior knowle...,Creative Writing\nRegistration\nThis course is...,,,...,course continues diverse exploration creative ...,"[('writing', 0.4873), ('write', 0.4306), ('stu...","[('writing student', 0.6252), ('practice writi...","[('creative writing student', 0.7125), ('writi...",[],"[('writing student', 0.6252), ('practice writi...","[('creative writing student', 0.7125), ('writi...",[],[],"[('creative writing student', 0.7125), ('writi..."


In [9]:
#Discarding unnecessary columns
#necessary columns are:
# University, Abbreviation, Department, Course title, Unit, merged_clean
uva = uva[['University', 'Abbreviation', 'Department', 'Course title', 'Unit', 'merged_clean']]
uva.head()

Unnamed: 0,University,Abbreviation,Department,Course title,Unit,merged_clean
0,Universiteit van Amsterdam,UvA,AUC,A Golden Age? History and Heritage of the Dutc...,6 EC,called golden age corresponds roughly 17th cen...
1,Universiteit van Amsterdam,UvA,AUC,Academic Writing Skills,6 EC,academic writing skill introduction academic s...
2,Universiteit van Amsterdam,UvA,AUC,Adaptation Studies,6 EC,although originality work literature art often...
3,Universiteit van Amsterdam,UvA,AUC,Addiction,6 EC,goal course gain insight etiology neurobiology...
4,Universiteit van Amsterdam,UvA,AUC,Advanced Creative Writing,6 EC,course continues diverse exploration creative ...


In [24]:
def add_special_BERT_tokens(cleaned_text):
    """
    Add special tokens to the cleaned text
    """        
    if cleaned_text == "" or cleaned_text == " " or cleaned_text == None or (not isinstance(cleaned_text, str)):
        return "[NOTHING]"
    
    cleaned_text = "[CLS] " + str(cleaned_text) + " [SEP]"
    
    return cleaned_text


In [27]:
#add necessary tokens for BERT to merged_clean column and save it to a new column
uva['merged_clean_bert'] = uva['merged_clean'].apply(lambda x: add_special_BERT_tokens(x))

#remove every row with [NOTHING] as merged_clean_bert
uva = uva[uva['merged_clean_bert'] != "[NOTHING]"]

uva.head()

Unnamed: 0,University,Abbreviation,Department,Course title,Unit,merged_clean,merged_clean_bert
0,Universiteit van Amsterdam,UvA,AUC,A Golden Age? History and Heritage of the Dutc...,6 EC,called golden age corresponds roughly 17th cen...,[CLS] called golden age corresponds roughly 17...
1,Universiteit van Amsterdam,UvA,AUC,Academic Writing Skills,6 EC,academic writing skill introduction academic s...,[CLS] academic writing skill introduction acad...
2,Universiteit van Amsterdam,UvA,AUC,Adaptation Studies,6 EC,although originality work literature art often...,[CLS] although originality work literature art...
3,Universiteit van Amsterdam,UvA,AUC,Addiction,6 EC,goal course gain insight etiology neurobiology...,[CLS] goal course gain insight etiology neurob...
4,Universiteit van Amsterdam,UvA,AUC,Advanced Creative Writing,6 EC,course continues diverse exploration creative ...,[CLS] course continues diverse exploration cre...


In [29]:
#save uva to a csv file for convinent loading
uva.to_csv("data/uva_bert_tokens_added.csv", index=False)

#load uva from csv file
uva = pd.read_csv("data/uva_bert_tokens_added.csv")
uva.head()

Unnamed: 0,University,Abbreviation,Department,Course title,Unit,merged_clean,merged_clean_bert
0,Universiteit van Amsterdam,UvA,AUC,A Golden Age? History and Heritage of the Dutc...,6 EC,called golden age corresponds roughly 17th cen...,[CLS] called golden age corresponds roughly 17...
1,Universiteit van Amsterdam,UvA,AUC,Academic Writing Skills,6 EC,academic writing skill introduction academic s...,[CLS] academic writing skill introduction acad...
2,Universiteit van Amsterdam,UvA,AUC,Adaptation Studies,6 EC,although originality work literature art often...,[CLS] although originality work literature art...
3,Universiteit van Amsterdam,UvA,AUC,Addiction,6 EC,goal course gain insight etiology neurobiology...,[CLS] goal course gain insight etiology neurob...
4,Universiteit van Amsterdam,UvA,AUC,Advanced Creative Writing,6 EC,course continues diverse exploration creative ...,[CLS] course continues diverse exploration cre...


In [30]:
def tokenize_and_add_segment_and_token_ID(bert_text):
    """
    Tokenize the text and add segment ID
    """
    global tokenizer
    tokenized_text = tokenizer.tokenize(bert_text)
    tokenized_text_with_id = tokenizer.convert_tokens_to_ids(tokenized_text)
    segment_ids = [1] * len(tokenized_text)
    return tokenized_text, tokenized_text_with_id, segment_ids

In [31]:
#add tokenized text and tokenized text with ID and segment ID to uva
uva['tokenized_text'], uva['tokenized_text_with_id'], uva['segment_ids'] = zip(*uva['merged_clean_bert'].apply(lambda x: tokenize_and_add_segment_and_token_ID(x)))

uva.head()

Unnamed: 0,University,Abbreviation,Department,Course title,Unit,merged_clean,merged_clean_bert,tokenized_text,tokenized_text_with_id,segment_ids
0,Universiteit van Amsterdam,UvA,AUC,A Golden Age? History and Heritage of the Dutc...,6 EC,called golden age corresponds roughly 17th cen...,[CLS] called golden age corresponds roughly 17...,"[[CLS], called, golden, age, corresponds, roug...","[101, 2170, 3585, 2287, 14788, 5560, 5550, 230...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,Universiteit van Amsterdam,UvA,AUC,Academic Writing Skills,6 EC,academic writing skill introduction academic s...,[CLS] academic writing skill introduction acad...,"[[CLS], academic, writing, skill, introduction...","[101, 3834, 3015, 8066, 4955, 3834, 2817, 3192...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,Universiteit van Amsterdam,UvA,AUC,Adaptation Studies,6 EC,although originality work literature art often...,[CLS] although originality work literature art...,"[[CLS], although, original, ##ity, work, liter...","[101, 2348, 2434, 3012, 2147, 3906, 2396, 2411...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,Universiteit van Amsterdam,UvA,AUC,Addiction,6 EC,goal course gain insight etiology neurobiology...,[CLS] goal course gain insight etiology neurob...,"[[CLS], goal, course, gain, insight, et, ##iol...","[101, 3125, 2607, 5114, 12369, 3802, 20569, 11...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,Universiteit van Amsterdam,UvA,AUC,Advanced Creative Writing,6 EC,course continues diverse exploration creative ...,[CLS] course continues diverse exploration cre...,"[[CLS], course, continues, diverse, exploratio...","[101, 2607, 4247, 7578, 8993, 5541, 3015, 9009...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [42]:
#generate word embedding for each merged_clean_bert
def generate_model_output(indexed_token, segmentid):
    """
    Generate model output for given tokens
    """
    global model
    with torch.no_grad():
        outputs = model(torch.tensor([indexed_token]), torch.tensor([segmentid]))
        return outputs

def print_info_about_hidden_states(outputs, layer_index=0, batch_index=0, token_index=0):
    """
    Print information about hidden states
    """
    hidden_states = outputs[2]
    print("Hidden states:")
    #number of layers
    print("Number of layers:", len(hidden_states))

    #number of batches
    print("Number of batches:", len(hidden_states[layer_index]))

    #number of tokens
    print("Number of tokens:", len(hidden_states[layer_index][batch_index]))

    #number of hidden units
    print("Number of hidden units:", len(hidden_states[layer_index][batch_index][token_index]))

In [43]:
#test generate_word_embedding and print_info_about_hidden_states on a single row
outputs = generate_model_output(uva.iloc[0]['tokenized_text_with_id'], uva.iloc[0]['segment_ids'])
print_info_about_hidden_states(outputs)

Hidden states:
Number of layers: 13
Number of batches: 1
Number of tokens: 153
Number of hidden units: 768


In [46]:
#since we are feeding the model everything as a single sentece we can discard the batches
#also it would be nicer to reshape the hidden_states in a way that they are grouped by tokens instead of layers

def remove_batches_and_reshape_to_tokens(hidden_states):
    """
    Remove batches and reshape to tokens
    """
    token_embeddings = torch.stack(hidden_states, dim=0)
    #removing dimention 1 (batches)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    #grouping by tokens using permute
    token_embeddings = token_embeddings.permute(1, 0, 2) #(tokens, layers, hidden_units(features))

    return token_embeddings

In [67]:
#we have to strategies ahead of us
#1. we need to combine these token vectors in a way that they represent a word
#2. we need to combine these hidden states (features) in a way that they represent a sentence

#ways are either sum or cat
def combine_token_vectors_to_words(token_embeddings, how_many_from_last=4, how_to_combine='sum'):
    """
    Combine token vectors to words
    """
    token_vecs_combined = []
    if how_to_combine == 'sum':
        #summing the vectors
        for token in token_embeddings:
            summed = torch.sum(token_embeddings[:-how_many_from_last], dim=0)
            token_vecs_combined.append(summed)
    elif how_to_combine == 'cat':
        #concatenating the vectors
        for token in token_embeddings:
            #create a tuple form n last elements of the token
            last_elements = tuple(token[-how_many_from_last:])
            concatenated = torch.cat(last_elements, dim=0)
            token_vecs_combined.append(concatenated)
    return token_vecs_combined

def combine_token_vectors_to_sentence(hidden_states, start=2):
    """
    Combine token vectors to sentence using mean
    """
    hs = hidden_states[-start:][0]
    sentence_embedding = torch.mean(hs, dim=0)
    return sentence_embedding

In [73]:
def sentence_embedding_pipeline(indexed_token, segment_id):
    sentence_embedding = None
    try:
        outputs = generate_model_output(indexed_token, segment_id)
        hidden_states = remove_batches_and_reshape_to_tokens(outputs[2])
        sentence_embedding = combine_token_vectors_to_sentence(hidden_states)
    except:
        return None
    return sentence_embedding.numpy()

In [65]:
#test sentence_embedding_pipeline on a single row
sentence_embedding = sentence_embedding_pipeline(uva.iloc[0]['tokenized_text_with_id'], uva.iloc[0]['segment_ids'])
#convert tensor to array and print
sentence_embedding.size()

Shape of hs: torch.Size([13, 768])


torch.Size([768])

In [74]:
#generate sentence embedding for each row
uva['sentence_embedding'] = uva.apply(lambda x: sentence_embedding_pipeline(x['tokenized_text_with_id'], x['segment_ids']), axis=1)

uva.head()

Unnamed: 0,University,Abbreviation,Department,Course title,Unit,merged_clean,merged_clean_bert,tokenized_text,tokenized_text_with_id,segment_ids,sentence_embedding
0,Universiteit van Amsterdam,UvA,AUC,A Golden Age? History and Heritage of the Dutc...,6 EC,called golden age corresponds roughly 17th cen...,[CLS] called golden age corresponds roughly 17...,"[[CLS], called, golden, age, corresponds, roug...","[101, 2170, 3585, 2287, 14788, 5560, 5550, 230...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0.73521274, 0.004391347, -0.2594675, 0.293618..."
1,Universiteit van Amsterdam,UvA,AUC,Academic Writing Skills,6 EC,academic writing skill introduction academic s...,[CLS] academic writing skill introduction acad...,"[[CLS], academic, writing, skill, introduction...","[101, 3834, 3015, 8066, 4955, 3834, 2817, 3192...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1.2182603, 0.35577005, 0.58682275, -0.8109705..."
2,Universiteit van Amsterdam,UvA,AUC,Adaptation Studies,6 EC,although originality work literature art often...,[CLS] although originality work literature art...,"[[CLS], although, original, ##ity, work, liter...","[101, 2348, 2434, 3012, 2147, 3906, 2396, 2411...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0.61570364, 0.83801043, -0.2723579, -0.211583..."
3,Universiteit van Amsterdam,UvA,AUC,Addiction,6 EC,goal course gain insight etiology neurobiology...,[CLS] goal course gain insight etiology neurob...,"[[CLS], goal, course, gain, insight, et, ##iol...","[101, 3125, 2607, 5114, 12369, 3802, 20569, 11...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0.36432588, 0.49236968, 0.2854147, -1.2058219..."
4,Universiteit van Amsterdam,UvA,AUC,Advanced Creative Writing,6 EC,course continues diverse exploration creative ...,[CLS] course continues diverse exploration cre...,"[[CLS], course, continues, diverse, exploratio...","[101, 2607, 4247, 7578, 8993, 5541, 3015, 9009...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0.61830354, 0.15859443, 0.35188314, -1.193507..."


In [79]:
#remove rows with None sentence embedding
uva = uva[uva['sentence_embedding'].notnull()]

In [80]:
uva.to_csv("data/uva_sentence_vectors.csv", index=False)


In [81]:
#importing clustering libraries
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, adjusted_rand_score

In [93]:
def init_kmeans_clustering(x_sentence_vectors, n_clusters):
    """
    Init kmeans clustering
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    #reshape x_sentence_vectors to appropriate shape for kmeans
    x_sentence_vectors = x_sentence_vectors.reshape(-1, 1)
    kmeans.fit(x_sentence_vectors)
    return kmeans

#find majority department for each cluster and assign that department as its label
def assign_labels_to_clusters(kmeans, departments):
    """
    Assign labels to clusters
    """
    labels = kmeans.labels_
    #create a dictionary with key=cluster_id and value=department
    cluster_department = {}
    for i in range(len(labels)):
        cluster_id = labels[i]
        department = departments[i]
        if cluster_id in cluster_department:
            cluster_department[cluster_id].append(department)
        else:
            cluster_department[cluster_id] = [department]
    #assign the most frequent department to each entry in cluster_department
    for cluster_id in cluster_department:
        cluster_department[cluster_id] = max(set(cluster_department[cluster_id]), key=cluster_department[cluster_id].count)
    return cluster_department

In [95]:
#test clustering on 3000 rows
number_of_unique_departments = len(uva['Department'].unique())
sentence_embeddings = uva['sentence_embedding'].values
kmeans = init_kmeans_clustering(sentence_embedding[:3000], n_clusters=number_of_unique_departments)
cluster_department = assign_labels_to_clusters(kmeans, uva['Department'].values[:3000])
print(cluster_department)

{20: 'College of Economics and Business', 16: 'AUC', 10: 'AUC', 17: 'AUC', 8: 'AUC', 21: 'AUC', 6: 'AUC', 7: 'AUC', 9: 'College of Humanities', 0: 'AUC', 23: 'AUC', 2: 'AUC', 22: 'AUC', 13: 'College of Humanities', 4: 'AUC', 5: 'AUC', 14: 'College of Humanities', 11: 'College of Humanities', 24: 'AUC', 18: 'AUC', 1: 'College of Humanities', 15: 'Amsterdam Graduate Law School', 19: 'College of Communication', 12: 'Amsterdam Graduate Law School', 3: 'Amsterdam Graduate Law School'}
