In [1]:
# standard library
import logging
import os

# masking
import transformers
from transformers import pipeline
import pandas as pd

# embedding
import torch
from transformers import BertTokenizer, BertModel
logging.basicConfig(level=logging.INFO)
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import cosine
import seaborn as sns
%matplotlib inline

In [2]:
# initialize MLM pipeline
# NOTE: the factory is reached through the `transformers` module instead of the
# bare name `pipeline`, because a later cell defines its own function called
# `pipeline`; once that cell has run, a bare `pipeline('fill-mask', ...)` here
# would invoke that function instead of the Hugging Face factory.
mlm = transformers.pipeline('fill-mask', model='bert-base-cased', top_k=15)

# get mask token
mask = mlm.tokenizer.mask_token

# get result for particular masked phrase
def predict(phrase):
    """Fill the ``{}`` slot of `phrase` with the mask token and query the MLM.

    Parameters
    ----------
    phrase : str
        Format string; ``str.format`` substitutes the mask token into it.

    Returns
    -------
    pandas.DataFrame
        The fill-mask pipeline's output as a frame, with the ``token``
        column removed.
    """
    masked_phrase = phrase.format(mask)
    candidates = pd.DataFrame(mlm(masked_phrase))

    # the numeric token id is not needed downstream
    return candidates.drop(columns='token')

# quick check: display the candidates predicted for a simple masked sentence
predict("Hi, my name is {}.")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,sequence,score,token_str
0,"Hi, my name is Amy.",0.008801,Amy
1,"Hi, my name is Kate.",0.007974,Kate
2,"Hi, my name is Emily.",0.007432,Emily
3,"Hi, my name is Sam.",0.007135,Sam
4,"Hi, my name is Sarah.",0.006704,Sarah
5,"Hi, my name is Claire.",0.006109,Claire
6,"Hi, my name is Julie.",0.006059,Julie
7,"Hi, my name is Alice.",0.005701,Alice
8,"Hi, my name is Mia.",0.005455,Mia
9,"Hi, my name is Emma.",0.004981,Emma


In [3]:
# embed word using sentence context
# much of the code in this section comes from Arushi Prakash; credits in bib doc

# tokenizer belonging to the cased BERT checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# pre-trained encoder, asked to return the hidden states of every layer and
# switched to evaluation mode straight away (eval() returns the module itself)
model = BertModel.from_pretrained(
    'bert-base-cased',
    output_hidden_states=True,
).eval()

def prepare_sent(sent):
    """Tokenize `sent` and build the tensors handed to the BERT model.

    The sentence is wrapped in ``[CLS]`` / ``[SEP]`` markers before it is
    tokenized.

    Returns
    -------
    tuple
        ``(tokens, ids, segments)``: the list of token strings, a tensor
        holding one row of their vocabulary ids, and a tensor of the same
        shape filled with ones.
    """
    # surround the raw sentence with the special markers
    wrapped = " ".join(["[CLS]", sent, "[SEP]"])

    # split into tokens, then look up each token's vocabulary index
    tokens = tokenizer.tokenize(wrapped)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # every token is tagged as belonging to sentence "1"
    segment_ids = [1 for _ in tokens]

    # the extra list level gives each tensor a leading dimension of size one
    return tokens, torch.tensor([token_ids]), torch.tensor([segment_ids])

def get_bert_embeddings(tokens_tensor, segments_tensors):
    """Return the final-layer BERT embedding of every token of one sentence.

    Parameters
    ----------
    tokens_tensor : torch.Tensor
        Vocabulary ids as built by `prepare_sent`.
    segments_tensors : torch.Tensor
        All-ones tensor of the same shape as built by `prepare_sent`.

    Returns
    -------
    list
        One embedding (a list of floats) per token, in token order.
    """
    # gradient calculation is disabled; the model is only used for inference
    with torch.no_grad():
        # NOTE: the second parameter of transformers' BertModel.forward is
        # `attention_mask`, so this all-ones tensor acts as "attend to every
        # token"; it is NOT consumed as token_type_ids (those keep their
        # default). Passing it by keyword makes that binding explicit without
        # changing the computed embeddings.
        outputs = model(tokens_tensor, attention_mask=segments_tensors)

        # index 2 of the output holds the per-layer hidden states (requested
        # with output_hidden_states=True when the model was loaded); the last
        # entry is the final BERT layer
        token_embeddings = outputs[2][-1]

    # drop the leading batch dimension, leaving one row per token
    token_embeddings = torch.squeeze(token_embeddings, dim=0)

    # convert the tensor to nested Python lists
    return token_embeddings.tolist()

def embed_target(sent, word):
    """Return the contextual embedding of `word` as it occurs in `sent`.

    Parameters
    ----------
    sent : str
        Sentence providing the context.
    word : str
        Token to look up; it must equal one of the tokens produced by
        `prepare_sent` exactly.

    Returns
    -------
    list of float or int
        The embedding of the first token equal to `word`. If no token
        matches, ``FLAG`` is printed and the integer ``1`` is returned as a
        failure sentinel. The sentinel is NOT a vector, so callers must
        check for it before doing vector arithmetic.
    """
    tokenized_text, tokens_tensor, segments_tensors = prepare_sent(sent)

    # Locate the word before running the model, so that the failure path does
    # not pay for a forward pass whose result would be discarded.
    try:
        word_index = tokenized_text.index(word)   # first occurrence only
    except ValueError:
        print('FLAG')
        return 1

    list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors)

    # get the embedding for word
    return list_token_embeddings[word_index]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# arrange the embedding around 2 new dimensions
# much of the code in this section comes from Ted Underwood; credits in bib doc

def get_vectors(sent_word_pairs):
    """Condense a set of example words into a single unit-length vector.

    Each target word is embedded in its sentence with `embed_target`, scaled
    to unit length, and the scaled vectors are summed; the sum is scaled to
    unit length again.

    Parameters
    ----------
    sent_word_pairs : dict
        Maps each sentence to the target word to embed in it.

    Returns
    -------
    numpy.ndarray
        The normalised sum of the normalised embeddings.

    Raises
    ------
    ValueError
        If `sent_word_pairs` is empty, or if `embed_target` did not return a
        one-dimensional vector for one of the pairs (it signals a word it
        could not find with a scalar).
    """
    if not sent_word_pairs:
        raise ValueError('get_vectors needs at least one sentence/word pair')

    vectors = []

    for sent, targ in sent_word_pairs.items():          # for each word in a wordlist
        vec = np.asarray(embed_target(sent, targ), dtype=float)

        # fail with a message that names the offending pair instead of letting
        # the norm computation choke on a scalar
        if vec.ndim != 1:
            raise ValueError(
                'could not embed {!r} in sentence {!r}'.format(targ, sent))

        vectorlength = np.linalg.norm(vec, ord=2)       # normalize length
        vectors.append(vec / vectorlength)              # and save it in a list

    thesum = np.sum(vectors, axis=0)                    # then add all the vectors
    vectorlength = np.linalg.norm(thesum, ord=2)        # normalize length again

    return thesum / vectorlength

def make_direction(positive_examples, negative_examples):
    """Return the difference between two example-set vectors.

    Both arguments map sentences to target words. Each set is condensed into
    one vector by `get_vectors`, and the negative set's vector is subtracted
    from the positive set's.
    """
    positive_pole = get_vectors(positive_examples)
    negative_pole = get_vectors(negative_examples)

    return positive_pole - negative_pole

def plot_the_frame(plotframe):
    """Scatter the first two columns of `plotframe` and label every point.

    The first column is drawn on the x axis and the second on the y axis;
    the axes are labelled with the column names and each point is annotated
    with its index label. The figure is shown immediately.
    """
    plt.figure(figsize=(10, 8))

    xs = plotframe.iloc[:, 0]
    ys = plotframe.iloc[:, 1]
    axes = sns.scatterplot(x=xs, y=ys)

    column_names = plotframe.columns.tolist()
    axes.set(xlabel=column_names[0], ylabel=column_names[1])

    # put each label slightly up and to the right of its point
    for label, x, y in zip(plotframe.index, xs, ys):
        axes.text(
            x + 0.007,
            y + .007,
            label,
            horizontalalignment='left',
            size='medium',
            color='black',
            weight='semibold',
        )

    plt.show()

def organizeby2dims(sent_word_pairs, ref_vector1, ref_vector2, col1name, col2name):
    """Position each target word relative to two reference vectors and plot it.

    Parameters
    ----------
    sent_word_pairs : dict
        Maps each sentence to the target word to embed in it.
    ref_vector1, ref_vector2 : array-like
        Reference vectors for the first and second coordinate.
    col1name, col2name : str
        Column names for the two coordinates.

    Returns
    -------
    pandas.DataFrame
        Indexed by target word, with one column per reference vector holding
        the SciPy cosine *distance* (1 - cosine similarity) between the
        word's embedding and that vector. Words that could not be embedded
        get NaN in both columns.
    """
    words = list(sent_word_pairs.values())
    dict4df = {col1name: [], col2name: []}    # one list of coordinates per column

    for s, w in sent_word_pairs.items():      # for each word
        vec = np.asarray(embed_target(s, w), dtype=float)

        if vec.ndim == 1:
            cos1 = cosine(vec, ref_vector1)   # get its x position
            cos2 = cosine(vec, ref_vector2)   # and y position
        else:
            # embed_target reports a word it could not find with a scalar,
            # which is not a valid vector for cosine(); record the failure as
            # missing data instead of a meaningless coordinate
            cos1 = cos2 = np.nan

        dict4df[col1name].append(cos1)
        dict4df[col2name].append(cos2)

    plotframe = pd.DataFrame(dict4df, index=words)  # make this a data frame,
                                                    # with words as index labels

    # only rows with real coordinates can be drawn
    plot_the_frame(plotframe.dropna())

    return plotframe

# Example sets: each key is a context sentence and each value is the target
# word whose contextual embedding is taken from that sentence (the value must
# match a token of the tokenized sentence exactly).
animate = {'I feel completely alive.' : 'alive',
           'He is my friend.' : 'friend',
           'A man must do his best.' : 'man',
           'If anyone can do it, he can.' : 'he',
           'If anyone can do it, she can' : 'she',
           'I am a human being' : 'human',
           'As he is a person, he is granted rights.' : 'person',
           'Humans are intelligent creatures' : 'intelligent',
           'Nothing is more important in life than family.' : 'family'}

inanimate = {'I cannot use it.' : 'it',
             'When an animal is dead, it no longer feels any pain.' : 'dead',
             'I turned the rock over to see what was inside' : 'rock',
             'Get that thing away from me.' : 'thing',
             'Be careful when you lift that heavy object.' : 'object',
             'The trees are planted on irrigated dirt and the fruit gathered between November and August.' : 'dirt',
             'These laws are intended to help preserve our natural resources.' : 'resources'}

# NOTE: the inanimate set is passed as the *positive* examples, so this vector
# is (inanimate - animate). organizeby2dims scores words by cosine distance to
# it; presumably a larger distance is meant to read as "more animate" - TODO confirm.
animacy = make_direction(inanimate, animate)

# single-example sets for the second axis
human = {'there is nothing more powerful than a human' : 'human'}
nonhuman = {'i turned a stone over to see what was underneath it.' : 'stone'}

# same argument order as above: this vector is (nonhuman - human)
humanness = make_direction(nonhuman, human)

def zip_and_embed(sent_list, word_list):
    """Pair sentences with target words and return their animacy coordinates.

    The two lists are matched element by element into a dict keyed by
    sentence, so a sentence occurring more than once keeps only its last
    word. The pairs are handed to `organizeby2dims` with the module-level
    `animacy` and `humanness` vectors, and the resulting ``'animacy'`` column
    is returned as a plain list.
    """
    pairs = {sent: word for sent, word in zip(sent_list, word_list)}
    coordinates = organizeby2dims(pairs, animacy, humanness, 'animacy', 'humanness')

    return coordinates['animacy'].to_list()

In [5]:
# run the pipeline

def pipeline(directory, result_file,
             results_dir='/home/dustin/Python/Animacies/results/'):
    """Predict, embed and score every masked sentence of a TSV file.

    For each row the masked sentence is completed with `predict`, the
    candidates are scored with `zip_and_embed`, and the scores are averaged
    with the prediction probabilities as weights. The frame, sorted by that
    average, is written to ``<results_dir>/<result_file>.csv``.

    NOTE: this definition rebinds the module-level name `pipeline` that the
    first cell imports from transformers; after this cell has run, a bare
    ``pipeline('fill-mask', ...)`` call resolves to this function.

    Parameters
    ----------
    directory : str
        Path of a tab-separated file whose first column is the index and
        which has a ``MaskedSentence`` column.
    result_file : str
        Output file name without the ``.csv`` extension.
    results_dir : str, optional
        Directory the CSV is written to. Defaults to the location that was
        previously hard-coded.

    Returns
    -------
    None
    """
    # construct master frame
    frame = pd.read_csv(directory, sep='\t', index_col=[0])

    # extract masked sentences for prediction
    sentences = frame['MaskedSentence'].to_list()
    length = len(sentences)

    # Per-row results are collected in plain lists and attached to the frame
    # once the loop is done. Rows are matched by position, so the index read
    # from the file does not have to be 0..n-1.
    predictions = []
    full_predictions = []
    probabilities = []
    animacies = []
    avg_animacies = []

    for x, sentence in enumerate(sentences):

        # make predictions
        sentence_predict_df = predict(sentence)
        prediction = sentence_predict_df['token_str'].to_list()
        full_prediction = sentence_predict_df['sequence'].to_list()
        probability = sentence_predict_df['score'].to_list()

        # run embedding
        animacy_embeddings = zip_and_embed(full_prediction, prediction)

        # weight animacy with probability; candidates whose score is not
        # finite (NaN) are left out and the remaining weights renormalised
        scores = np.asarray(animacy_embeddings, dtype=float)
        weights = np.asarray(probability, dtype=float)
        if scores.shape == weights.shape:
            usable = np.isfinite(scores)
            scores, weights = scores[usable], weights[usable]
        if scores.size:
            avg_animacy = np.average(scores, weights=weights)
        else:
            avg_animacy = np.nan

        predictions.append(prediction)
        full_predictions.append(full_prediction)
        probabilities.append(probability)
        animacies.append(animacy_embeddings)
        avg_animacies.append(avg_animacy)

        percentage = round(x / length * 100, 2)
        print(percentage, '%', end='\r')

    print('100%')

    # list-valued columns need object dtype; building each Series on the
    # frame's own index attaches the values row by row, in file order
    list_columns = (('prediction', predictions),
                    ('full_prediction', full_predictions),
                    ('probability', probabilities),
                    ('animacy', animacies))
    for column, values in list_columns:
        frame[column] = pd.Series(values, index=frame.index, dtype=object)
    frame['avg_animacy'] = pd.Series(avg_animacies, index=frame.index, dtype=float)

    # reorder frame by weight and save
    final_results = frame.sort_values('avg_animacy')
    final_results.to_csv(os.path.join(results_dir, result_file + '.csv'))

    return None

# process each filtered corpus in turn; pipeline() writes one CSV per corpus
for corpus in ('bryophytes', 'sweetgrass'):
    pipeline('/home/dustin/Python/Animacies/filtered/' + corpus + '.tsv', corpus)

FLAG3 %
FLAG6 %
FLAG4 %
FLAG2 %
FLAG %
FLAG4 %
FLAG8 %
FLAG7 %
FLAG9 %
FLAG5 %
FLAG3 %
FLAG
FLAG1 %
FLAG9 %
FLAG3 %
FLAG %%
FLAG8 %
FLAG6 %
FLAG %%
FLAG6 %
FLAG8 %
FLAG3 %
FLAG5 %
FLAG3 %
100%2 %
FLAG7 %
100%7 %
