In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the book corpus table; as the displayed frame shows, each row carries a
# `book` column (source file path) and a `words` column (the book's text).
df = pd.read_csv('./data.csv')

In [3]:
df

Unnamed: 0,book,words
0,./data/cook_book_one.txt,project gutenberg's the whitehouse cookbook by...
1,./data/cook_book_three.txt,the project gutenberg ebook of new royal cook ...
2,./data/gothic_novel_four.txt,the project gutenberg ebook of the works of ed...
3,./data/gothic_novel_six.txt,the project gutenberg ebook of northanger abbe...
4,./data/gothic_novel_two.txt,project gutenberg’s the complete works of will...
5,./data/gothic_novel_three.txt,the project gutenberg ebook of dracula by bram...
6,./data/cook_book_four.txt,the project gutenberg ebook of the italian coo...
7,./data/gothic_novel_ten.txt,the project gutenberg ebook of the castle of o...
8,./data/gothic_novel_eight.txt,the project gutenberg ebook of the vampyre a t...
9,./data/gothic_novel_nine.txt,the project gutenberg ebook of the masque of t...


In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
# Load the pretrained BERT model. The "Some weights ... were not used" notice
# printed by this cell lists only `cls.*` weights that were not used when
# initializing BertModel; the message itself says this is expected.
model = BertModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
text = ['love', 'hate']

In [28]:
# Tokenize the whole batch at once. padding=True pads shorter entries so that
# words which split into a different number of word-pieces can still be stacked
# into one tensor; for equal-length inputs (as with 'love'/'hate') the encoding
# is unchanged.
encoded_input = tokenizer(text, padding=True, return_tensors='pt')

In [29]:
encoded_input

{'input_ids': tensor([[ 101, 2293,  102],
        [ 101, 5223,  102]]), 'token_type_ids': tensor([[0, 0, 0],
        [0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1],
        [1, 1, 1]])}

In [30]:
# Inference only: run the forward pass without recording an autograd graph, so
# the hidden states are plain tensors and no gradient memory is held.
with torch.no_grad():
    output = model(**encoded_input)

In [31]:
output.last_hidden_state.shape

torch.Size([2, 3, 768])

In [10]:
output.last_hidden_state

In [33]:
# last_hidden_state has shape (2, 3, 768): one row per entry of `text`, three
# token positions each. Position 1 is the word itself, sitting between the
# special tokens at positions 0 and 2 (ids 101 and 102 in `input_ids`).
love_embedding = output.last_hidden_state[0][1]  # row 0 -> 'love'
hate_embedding = output.last_hidden_state[1][1]  # row 1 -> 'hate'

In [34]:
love_embedding

tensor([ 3.8649e-01,  3.6188e-01,  2.3423e-01, -3.9580e-01,  9.3569e-01,
        -3.2042e-01,  2.0427e-01,  3.3845e-01, -5.2004e-02, -8.1070e-01,
        -5.6512e-01, -1.6136e-01,  3.0339e-01,  3.4091e-01, -4.5114e-01,
        -3.8642e-01,  5.0660e-01,  3.1144e-01,  8.6164e-01,  5.4277e-01,
        -2.0667e-01,  1.6041e-01, -5.3882e-01,  3.2438e-01, -1.7953e-02,
         5.0450e-01,  7.7202e-02, -2.5717e-01,  5.1554e-01,  3.3732e-01,
         7.2207e-01,  7.5058e-02, -1.5921e-01,  3.5300e-02, -8.6791e-01,
        -3.3094e-01,  3.1935e-01,  4.9276e-01, -1.0494e+00,  7.5135e-01,
         8.8906e-01, -4.1841e-01,  1.2820e-01, -9.1336e-01,  4.9203e-01,
        -4.7555e-02,  1.9765e-01,  5.5654e-01,  1.0032e-01, -2.4304e-01,
        -1.5505e-01,  3.8877e-01,  5.8610e-01,  2.5172e-01, -2.5282e-01,
         3.9431e-01,  5.3088e-01,  4.0181e-01, -6.0108e-01,  2.3436e-01,
         1.5554e-02, -2.9426e-01,  1.3882e-01, -4.3525e-01,  4.9706e-02,
         7.1831e-01,  5.0469e-01,  5.5972e-01, -1.2

In [36]:
# Project the two 768-dimensional word vectors onto two principal components.
# (PCA is imported in the notebook's top import cell.)
#
# NOTE: PCA is fitted on only two samples here. After centering, two points
# span a single direction, so PC1 comes out as +/- the same value and PC2 is
# just floating-point noise (~1e-16). Fit on more words before interpreting
# the second component.
x = [love_embedding.detach().numpy(), hate_embedding.detach().numpy()]
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

In [37]:
principalDf

Unnamed: 0,principal component 1,principal component 2
0,5.930534,9.285657e-16
1,-5.930534,9.285657e-16


In [11]:
# TODO: check that the embedding is correct
# TODO: make sure PCA is working the way it's supposed to
# TODO: make good visuals (categories)
positive_word = 'love'
negative_word = 'hate'

In [12]:
# Tokenize each word on its own (one single-item batch per word), in the same
# positive-then-negative order as the original two calls.
encoded_positive_word, encoded_negative_word = (
    tokenizer(word, return_tensors='pt')
    for word in (positive_word, negative_word)
)

In [13]:
# Inference only: run both forward passes without recording an autograd graph,
# so the resulting hidden states are plain tensors.
with torch.no_grad():
    output_positive = model(**encoded_positive_word)
    output_negative = model(**encoded_negative_word)

In [21]:
negative = output_negative.last_hidden_state

In [22]:
# Hidden states for the positive word, mirroring `negative`.
# The previous version ended in `.reshape` without calling it, which bound the
# method object to `positive` instead of the tensor.
positive = output_positive.last_hidden_state

In [26]:
negative[0][1]

tensor([ 3.5621e-01,  3.6247e-01, -2.2058e-01, -4.6860e-01, -7.1796e-02,
        -2.1023e-01,  8.5638e-02,  1.4948e-01, -5.2050e-01, -7.1825e-01,
        -2.4178e-01, -1.7741e-01, -7.3762e-02,  2.9040e-01, -4.5239e-01,
        -1.3743e-01,  4.0856e-01,  2.5147e-01,  6.3005e-02,  9.0867e-01,
         4.7119e-03,  4.3269e-02, -2.5664e-02,  1.8232e-01, -8.8962e-02,
         6.3563e-01, -1.9629e-01, -6.9848e-01, -3.2712e-01,  5.4574e-01,
         6.9029e-01,  1.8685e-01,  3.8625e-01,  3.7546e-01, -6.4495e-01,
        -2.7472e-01, -2.4221e-01,  4.1277e-01, -4.0662e-01,  1.4415e-01,
         3.0074e-01, -6.6900e-01,  3.3377e-03, -2.3528e-01,  3.3025e-01,
         6.1937e-01,  9.7317e-01,  6.1102e-01,  2.3256e-01, -3.3403e-01,
        -5.2249e-01,  1.6733e-01, -1.9197e-01, -4.3717e-02, -8.8047e-02,
        -2.8946e-01,  3.0785e-01,  1.4339e-01, -6.3604e-01, -1.7775e-01,
         7.7578e-02, -2.4443e-01,  3.6116e-01,  2.3975e-01, -1.6494e-01,
         8.1079e-01,  8.2113e-01,  4.7491e-01, -8.0