# Text embedding

In [1]:
import os

import pandas as pd
import spacy
import torch
from tqdm.notebook import tqdm

from bert import BERT

In [2]:
# Input/output locations and the base name of the dataset file.
dir_in = '../../data/raw/'
dir_out = '../../data/embeded/'
dataset_name = '10000_amazon_reviews'

# Columns of the dataset that hold the text and its label.
text_column = 'text'
label_column = 'is_negative'

# Load the raw dataset from `<dir_in><dataset_name>.csv`.
df = pd.read_csv(f'{dir_in}{dataset_name}.csv')

In [3]:
# Preview the first three rows of the loaded dataset.
df.head(3)

Unnamed: 0,text,is_negative
0,I ordered these short for my husband and he lo...,0
1,NineWest certainly know how to make great shoe...,0
2,I looked all over for these shapers. These ar...,0


## Compute BERT embeddings
1. Split a document into sentences.
2. Compute an embedding for each token of a sentence (the output of the last, i.e. 12th, layer).
3. Average all token embeddings of a sentence to obtain the BERT sentence embedding.
4. Average all sentence embeddings to obtain the document representation.


In [5]:
# Initialize the pre-trained BERT model (the `BERT` class imported from `bert`).
bert = BERT()

# Load the spaCy pipeline `en_core_web_sm`; it is used to split each document
# into sentences. If it is not installed, run:
#   python3 -m spacy download en_core_web_sm
spacy_nlp = spacy.load('en_core_web_sm')

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [12]:
# Compute one embedding per document as the mean of its sentence embeddings,
# then save the embeddings together with their labels.

# Create the output directory up front so that a missing folder fails fast
# instead of after the whole (slow) embedding loop has finished.
os.makedirs(dir_out, exist_ok=True)

data_new = []
n_skipped = 0

# Inference only: no autograd graph is needed to read out the embeddings.
with torch.no_grad():
    # Only the text and label columns are needed, so iterate over them directly
    # instead of materialising a Series per row with `iterrows()`.
    for text, label in tqdm(zip(df[text_column], df[label_column]), total=len(df)):
        # pandas reads missing cells as NaN (a float), which spaCy cannot
        # process, and blank text carries nothing to embed: skip such rows.
        if not isinstance(text, str) or not text.strip():
            n_skipped += 1
            continue

        sentences_vecs = [
            bert.compute_sentence_embedding(sentence.text)
            for sentence in spacy_nlp(text).sents
        ]
        # `torch.stack` raises on an empty list, so guard against documents
        # for which no sentence was produced.
        if not sentences_vecs:
            n_skipped += 1
            continue

        # Average the sentence embeddings to get the document embedding.
        text_embedding = torch.stack(sentences_vecs).mean(dim=0).tolist()
        data_new.append([text_embedding, label])

if n_skipped:
    print(f'Skipped {n_skipped} rows with missing or empty text.')

# NOTE: each embedding is a Python list, so the CSV stores its string form;
# readers of this file have to parse that column back into numbers.
pd.DataFrame(data_new, columns=['bert_embedding_mean', 'label']).to_csv(dir_out+dataset_name+'_bert.csv', index=False)

HBox(children=(FloatProgress(value=0.0, max=10002.0), HTML(value='')))


