<a href="https://colab.research.google.com/github/DataJenius/NLPEncodingExperiment/blob/main/python/NLPEncodingExperiment_get_BERT_ft_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
###########################################
# install all dependencies
# These are notebook shell commands (leading `!`), not Python statements.
# No versions are pinned, so every run installs whatever is current at that time.
# NOTE(review): the recorded run below resolved transformers 4.17.0 — consider
# pinning versions so the rest of the notebook keeps working as recorded.
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 16.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 62.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 58.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 41.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
 

In [None]:
###########################################
# load dependencies
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Split
import pandas as pd
import numpy as np
import os
import torch
from google.colab import files

In [None]:
###############################################
# Load the pretrained 'bert-base-uncased' checkpoint twice:
#   * auto_cls       - BERT with a sequence-classification head for 2 labels
#   * auto_tokenizer - the matching tokenizer
auto_cls = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
auto_tokenizer = AutoTokenizer.from_pretrained(
    'bert-base-uncased',
)


# all tokenizer settings live in this one function so that every dataset
# is encoded the same way: truncated / padded to exactly 500 tokens
def tokenize(row):
    """Encode the ``raw_text`` entry of ``row`` with ``auto_tokenizer``.

    Args:
        row: mapping with a ``'raw_text'`` key. In this notebook it is called
            through ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever ``auto_tokenizer`` returns for that text, using truncation
        and ``'max_length'`` padding with ``max_length=500``.
    """
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=500,
    )
    text = row['raw_text']
    return auto_tokenizer(text, **tokenizer_options)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
#################################################################################
# The labelled, raw comment data is hosted on GitHub as five CSV files,
# one per validation group (the groups exist for the sake of 5 models).
# Each group is 20% of the data: one group is used for validation,
# one for test, and the remaining three (60%) for training.
df_links_to_data = pd.DataFrame({
    'my_group': [group for group in range(1, 6)],
    'link': [
        'https://raw.githubusercontent.com/DataJenius/NLPEncodingExperiment/main/data/comments/selected/selected_reddit_comments_group'
        + str(group)
        + '.csv'
        for group in range(1, 6)
    ],
})

In [None]:
######################################################################################
# our data is already split into 5 groups, fully labelled

# setup this experiment
test_input_method = 'BERT (768)'

# split into val/test/train based on validation group
df_train = df_links_to_data.loc[df_links_to_data['my_group'].isin([1,2,3])]
df_val = df_links_to_data.loc[df_links_to_data['my_group'] == 4]
df_test = df_links_to_data.loc[df_links_to_data['my_group'] == 5]

# bookkeeping columns that are removed from the train and validation splits
BOOKKEEPING_COLUMNS = ['msg_id','token_count','my_group','my_role']


def load_split(split_name, df_links, drop_columns=None):
    """Load the CSV files of one split into a single dataset and report it.

    This replaces three copy-pasted load/rename/drop/print sequences.

    Args:
        split_name: label used in the printed summary ('train', 'val', 'test').
        df_links: rows of ``df_links_to_data`` belonging to this split; its
            ``'link'`` column supplies the CSV URLs and ``'my_group'`` is
            printed for reference.
        drop_columns: optional list of column names to remove after loading.
            ``None`` (or an empty list) keeps every column.

    Returns:
        The loaded dataset with the ``label`` column renamed to ``labels``.
    """
    dataset = load_dataset(path='csv',
                           data_files=df_links['link'].values.tolist(),
                           split=Split.ALL)
    dataset = dataset.rename_column("label", "labels")
    if drop_columns:
        dataset = dataset.remove_columns(drop_columns)
    print("\n\n" + split_name + " - group ", df_links['my_group'].unique())
    print(dataset)
    return dataset


# load our train dataset (60%)
train_dataset = load_split('train', df_train, drop_columns=BOOKKEEPING_COLUMNS)

# load our validation dataset (20%)
val_dataset = load_split('val', df_val, drop_columns=BOOKKEEPING_COLUMNS)

# load our test dataset (20%)
# the test split keeps every CSV column, exactly as before this refactor
# NOTE(review): confirm whether msg_id etc. are needed on the test split
test_dataset = load_split('test', df_test)


# tokenize the datasets
tokenized_train_dataset = train_dataset.map(tokenize, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize, batched=True)

Using custom data configuration default-d560abb8ebad5240


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-d560abb8ebad5240/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/501k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/536k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/533k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-d560abb8ebad5240/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


train - group  [1 2 3]
Dataset({
    features: ['labels', 'source', 'raw_text', 'clean_text'],
    num_rows: 6000
})


Using custom data configuration default-32643fbc024f8656


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-32643fbc024f8656/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/523k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-32643fbc024f8656/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


val - group  [4]
Dataset({
    features: ['labels', 'source', 'raw_text', 'clean_text'],
    num_rows: 2000
})


Using custom data configuration default-cba455d41c7e57e4


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-cba455d41c7e57e4/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/530k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-cba455d41c7e57e4/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


test - group  [5]
Dataset({
    features: ['msg_id', 'token_count', 'my_group', 'my_role', 'labels', 'source', 'raw_text', 'clean_text'],
    num_rows: 2000
})


  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
#################################################################################
# configure the fine-tuning run for BERT

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# takes about 4hrs for 1 epoch
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# arguments handed to the Trainer further down
training_args = TrainingArguments(
    # where the Trainer writes its output
    output_dir='output',
    # training schedule: 1 example per device per step, with gradients
    # accumulated over 8 steps
    num_train_epochs=10,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # evaluate and log every 250 steps
    evaluation_strategy='steps',
    eval_steps=250,
    logging_steps=250,
    per_device_eval_batch_size=8,
)

# function to compute our metrics
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the true class per example).

    Returns:
        dict with a single key ``"accuracy"``: the fraction of examples whose
        highest-scoring class equals the label.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    is_correct = predicted_classes == eval_pred.label_ids
    return {"accuracy": is_correct.mean()}

# bring model, arguments, datasets and metric function together
trainer = Trainer(
    model=auto_cls,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
)

# run the fine-tuning; the validation set is evaluated during training
# (see the evaluation settings in training_args), so the separate
# evaluate() call below stays disabled
trainer.train()
#trainer.evaluate()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: raw_text, clean_text, source. If raw_text, clean_text, source are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6000
  Num Epochs = 10
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 7500


Step,Training Loss,Validation Loss,Accuracy
250,0.0061,0.370641,0.9685
500,0.1354,0.216723,0.956
750,0.199,0.248574,0.9365
1000,0.1545,0.176738,0.9665
1250,0.1127,0.21704,0.959
1500,0.1107,0.211595,0.962
1750,0.0792,0.251239,0.9525
2000,0.0823,0.198303,0.964


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: raw_text, clean_text, source. If raw_text, clean_text, source are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: raw_text, clean_text, source. If raw_text, clean_text, source are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argum

In [None]:
###############################################
# write the fine-tuned model to a local folder and list what was saved
# (alternative folder used for the 1-epoch model: 'BERT_modelv1')
#folder_name = 'BERT_modelv1' # fine-tuned 1 epoch
folder_name = 'BERT_modelv10' # fine-tuned 10 epochs
print(folder_name)
trainer.save_model(output_dir=folder_name)
os.listdir(path=folder_name)

In [None]:
############################################################################
# load our model #1
#folder_name = 'BERT_modelv1' # fine-tuned 1 epoch
folder_name = 'BERT_modelv10' # fine-tuned 10 epochs

# Load the saved checkpoint exactly once. Passing output_hidden_states=True
# here asks the model to return the hidden states of every layer.
# (Previously the checkpoint was loaded twice: the first load produced a whole
# model that was stored in a variable called `config` and then handed to the
# second load as `config=`, although a model is not a configuration object.)
loaded_model = AutoModelForSequenceClassification.from_pretrained(folder_name, output_hidden_states=True)

# keep the name `config` available, now bound to the model's real configuration
config = loaded_model.config
print(loaded_model.device)

# helpers that report details about the model
def count_all_parameters(model):
    """Return the total number of elements across all parameters of ``model``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total
def count_trainable_parameters(model):
    """Return the number of parameter elements of ``model`` that require gradients."""
    trainable_sizes = [
        param.numel()
        for param in model.parameters()
        if param.requires_grad
    ]
    return sum(trainable_sizes)
# measure the model, then print its architecture followed by both counts
num_params, num_params_train = (
    count_all_parameters(loaded_model),
    count_trainable_parameters(loaded_model),
)
print(loaded_model)
print(f"{num_params} total params")
print(f"{num_params_train} trainable params")

loading configuration file BERT_modelv1/config.json
Model config BertConfig {
  "_name_or_path": "BERT_modelv1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file BERT_modelv1/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSe

cpu
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [None]:
#################################################################################
# load all five group files as one dataset (100% of the comments)
full_dataset = load_dataset(
    path='csv',
    data_files=df_links_to_data['link'].values.tolist(),
    split=Split.ALL,
).rename_column("label", "labels")

# tokenize every comment
tokenized_full_dataset = full_dataset.map(tokenize, batched=True)

# turn the tokenized full dataset into tensors and show their shapes
input_ids, attention_mask = (
    torch.tensor(tokenized_full_dataset[column])
    for column in ('input_ids', 'attention_mask')
)
print(input_ids.shape, attention_mask.shape, sep="\n")

Using custom data configuration default-71ca0bbdb5b5865a
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-71ca0bbdb5b5865a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-71ca0bbdb5b5865a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-3eb533fd8df890ab.arrow


torch.Size([10000, 500])
torch.Size([10000, 500])


In [None]:
#################################################################################
# Extract the final-layer [CLS] embedding of every comment with the fine-tuned
# model. Comments are processed one at a time so intermediate results stay easy
# to sanity-check (the recorded run took about 46 min for 10,000 comments).
#
# Results are checkpointed: every SAVE_EVERY rows (and once more at the very
# end if rows are left over) the accumulated embeddings are written to a CSV
# and downloaded.
SAVE_EVERY = 1000      # rows per CSV file
PROGRESS_EVERY = 100   # print a progress line every this many rows


def save_embedding_chunk(rows, row_ids, msg_ids, last_i):
    """Write one chunk of embeddings to CSV, download it and return it.

    Args:
        rows: list of embeddings, each a list of floats (one per dimension).
        row_ids: dataset positions of the rows; used as the frame's index.
        msg_ids: ``msg_id`` value for each row, stored as the last column.
        last_i: position of the last row in the chunk; part of the file name.

    Returns:
        The chunk as a DataFrame: one column per embedding dimension
        (0, 1, 2, ...) followed by ``msg_id``.
    """
    chunk = pd.DataFrame(rows, index=row_ids)
    chunk["msg_id"] = msg_ids

    # save results to local CSV
    file_name = 'BERT_ft_embeddings_i'+str(last_i)+'.csv'
    chunk.to_csv(file_name, index=False)
    files.download(file_name)
    print("\n\nsave progress...\n\n",chunk.head(10))
    return chunk


loaded_model.eval()  # inference behaviour for layers such as dropout

num_comments = input_ids.shape[0]             # no hard-coded row count
all_msg_ids = list(full_dataset["msg_id"])    # read the column once, not once per row
df_embeddings = pd.DataFrame()                # ends up holding the last saved chunk
chunk_rows, chunk_row_ids = [], []

# no_grad: we only read activations, so skip building autograd graphs
with torch.no_grad():
    for i in range(num_comments):

        # this specific input as tensors (slices keep the batch dimension)
        my_input_ids = input_ids[i:i+1]
        my_attention_mask = attention_mask[i:i+1]

        # run the model on this comment, requesting every layer's hidden states
        my_output = loaded_model(input_ids=my_input_ids,
                                 attention_mask=my_attention_mask,
                                 output_hidden_states=True)

        # last hidden-state entry, first (only) example, first token ([CLS])
        my_cls = my_output["hidden_states"][-1][0, 0]

        # collect plain Python rows; the DataFrame is built once per chunk
        # instead of being grown row by row
        chunk_rows.append(my_cls.tolist())
        chunk_row_ids.append(i)

        if (i + 1) % PROGRESS_EVERY == 0:
            print(i + 1, "/", num_comments)

        # save a full chunk, or whatever is left when the data runs out
        if len(chunk_rows) == SAVE_EVERY or i == num_comments - 1:
            df_embeddings = save_embedding_chunk(
                chunk_rows,
                chunk_row_ids,
                [all_msg_ids[k] for k in chunk_row_ids],
                i,
            )
            chunk_rows, chunk_row_ids = [], []


0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
8 9
9 10
10 11
11 12
12 13
13 14
14 15
15 16
16 17
17 18
18 19
19 20
20 21
21 22
22 23
23 24
24 25
25 26
26 27
27 28
28 29
29 30
30 31
31 32
32 33
33 34
34 35
35 36
36 37
37 38
38 39
39 40
40 41
41 42
42 43
43 44
44 45
45 46
46 47
47 48
48 49
49 50
50 51
51 52
52 53
53 54
54 55
55 56
56 57
57 58
58 59
59 60
60 61
61 62
62 63
63 64
64 65
65 66
66 67
67 68
68 69
69 70
70 71
71 72
72 73
73 74
74 75
75 76
76 77
77 78
78 79
79 80
80 81
81 82
82 83
83 84
84 85
85 86
86 87
87 88
88 89
89 90
90 91
91 92
92 93
93 94
94 95
95 96
96 97
97 98
98 99
99 100
100 101
101 102
102 103
103 104
104 105
105 106
106 107
107 108
108 109
109 110
110 111
111 112
112 113
113 114
114 115
115 116
116 117
117 118
118 119
119 120
120 121
121 122
122 123
123 124
124 125
125 126
126 127
127 128
128 129
129 130
130 131
131 132
132 133
133 134
134 135
135 136
136 137
137 138
138 139
139 140
140 141
141 142
142 143
143 144
144 145
145 146
146 147
147 148
148 149
149 150
150 151
151 152
15

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



save progress...

           0         1         2         3         4         5         6  \
0  0.047403 -0.051700 -0.365815  0.148061 -0.535689  0.223934  1.254592   
1 -0.009199 -0.072623 -0.357462  0.161104 -0.554575  0.178139  1.274894   
2 -0.075791 -0.158615  0.679475  0.216228  0.701845 -0.268213 -0.595190   
3 -0.041176 -0.171742  0.641455  0.197139  0.736751 -0.294790 -0.594111   
4  0.007319 -0.064146 -0.330732  0.163415 -0.545964  0.166491  1.268156   
5 -0.456374  0.029407  0.329871  0.465754  0.065535 -0.200851  0.852477   
6 -0.153701 -0.423255  0.115487  0.261997  0.803537 -0.122127 -0.539995   
7  0.024753 -0.065737 -0.338129  0.156204 -0.546920  0.186542  1.250067   
8 -0.095647  0.008245 -0.031380  0.152259 -0.463306  0.042561  1.305223   
9  0.021871 -0.078406 -0.310470  0.139376 -0.572395  0.181374  1.276895   

          7         8         9  ...       759       760       761       762  \
0 -0.656984  0.095141  0.663508  ...  0.769508 -0.696395  0.017928 -0.924

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



save progress...

              0         1         2         3         4         5         6  \
1000 -0.076617 -0.079512  0.678091  0.253852  0.672184 -0.243289 -0.553306   
1001 -0.208444 -0.054092  0.037184  0.125786 -0.486008 -0.000469  1.252948   
1002 -0.134418  0.000303 -0.033622  0.197651 -0.491967 -0.050800  1.232461   
1003 -0.163986 -0.354502  0.144954  0.187863  0.702761 -0.216946 -0.515936   
1004 -0.048906 -0.128100  0.640538  0.218397  0.784220 -0.288315 -0.602061   
1005 -0.073433 -0.132093  0.717511  0.248953  0.674450 -0.282305 -0.558178   
1006 -0.235726 -0.400476  0.499011  0.363796  0.674663 -0.133587 -0.510297   
1007 -0.055283 -0.144063  0.677506  0.227152  0.685081 -0.273177 -0.583244   
1008  0.023896 -0.063710 -0.368099  0.144216 -0.545548  0.216942  1.273014   
1009 -0.039841 -0.072685 -0.339233  0.180888 -0.592082  0.159623  1.262841   

             7         8         9  ...       759       760       761  \
1000  0.405871  0.028895 -0.362160  ... -0.8617

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



save progress...

              0         1         2         3         4         5         6  \
2000  0.005469 -0.076513 -0.336502  0.153661 -0.574454  0.169091  1.239804   
2001 -0.165560 -0.023872 -0.056493  0.132135 -0.480145 -0.010649  1.233688   
2002 -0.028628 -0.429660 -0.020286  0.435175  0.775406  0.128116 -0.805316   
2003 -0.065206 -0.161719  0.662603  0.240480  0.713299 -0.275522 -0.558207   
2004 -0.073336 -0.100715  0.505823  0.368323  0.009497 -0.027840 -0.303702   
2005 -0.032541 -0.163425  0.639748  0.212491  0.706564 -0.325365 -0.594460   
2006 -0.084401 -0.126538  0.650133  0.213091  0.757732 -0.257949 -0.579193   
2007 -0.022275 -0.106932 -0.326927  0.144666 -0.548994  0.161256  1.287014   
2008 -0.255868  0.022378 -0.004718  0.272886 -0.349142 -0.096208  1.093685   
2009 -0.581784 -0.042619  0.313615  0.151453 -0.168833 -0.235932  0.836050   

             7         8         9  ...       759       760       761  \
2000 -0.668556  0.116870  0.669701  ...  0.7674

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



save progress...

              0         1         2         3         4         5         6  \
3000 -0.013145 -0.064757 -0.341882  0.156354 -0.574484  0.170522  1.300277   
3001  0.047956 -0.076486 -0.364326  0.139658 -0.564344  0.219335  1.253211   
3002 -0.021638 -0.087178 -0.361487  0.135706 -0.567926  0.171612  1.254246   
3003 -0.040788 -0.130775  0.575563  0.203545  0.668503 -0.274114 -0.590965   
3004 -0.005542 -0.088995 -0.352314  0.140737 -0.550516  0.190710  1.241860   
3005  0.063501 -0.054335 -0.329905  0.156595 -0.572696  0.200622  1.263624   
3006 -0.167726 -0.277788  0.512353  0.250155  0.670037 -0.348311 -0.362100   
3007 -0.073776 -0.141205  0.616802  0.217756  0.616937 -0.258630 -0.612014   
3008  0.042448 -0.075414 -0.360134  0.138895 -0.554197  0.201521  1.236603   
3009 -0.016221 -0.136550  0.624375  0.208005  0.750401 -0.279632 -0.614678   

             7         8         9  ...       759       760       761  \
3000 -0.614783  0.113542  0.700161  ...  0.7919

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



save progress...

              0         1         2         3         4         5         6  \
4000 -0.099103 -0.231657  0.715037  0.134745  0.765859 -0.261267 -0.639148   
4001  0.018195 -0.056093 -0.353168  0.157355 -0.585839  0.188825  1.275454   
4002 -0.099778 -0.086972 -0.225541  0.187272 -0.562462  0.089015  1.267817   
4003 -0.078004 -0.114887  0.623378  0.237884  0.688903 -0.277865 -0.578165   
4004  0.015603 -0.073519 -0.332254  0.155135 -0.548789  0.189578  1.264958   
4005  0.044965 -0.059927 -0.353372  0.147541 -0.568943  0.212081  1.245193   
4006 -0.076794 -0.171824  0.688072  0.311795  0.862059 -0.325815 -0.629555   
4007  0.056592 -0.074563 -0.355355  0.146344 -0.568326  0.229051  1.251293   
4008 -0.597012 -0.110656  0.491079  0.237348 -0.183073  0.100042  0.815405   
4009 -0.096286 -0.192798  0.710272  0.228921  0.717906 -0.314082 -0.563301   

             7         8         9  ...       759       760       761  \
4000  0.261564 -0.023038 -0.440526  ... -0.9535

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



save progress...

              0         1         2         3         4         5         6  \
5000  0.010492 -0.059564 -0.343357  0.161458 -0.581498  0.165689  1.232327   
5001 -0.133291 -0.175255  0.664811  0.288459  0.688914 -0.275019 -0.566562   
5002 -0.026812 -0.130579  0.610434  0.240878  0.698178 -0.305897 -0.537891   
5003 -0.075246 -0.159503  0.674112  0.233223  0.640369 -0.259334 -0.567438   
5004 -0.013113 -0.104287 -0.301990  0.157671 -0.549026  0.170627  1.316935   
5005  0.065065 -0.076810 -0.359695  0.150848 -0.573844  0.222522  1.257764   
5006  0.000262 -0.070473 -0.299610  0.157914 -0.622719  0.140944  1.246255   
5007 -0.062417 -0.047438 -0.286761  0.177349 -0.606003  0.125808  1.286082   
5008 -0.095654 -0.073637 -0.264113  0.154572 -0.552150  0.096177  1.244796   
5009 -0.052767 -0.122980  0.662093  0.223249  0.735322 -0.245810 -0.591529   

             7         8         9  ...       759       760       761  \
5000 -0.679613  0.091151  0.696934  ...  0.7729

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



save progress...

              0         1         2         3         4         5         6  \
6000  0.052042 -0.043238 -0.367038  0.160670 -0.543422  0.218243  1.271230   
6001 -0.029673 -0.065003 -0.320025  0.165220 -0.583193  0.128707  1.241753   
6002 -0.087953 -0.195586  0.737425  0.249499  0.712139 -0.296546 -0.575816   
6003 -0.075275 -0.487686  0.029573  0.429154  0.866290  0.169044 -0.722161   
6004  0.025954 -0.397346  0.159099  0.472279  0.505572 -0.096603 -0.615083   
6005 -0.049029 -0.372758  0.122969  0.400248  0.523027  0.039667  0.531790   
6006 -0.038825 -0.409694  0.492246  0.456342  0.589208 -0.034734 -0.490340   
6007 -0.034462 -0.080251 -0.306118  0.177480 -0.602492  0.123308  1.235121   
6008 -0.039342 -0.037706 -0.274076  0.135244 -0.603042  0.152182  1.320529   
6009 -0.138401 -0.091258 -0.237846  0.172810 -0.530355  0.028569  1.214504   

             7         8         9  ...       759       760       761  \
6000 -0.661161  0.101309  0.669490  ...  0.7656

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



save progress...

              0         1         2         3         4         5         6  \
7000 -0.085301 -0.524406  0.118531  0.353035  0.633389 -0.064060 -0.564704   
7001 -0.102676 -0.041623 -0.262950  0.183374 -0.594818  0.072023  1.252933   
7002 -0.069960 -0.128233  0.708795  0.212609  0.650095 -0.274846 -0.533631   
7003 -0.070274 -0.034218 -0.254498  0.171914 -0.484056  0.115358  1.255841   
7004 -0.512146  0.175603  0.128629  0.536516 -0.037467 -0.323950  1.069366   
7005 -0.186326 -0.390368  0.610435  0.294989  0.714304 -0.290511 -0.601276   
7006 -0.071503 -0.105876  0.683012  0.235713  0.702156 -0.285510 -0.537997   
7007 -0.108698  0.076926 -0.204961  0.272762 -0.587937  0.031863  1.312326   
7008 -0.177345 -0.007214 -0.174012  0.267579 -0.607512 -0.003975  1.222575   
7009 -0.316962  0.174948  0.012495  0.336792 -0.191524 -0.143350  0.873626   

             7         8         9  ...       759       760       761  \
7000 -0.135269 -0.140839 -0.425974  ... -0.7134

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



save progress...

              0         1         2         3         4         5         6  \
8000  0.057927 -0.287445  0.342658  0.392008  0.565954  0.043023 -0.647841   
8001 -0.048859 -0.163370  0.622231  0.177224  0.663643 -0.275797 -0.553731   
8002  0.062954 -0.064414 -0.364925  0.137911 -0.569051  0.233845  1.257622   
8003 -0.145357 -0.251255  0.637669  0.234555  0.707810 -0.330420 -0.578522   
8004 -0.005872 -0.075093 -0.315126  0.192450 -0.557971  0.157828  1.260165   
8005 -0.027792 -0.465784  0.389712  0.285481  0.547823 -0.221042 -0.640480   
8006  0.050902 -0.061339 -0.365108  0.138145 -0.569240  0.224259  1.258135   
8007 -0.494742 -0.334001  0.492801  0.011450 -0.040505 -0.131940  0.724362   
8008  0.015104 -0.061483 -0.342582  0.134308 -0.578252  0.184926  1.258627   
8009 -0.114479 -0.433461  0.581285  0.246159  0.629216 -0.342956 -0.517615   

             7         8         9  ...       759       760       761  \
8000 -0.253957 -0.006036 -0.580096  ... -0.4878

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



save progress...

              0         1         2         3         4         5         6  \
9000  0.055796 -0.049115 -0.363718  0.168785 -0.573163  0.209320  1.257403   
9001 -0.047628 -0.089915  0.647658  0.241972  0.730335 -0.260990 -0.548218   
9002 -0.023620 -0.095473  0.595040  0.186929  0.702979 -0.305454 -0.574254   
9003  0.035860 -0.076302 -0.365307  0.123614 -0.552568  0.220753  1.268197   
9004 -0.018650 -0.067399 -0.323692  0.164673 -0.570434  0.170183  1.274333   
9005 -0.166006 -0.143967  0.229506  0.203448  0.025793  0.009663  0.540498   
9006 -0.098709 -0.075307  0.706803  0.213221  0.716305 -0.259522 -0.543374   
9007 -0.100227 -0.300119  0.134920  0.262598  0.679394 -0.046251 -0.546484   
9008 -0.066851 -0.127621  0.663375  0.194584  0.641335 -0.286178 -0.551678   
9009 -0.068170 -0.512117  0.258940  0.376254  0.683081 -0.128783 -0.536922   

             7         8         9  ...       759       760       761  \
9000 -0.656502  0.092077  0.686333  ...  0.7579

In [None]:
# show the first rows of the embeddings frame
# (head has to be *called*: printing the bare attribute only shows the
# bound-method repr instead of the first rows)
print(df_embeddings.head())

<bound method NDFrame.head of           0         1         2         3         4         5         6  \
0  0.047403 -0.051700 -0.365815  0.148061 -0.535689  0.223934  1.254592   
1 -0.009199 -0.072623 -0.357462  0.161104 -0.554575  0.178139  1.274894   

          7         8         9  ...       759       760       761       762  \
0 -0.656984  0.095141  0.663508  ...  0.769508 -0.696395  0.017928 -0.924579   
1 -0.630651  0.109574  0.713033  ...  0.785477 -0.747074 -0.003504 -0.898637   

        763       764       765       766       767  msg_id  
0 -0.223866 -0.158940  0.204260  1.276494 -0.904730   11702  
1 -0.197978 -0.183148  0.178438  1.259046 -0.945903    3854  

[2 rows x 769 columns]>


In [None]:
############################################################################
# tokenizer round-trip demo: text -> token IDs -> tokens -> text
# other sentences to try:
#string_to_check = 'I love Luke Skywalker, but I hate Gandalf the Grey.'
#string_to_check = 'I love Gandalf the Grey, but I hate Luke Skywalker.'
string_to_check = 'Captain Smurgleblorp snorged his 😜 on Blursday.'

# IDs for each token of the sentence
encoded = auto_tokenizer.encode(string_to_check)

# the token string behind each ID
tokens = auto_tokenizer.convert_ids_to_tokens(encoded)

# the IDs turned back into a sentence
decoded = auto_tokenizer.decode(encoded)

# print every stage, one per line, in pipeline order
for stage in (string_to_check, encoded, tokens, decoded):
    print(stage)

Captain Smurgleblorp snorged his 😜 on Blursday.
[101, 2952, 15488, 12514, 2571, 16558, 2953, 2361, 1055, 12131, 5999, 2010, 100, 2006, 14819, 16150, 4710, 1012, 102]
['[CLS]', 'captain', 'sm', '##urg', '##le', '##bl', '##or', '##p', 's', '##nor', '##ged', 'his', '[UNK]', 'on', 'blur', '##sd', '##ay', '.', '[SEP]']
[CLS] captain smurgleblorp snorged his [UNK] on blursday. [SEP]
