In [None]:
import tiktoken
import pandas as pd
import plotly.express as px
from tqdm import trange

import torch
import torch.nn as nn
from torchinfo import summary
from torch.utils.data import Dataset, DataLoader, random_split

In [18]:
data_df = pd.read_csv("hf://datasets/aengusbl/custom_AT-T_data/better_data.csv")
data_df.head()

Unnamed: 0,labels,docs
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Measuring the length of tokenised docs to see how many I'll split:

In [None]:
tokeniser = tiktoken.get_encoding("cl100k_base")

tokenised_df = data_df.copy()
tokenised_df["tokenised"] = tokenised_df["docs"].apply(lambda doc: tokeniser.encode(doc))
tokenised_df["tokenised_len"] = tokenised_df["tokenised"].apply(lambda doc: len(doc))
tokenised_df.head()

Unnamed: 0,labels,docs,tokenised,tokenised_len
0,ham,"Go until jurong point, crazy.. Available only ...","[11087, 3156, 16422, 647, 1486, 11, 14599, 497...",27
1,ham,Ok lar... Joking wif u oni...,"[11839, 45555, 1131, 622, 10979, 289, 333, 577...",11
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[11180, 4441, 304, 220, 17, 264, 74860, 398, 1...",50
3,ham,U dun say so early hor... U c already then say...,"[52, 50116, 2019, 779, 4216, 4917, 1131, 549, ...",13
4,ham,"Nah I don't think he goes to usf, he lives aro...","[45, 1494, 358, 1541, 956, 1781, 568, 5900, 31...",17


In [None]:
sorted_df = tokenised_df.sort_values(by="tokenised_len")
fig = px.line(y=sorted_df["tokenised_len"])
fig.show()

In [24]:
token_counts = tokenised_df["tokenised_len"].value_counts()
print(token_counts)

tokenised_len
9      294
8      270
7      248
11     246
10     234
      ... 
186      1
91       1
92       1
105      1
81       1
Name: count, Length: 106, dtype: int64


In [25]:
fig = px.bar(token_counts)
fig.show()

In [26]:
tokenised_df["tokenised_len"].describe()

count    5173.000000
mean       22.558670
std        17.147925
min         1.000000
25%        10.000000
50%        17.000000
75%        32.000000
max       259.000000
Name: tokenised_len, dtype: float64

In [None]:
# Data augmentation by splitting long strings into several.
# This allows me to get more value out of the little data I have, given the huge doc length differences, and without having ridiculous amounts of padding.

split_tokenised_dict = dict(labels = [], tokenised = [], tokenised_len = [])
split_threshold = 50 # arbitrary

for row_num in trange(len(tokenised_df)):
    row = tokenised_df.loc[row_num,:]
    if  row.tokenised_len < split_threshold:
        num_parts = 1
    elif row.tokenised_len < (split_threshold * 3):
        num_parts = 2
    else:
        num_parts = 3

    part_len = row.tokenised_len // num_parts
    for part in range(num_parts):
        split_tokenised_dict["labels"].append(row.labels)
        new_tokenised_doc = row.tokenised[(part*part_len):((part+1)*part_len)] # This works because you can give a slice out of range, and because `part*part_len == 0` at the start of range()
        split_tokenised_dict["tokenised"].append(new_tokenised_doc)
        split_tokenised_dict["tokenised_len"].append(len(new_tokenised_doc))

split_tokenised_df = pd.DataFrame(split_tokenised_dict)
split_tokenised_df.describe()

100%|██████████| 5173/5173 [00:00<00:00, 8310.69it/s]


Unnamed: 0,tokenised_len
count,5540.0
mean,21.032671
std,12.405142
min,1.0
25%,10.0
50%,18.0
75%,29.0
max,86.0


In [35]:
split_tokenised_df.head()

Unnamed: 0,labels,tokenised,tokenised_len
0,ham,"[11087, 3156, 16422, 647, 1486, 11, 14599, 497...",27
1,ham,"[11839, 45555, 1131, 622, 10979, 289, 333, 577...",11
2,spam,"[11180, 4441, 304, 220, 17, 264, 74860, 398, 1...",25
3,spam,"[15358, 311, 220, 25665, 1691, 311, 5371, 4441...",25
4,ham,"[52, 50116, 2019, 779, 4216, 4917, 1131, 549, ...",13


In [37]:
new_token_counts = split_tokenised_df["tokenised_len"].value_counts()
fig = px.bar(new_token_counts)
fig.show()

In [39]:
tokeniser.decode(split_tokenised_df.loc[5, "tokenised"])

"Nah I don't think he goes to usf, he lives around here though"

In [None]:
def normalise_sequence_len(sequences, max_length=50):
    return [seq[:max_length] + [0] * (max_length - len(seq)) for seq in sequences]

train_tokens = normalise_sequence_len(split_tokenised_df.tokenised) # no third "test" set
train_tokens[:5]

[[11087,
  3156,
  16422,
  647,
  1486,
  11,
  14599,
  497,
  16528,
  1193,
  304,
  10077,
  285,
  308,
  2294,
  1917,
  1208,
  384,
  61886,
  1131,
  356,
  483,
  1070,
  2751,
  72375,
  30125,
  1131,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [11839,
  45555,
  1131,
  622,
  10979,
  289,
  333,
  577,
  389,
  72,
  1131,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [11180,
  4441,
  304,
  220,
  17,
  264,
  74860,
  398,
  1391,
  311,
  3243,
  15358,
  11098,
  1620,
  17603,
  2641,
  220,
  1691,
  267,
  3297,
  220,
  1049,
  20,
  13,
  2991,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [15358,
  311,
  220,
  25665,
  1691,
  311,
  5371,
  4441,
  3488,
  

In [49]:
count = 0
for seq in train_tokens:
    if len(seq) != 50:
        count += 1
count

0

In [None]:
class SpamDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

spam_dataset = SpamDataset(train_tokens, split_tokenised_df.labels.map(dict(spam=1, ham=0)))

train_size = int(0.8 * len(spam_dataset))
val_size = len(spam_dataset) - train_size
train_dataset, val_dataset = random_split(spam_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed_dim,
                                      padding_idx=0)
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, text):
        embedded = self.embedding(text)
        pooled = self.pooling(embedded.permute(0, 2, 1)).squeeze(2)
        fc = self.fc(pooled)
        return torch.sigmoid(fc)

vocab_size = tokeniser.n_vocab

model = TextClassifier(vocab_size=vocab_size,
                      embed_dim=int(vocab_size**0.25), # see below why
                      num_class=1)

[Why `vocab_size**0.25`](https://developers.googleblog.com/en/introducing-tensorflow-feature-columns/#:~:text=the%20embedding%20vector%20dimension%20should%20be%20the%204th%20root%20of%20the%20number%20of%20categories). I am also confident in this choice because the result (~17) is very close to the value in the lesson (16).

In [118]:
sample_input = None
for batch in train_loader:
    sample_input = batch[0][0]
    break
sample_input

tensor([ 7530,   701, 25876,   612,  8091,    82,   389, 22725,   477,   220,
           16,    14,    17,  3430, 54088,  6430,   482, 63590,   323, 21533,
        50554, 31031,   449,   426,    14,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])

In [119]:
sample_input = sample_input.unsqueeze(1)

In [None]:
print(model)

summary(model, input_data=sample_input)

TextClassifier(
  (embedding): Embedding(100277, 17, padding_idx=0)
  (pooling): AdaptiveAvgPool1d(output_size=1)
  (fc): Linear(in_features=17, out_features=1, bias=True)
)


Layer (type:depth-idx)                   Output Shape              Param #
TextClassifier                           [50, 1]                   --
├─Embedding: 1-1                         [50, 1, 17]               1,704,709
├─AdaptiveAvgPool1d: 1-2                 [50, 17, 1]               --
├─Linear: 1-3                            [50, 1]                   18
Total params: 1,704,727
Trainable params: 1,704,727
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 85.24
Input size (MB): 0.01
Forward/backward pass size (MB): 0.01
Params size (MB): 6.82
Estimated Total Size (MB): 6.84

## Transfer Learning

### torchtext

In [None]:
import torch, torchtext
from torch.utils.data import Dataset, DataLoader, random_split

import pandas as pd
from tqdm import trange
import tiktoken

device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

In [20]:
data_df = pd.read_csv("hf://datasets/aengusbl/custom_AT-T_data/better_data.csv")
data_df.head()

Unnamed: 0,labels,docs
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
tokeniser = tiktoken.get_encoding("cl100k_base")

tokenised_df = data_df.copy()
tokenised_df["tokenised"] = tokenised_df["docs"].apply(lambda doc: tokeniser.encode(doc))
tokenised_df["tokenised_len"] = tokenised_df["tokenised"].apply(lambda doc: len(doc))
tokenised_df.head()

Unnamed: 0,labels,docs,tokenised,tokenised_len
0,ham,"Go until jurong point, crazy.. Available only ...","[11087, 3156, 16422, 647, 1486, 11, 14599, 497...",27
1,ham,Ok lar... Joking wif u oni...,"[11839, 45555, 1131, 622, 10979, 289, 333, 577...",11
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[11180, 4441, 304, 220, 17, 264, 74860, 398, 1...",50
3,ham,U dun say so early hor... U c already then say...,"[52, 50116, 2019, 779, 4216, 4917, 1131, 549, ...",13
4,ham,"Nah I don't think he goes to usf, he lives aro...","[45, 1494, 358, 1541, 956, 1781, 568, 5900, 31...",17


In [None]:
split_tokenised_dict = dict(labels = [], tokenised = [], tokenised_len = [])
split_threshold = 50 # arbitrary

for row_num in trange(len(tokenised_df)):
    row = tokenised_df.loc[row_num,:]
    if  row.tokenised_len < split_threshold:
        num_parts = 1
    elif row.tokenised_len < (split_threshold * 3):
        num_parts = 2
    else:
        num_parts = 3

    part_len = row.tokenised_len // num_parts
    for part in range(num_parts):
        split_tokenised_dict["labels"].append(row.labels)
        new_tokenised_doc = row.tokenised[(part*part_len):((part+1)*part_len)] # This works because you can give a slice out of range, and because `part*part_len == 0` at the start of range()
        split_tokenised_dict["tokenised"].append(new_tokenised_doc)
        split_tokenised_dict["tokenised_len"].append(len(new_tokenised_doc))

split_tokenised_df = pd.DataFrame(split_tokenised_dict)
split_tokenised_df.head()

100%|██████████| 5173/5173 [00:00<00:00, 5511.24it/s]


Unnamed: 0,labels,tokenised,tokenised_len
0,ham,"[11087, 3156, 16422, 647, 1486, 11, 14599, 497...",27
1,ham,"[11839, 45555, 1131, 622, 10979, 289, 333, 577...",11
2,spam,"[11180, 4441, 304, 220, 17, 264, 74860, 398, 1...",25
3,spam,"[15358, 311, 220, 25665, 1691, 311, 5371, 4441...",25
4,ham,"[52, 50116, 2019, 779, 4216, 4917, 1131, 549, ...",13


In [None]:
def normalise_sequence_len(sequences, max_length=50):
    return [seq[:max_length] + [0] * (max_length - len(seq)) for seq in sequences]

train_tokens = normalise_sequence_len(split_tokenised_df.tokenised) # no third "test" set

In [None]:
class SpamDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

spam_dataset = SpamDataset(train_tokens, split_tokenised_df.labels.map(dict(spam=1, ham=0)))

train_size = int(0.8 * len(spam_dataset))
val_size = len(spam_dataset) - train_size
train_dataset, val_dataset = random_split(spam_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
xlmr_base = torchtext.models.XLMR_BASE_ENCODER
model = xlmr_base.get_model()
transform = xlmr_base.transform()
input_batch = ["Hello world", "How are you?"]
model_input = to_tensor(transform(input_batch), padding_value=1)
output = model(model_input)
output.shape

ModuleNotFoundError: No module named 'torchtext.functional'

This worked a lot better on Colab than on VSCode, but I still had a lot of versioning issues. I'll use a different model.

### google-bert/bert-base-multilingual-uncased

In [None]:
from transformers import BertTokenizer, BertModel

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torcheval.metrics.functional import binary_f1_score
from torchinfo import summary

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from tqdm import trange

import random
seed = 444719
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

In [13]:
data_df = pd.read_csv("hf://datasets/aengusbl/custom_AT-T_data/better_data.csv")
data_df.head()

Unnamed: 0,labels,docs
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
df_with_lens = data_df.copy()
df_with_lens["len"] = df_with_lens["docs"].apply(lambda doc: len(doc))

In [16]:
# Data augmentation by splitting long strings into several.
# This allows me to get more value out of the little data I have, given the huge doc length differences, and without having ridiculous amounts of padding.

split_dict = dict(labels = [], docs = [], len = [])
split_threshold = 100

for row_num in trange(len(df_with_lens)):
    row = df_with_lens.loc[row_num,:]
    if  row.len < split_threshold:
        num_parts = 1
    elif row.len < (split_threshold * 3):
        num_parts = 2
    else:
        num_parts = 3

    part_len = row.len // num_parts
    for part in range(num_parts):
        split_dict["labels"].append(row.labels)
        new_doc = row.docs[(part*part_len):((part+1)*part_len)] # This works because you can give a slice out of range, and because `part*part_len == 0` at the start of range()
        split_dict["docs"].append(new_doc)
        split_dict["len"].append(len(new_doc))

split_df = pd.DataFrame(split_dict)
split_df.head()

  0%|          | 0/5173 [00:00<?, ?it/s]

100%|██████████| 5173/5173 [00:00<00:00, 8119.08it/s]


Unnamed: 0,labels,docs,len
0,ham,"Go until jurong point, crazy.. Available only ...",55
1,ham,n great world la e buffet... Cine there got am...,55
2,ham,Ok lar... Joking wif u oni...,29
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,77
4,spam,o 87121 to receive entry question(std txt rate...,77


In [17]:
fig = px.bar(split_df.len.value_counts())
fig.show()

In [18]:
tokeniser = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

In [None]:
# Based on the graph, I decided that the approximate string length I want my tokenised sequence max_len to correspond to is 150
docs_with_len_150 = split_df[split_df.len == 150]["docs"]
np.mean([len(tokeniser.encode(doc)) for doc in docs_with_len_150])

np.float64(50.666666666666664)

In [None]:
[len(tokeniser.encode(doc)) for doc in docs_with_len_150]

[51, 48, 53]

In [None]:
# Based on the lines above, I will set my tokenised sequence max_len to 51
train_tokens = tokeniser.batch_encode_plus(split_df['docs'].tolist(),
                                           padding=True,
                                           truncation=True,
                                           max_length=51,
                                           return_tensors='pt')
train_tokens

{'input_ids': tensor([[  101, 11335, 11573,  ...,     0,     0,     0],
        [  101,   156, 11838,  ...,     0,     0,     0],
        [  101, 13563, 29607,  ...,     0,     0,     0],
        ...,
        [  101, 10103, 16153,  ...,     0,     0,     0],
        [  101,   156, 95909,  ...,     0,     0,     0],
        [  101, 17675, 67533,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
class SpamDataset(Dataset):
    def __init__(self, encoded_texts, labels):
        self.encoded_texts = encoded_texts
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {key: val[idx].detach().clone() for key, val in self.encoded_texts.items()}, self.labels[idx]

spam_dataset = SpamDataset(train_tokens, split_df.labels.map(dict(spam=1, ham=0)))

train_size = int(0.8 * len(spam_dataset))
val_size = len(spam_dataset) - train_size
train_dataset, val_dataset = random_split(spam_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
model = BertModel.from_pretrained("bert-base-multilingual-uncased")
print(model)

model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(105879, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [None]:
dummy_inputs = {
    'input_ids': torch.randint(0, 10000, (32, 51)),
    'attention_mask': torch.randint(0, 2, (32, 51)),
    'token_type_ids': torch.zeros((32, 51), dtype=torch.long)
}

summary(model, input_data=dummy_inputs)

Layer (type:depth-idx)                                  Output Shape              Param #
BertModel                                               [32, 768]                 --
├─BertEmbeddings: 1-1                                   [32, 51, 768]             --
│    └─Embedding: 2-1                                   [32, 51, 768]             81,315,072
│    └─Embedding: 2-2                                   [32, 51, 768]             1,536
│    └─Embedding: 2-3                                   [1, 51, 768]              393,216
│    └─LayerNorm: 2-4                                   [32, 51, 768]             1,536
│    └─Dropout: 2-5                                     [32, 51, 768]             --
├─BertEncoder: 1-2                                      [32, 51, 768]             --
│    └─ModuleList: 2-6                                  --                        --
│    │    └─BertLayer: 3-1                              [32, 51, 768]             7,087,872
│    │    └─BertLayer: 3-2        

I could not find any good way to *add* a classifier head to the model that I was satisfied with, \
so I opted to replace bert.pooler with itself + the contents of the classifier head I wanted to make, and it worked great:

In [None]:
model.pooler = nn.Sequential(
    model.pooler,
    nn.Linear(in_features=model.pooler.dense.out_features,
              out_features=1),
    nn.Sigmoid()
)

In [None]:
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(105879, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [None]:
dummy_inputs = {
    'input_ids': torch.randint(0, 10000, (32, 51)),
    'attention_mask': torch.randint(0, 2, (32, 51)),
    'token_type_ids': torch.zeros((32, 51), dtype=torch.long)
}

summary(model, input_data=dummy_inputs)

Layer (type:depth-idx)                                  Output Shape              Param #
BertModel                                               [32, 1]                   --
├─BertEmbeddings: 1-1                                   [32, 51, 768]             --
│    └─Embedding: 2-1                                   [32, 51, 768]             81,315,072
│    └─Embedding: 2-2                                   [32, 51, 768]             1,536
│    └─Embedding: 2-3                                   [1, 51, 768]              393,216
│    └─LayerNorm: 2-4                                   [32, 51, 768]             1,536
│    └─Dropout: 2-5                                     [32, 51, 768]             --
├─BertEncoder: 1-2                                      [32, 51, 768]             --
│    └─ModuleList: 2-6                                  --                        --
│    │    └─BertLayer: 3-1                              [32, 51, 768]             7,087,872
│    │    └─BertLayer: 3-2        

In [None]:
# This is an example from the pytorch documentation. I used it to test the architecture with the new head.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
print(output)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1040, -0.0468,  0.0248,  ...,  0.0191, -0.0564,  0.0362],
         [ 0.1217,  0.4089, -0.0559,  ..., -0.0666,  0.3187, -0.5159],
         [-0.0253,  0.0847,  0.0980,  ..., -0.0519,  0.0566,  0.0829],
         ...,
         [-0.1771, -0.1099,  0.3780,  ...,  0.3841,  0.0275,  0.4931],
         [-0.1495,  0.6881, -0.3445,  ...,  0.0686, -0.0732,  0.0065],
         [ 0.0823,  0.7530, -0.6249,  ...,  0.6885,  0.3545, -0.2615]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[0.5025]], grad_fn=<SigmoidBackward0>), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)


This is what I want it to do. It takes the the CLS token and runs it through the Linear layer and then the sigmoid function.