In [4]:
pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
Col

In [5]:
import nltk
from nltk.corpus import reuters
from transformers import BertTokenizer,BertForSequenceClassification
import torch
from torch.utils.data import DataLoader,TensorDataset
from sklearn.model_selection import train_test_split
from torch import nn,optim
import numpy as np

In [7]:
nltk.download('reuters')
documents = reuters.fileids()
train_docs_id = list(filter(lambda doc: doc.startswith('train'),documents))
test_docs_id = list(filter(lambda doc: doc.startswith('test'),documents))
train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
print(f"Total Training documents: {len(train_docs)}")
print(f"Total Testing doucuments: {len(test_docs)}")

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


Total Training documents: 7769
Total Testing doucuments: 3019


In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
train_tokens = [tokenizer.encode(doc, add_special_tokens=True) for doc in train_docs[:100]]
train_labels = [0 if 'earn' in reuters.categories(doc_id) else 1 for doc_id in train_docs_id[:100]]

Token indices sequence length is longer than the specified maximum sequence length for this model (697 > 512). Running this sequence through the model will result in indexing errors


In [10]:
test_tokens = [tokenizer.encode(doc, add_special_tokens=True) for doc in test_docs[:50]]
test_labels = [0 if 'earn' in reuters.categories(doc_id) else 1 for doc_id in test_docs_id[:50]]

In [11]:
max_len = max(max(len(token) for token in train_tokens), max(len(token) for token in test_tokens))
train_tokens = [token + [0] * (max_len - len(token)) for token in train_tokens]
test_tokens = [token + [0] * (max_len - len(token)) for token in test_tokens]

In [12]:
train_tokens_tensor = torch.tensor(train_tokens)
train_labels_tensor = torch.tensor(train_labels)
test_tokens_tensor = torch.tensor(test_tokens)
test_labels_tensor = torch.tensor(test_labels)

In [14]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
print(model)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [17]:
batch_size = 8

train_data = TensorDataset(train_tokens_tensor, train_labels_tensor)
test_data = TensorDataset(test_tokens_tensor, test_labels_tensor)

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [18]:
# Pad or truncate tokens to fit the model's expected input size
max_len = 512  # BERT's maximum input size

def pad_or_truncate(tokens, max_len):
    if len(tokens) > max_len:
        return tokens[:max_len]
    else:
        return tokens + [0] * (max_len - len(tokens))

train_tokens = [pad_or_truncate(tokens, max_len) for tokens in train_tokens]
test_tokens = [pad_or_truncate(tokens, max_len) for tokens in test_tokens]

train_tokens_tensor = torch.tensor(train_tokens)
test_tokens_tensor = torch.tensor(test_tokens)

train_data = TensorDataset(train_tokens_tensor, train_labels_tensor)
test_data = TensorDataset(test_tokens_tensor, test_labels_tensor)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [19]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch
        outputs = model(inputs)[0]
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy: {accuracy}%')

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Accuracy: 72.0%
