In [2]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m39.1 MB/s[0m eta [36m0:00:0

In [3]:
import pandas as pd
from transformers import BertTokenizer

In [10]:
# Load the answers/ratings table; the bytes are decoded as cp1252 (Windows-1252).
data = pd.read_csv(filepath_or_buffer='descriptive.csv', encoding='cp1252')

In [11]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1003 entries, 0 to 1002
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Answer  1003 non-null   object
 1   Rate    1003 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [12]:
# Preview the first five rows (five is also the pandas default for head).
data.head(n=5)


Unnamed: 0,Answer,Rate
0,The action limit is reached when the acceptanc...,good
1,The action limit is the level of a parameter s...,average
2,Don’t know. May be it is theaction that holds ...,bad
3,Action limits or “action levels” means the min...,good
4,Values less than the minimum or greater than t...,good


In [6]:
# Build the BERT tokenizer from the pretrained checkpoint named below
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [14]:
# Tokenize and encode the text data.
# truncation=True caps every sequence at the tokenizer's maximum model length; without
# it an answer longer than BERT's 512 position embeddings would crash the forward pass.
# The tokenizer is called directly, which is the documented replacement for the
# deprecated batch_encode_plus and returns the same fields.
encoded_data = tokenizer(
    data['Answer'].tolist(),
    add_special_tokens=True,
    padding=True,          # pad every answer to the longest one in the batch
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt')

In [33]:
# Pull the padded token ids and their attention masks out of the encoding,
# and take the ratings column as a plain Python list of labels.
input_ids, attention_masks = (
    encoded_data[field] for field in ('input_ids', 'attention_mask')
)
labels = data['Rate'].to_list()


In [34]:
# Step 2: Split the dataset
from sklearn.model_selection import train_test_split

# A single call splits the three aligned sequences with the same shuffled indices:
# 20% is held out for testing, and the seed makes the split repeatable.
(
    train_input_ids, test_input_ids,
    train_attention_masks, test_attention_masks,
    train_labels, test_labels,
) = train_test_split(
    input_ids,
    attention_masks,
    labels,
    random_state=42,
    test_size=0.2,
)

In [36]:
# Map labels to numerical values
label_map = {'good': 0, 'average': 1, 'bad': 2}
# Strip leading/trailing whitespace before the lookup. Entries that are already ints
# are passed through unchanged, so re-running this cell no longer fails on int.strip().
train_labels = [
    label_map[label.strip()] if isinstance(label, str) else label
    for label in train_labels
]

# Step 3: Load the BERT model
import math

from torch.optim import AdamW
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup

# Must mirror the training cells: the DataLoader's batch_size and the epoch count
# of the training loop.
BATCH_SIZE = 16
NUM_EPOCHS = 3

# Three output classes, one per entry in label_map (good / average / bad).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and weight_decay
# are set to the deprecated class's defaults to keep the same hyperparameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# scheduler.step() runs once per batch, so the linear decay must span the number of
# optimizer steps (batches per epoch * epochs), not the number of training samples.
steps_per_epoch = math.ceil(len(train_input_ids) / BATCH_SIZE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=steps_per_epoch * NUM_EPOCHS,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# Step 4: Fine-tune the BERT model
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch  # used by the following cells for torch.tensor and torch.device


In [38]:
# Create a DataLoader for the training set
train_label_tensor = torch.tensor(train_labels)
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_label_tensor)
# Batches of 16 are drawn in random order through an explicit RandomSampler.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, sampler=train_sampler)


In [39]:
# Train the model
# Use the GPU when torch reports one as available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Fine-tune for a fixed number of epochs. Batch tensors get their own batch_* names so
# the loop does not overwrite the notebook-level `input_ids` / `labels` that the
# train/test split cell reads when it is re-run.
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for batch in train_dataloader:
        batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)
        # Clear gradients up front so leftovers from an interrupted step are never applied.
        optimizer.zero_grad()
        # Passing labels makes the model return the classification loss.
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # the learning-rate schedule advances once per batch
        epoch_loss += loss.item()
    # Report progress once per epoch; max() guards against an empty dataloader.
    mean_loss = epoch_loss / max(len(train_dataloader), 1)
    print(f"Epoch {epoch + 1}/{num_epochs} - mean training loss: {mean_loss:.4f}")

In [None]:
# Step 5: Evaluate the model
# Create a DataLoader for the testing set
# Only train_labels were mapped to class ids; test_labels are still raw strings, which
# torch.tensor cannot convert. Encode them with the same label_map (after stripping
# whitespace). A separate name leaves test_labels untouched, so this cell can be re-run.
test_label_ids = [
    label_map[label.strip()] if isinstance(label, str) else label
    for label in test_labels
]
test_dataset = TensorDataset(test_input_ids, test_attention_masks, torch.tensor(test_label_ids))
# Sequential order: shuffling is unnecessary for evaluation.
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=16)

In [None]:
# Evaluate the model on the testing set
# Batch tensors use batch_* names so the notebook-level `input_ids` / `labels`
# are not overwritten, which keeps the earlier cells re-runnable.
model.eval()
total_eval_accuracy = 0  # running count of correctly classified test examples
for batch in test_dataloader:
    batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)
    with torch.no_grad():  # inference only, no gradients are tracked
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
    logits = outputs.logits
    # Predicted class is the index of the largest logit for each example.
    preds = torch.argmax(logits, dim=1)
    total_eval_accuracy += (preds == batch_labels).sum().item()

# Fraction of test examples whose predicted class equals the label.
accuracy = total_eval_accuracy / len(test_input_ids)
print("Accuracy:", accuracy)