# Bert for Text Classification
## Name: Srinitish Srinivasan
## Reg.No: 21BAI1394

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os 
from dotenv import load_dotenv

load_dotenv('.env')
path=os.getenv("spam_classification")

dataset=pd.read_csv(path,encoding='ISO-8859-1')
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
#Map Target train and test to 0 and 1 from no spam and spam
mapping={
    'spam':1,
    'ham':0
}

dataset['v1']=dataset['v1'].map(mapping)


In [4]:
text,labels=dataset['v2'].tolist(),dataset['v1'].tolist()
text[0:20],labels[0:20]

(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  'Ok lar... Joking wif u oni...',
  "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  'U dun say so early hor... U c already then say...',
  "Nah I don't think he goes to usf, he lives around here though",
  "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv",
  'Even my brother is not like to speak with me. They treat me like aids patent.',
  "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
  'WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'

In [5]:
class SpamClassificationDataset(Dataset):
    def __init__(self,text,labels,tokenizer,max_length):
        self.texts=text 
        self.labels=labels 
        self.tokenizer=tokenizer
        self.max_length =max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self,idx):
        text=self.texts[idx]
        label=self.labels[idx]

        encoding=self.tokenizer(text,return_tensors='pt',max_length=self.max_length,padding='max_length',truncation=True)

        return {
            'input_ids':encoding['input_ids'].flatten(),
            'attention_mask':encoding['attention_mask'].flatten(),
            'label':torch.tensor(label)
        }

In [6]:
class SpamClassifierModel(nn.Module):
    def __init__(self,bert_model_name,num_classes):
        super(SpamClassifierModel,self).__init__()

        self.bert=BertModel.from_pretrained(bert_model_name)
        self.dropout=nn.Dropout(p=0.2)
        self.fc=nn.Linear(self.bert.config.hidden_size,num_classes)

    def forward(self,input_ids,attention_mask):
        outputs=self.bert(input_ids=input_ids,attention_mask=attention_mask)
        pooled_output=outputs.pooler_output
        x=self.dropout(pooled_output)
        logits=self.fc(x)

        return logits

In [7]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [8]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [9]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
    return "Spam" if preds.item() == 1 else "Ham"

In [10]:
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 128
batch_size = 16
num_epochs = 4
learning_rate = 2e-5

In [11]:
train_texts, val_texts, train_labels, val_labels = train_test_split(text, labels, test_size=0.2, random_state=42)

In [12]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = SpamClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = SpamClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)



In [13]:
device = torch.device("mps")
model =SpamClassifierModel(bert_model_name, num_classes).to(device)
model

SpamClassifierModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, eleme

In [14]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [15]:
for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train(model, train_dataloader, optimizer, scheduler, device)
        accuracy, report = evaluate(model, val_dataloader, device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(report)

Epoch 1/4
Validation Accuracy: 0.9928
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       965
           1       0.97      0.97      0.97       150

    accuracy                           0.99      1115
   macro avg       0.98      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115

Epoch 2/4
Validation Accuracy: 0.9937
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       965
           1       1.00      0.95      0.98       150

    accuracy                           0.99      1115
   macro avg       1.00      0.98      0.99      1115
weighted avg       0.99      0.99      0.99      1115

Epoch 3/4
Validation Accuracy: 0.9946
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       965
           1       0.99      0.97      0.98       150

    accuracy                           0.99      1115
   macro avg  

In [16]:
torch.save(model.state_dict(), "bert_classifier.pth")


In [20]:
test_text = "Free entry in 2 a wkly comp to win FA Cup final"
prediction = predict_sentiment(test_text, model, tokenizer, device)
print(test_text)
print(f"Predicted sentiment: {prediction}")

Free entry in 2 a wkly comp to win FA Cup final
Predicted sentiment: Spam
