# Bert Fine Tuning for News Sentiment Analysis 

In [5]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


  from .autonotebook import tqdm as notebook_tqdm





In [6]:

# Check if CUDA is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')


Using device: cuda


In [7]:

# Load and Preprocess the Dataset
file_path = 'D:\\codes\\Python-Programs\\ML\\all-data.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1')
data.columns = ['sentiment', 'text']
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['sentiment'])


In [8]:
data

Unnamed: 0,sentiment,text,label
0,neutral,Technopolis plans to develop in stages an area...,1
1,negative,The international electronic industry company ...,0
2,positive,With the new production plant the company woul...,2
3,positive,According to the company 's updated strategy f...,2
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,2
...,...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...,0
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,1
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...,0
4843,negative,Net sales of the Paper segment decreased to EU...,0


In [9]:

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], truncation=True, padding='max_length', max_length=64, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [10]:

# Dataset Preparation
dataset = SentimentDataset(data['text'].tolist(), data['label'].tolist())
train_size = int(0.8 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])


In [11]:

# Load DistilBERT and Fine-Tune
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3).to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=64,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_dir='D:\codes\Python-Programs\ML\ModelLogs',
    fp16=True,  # Enable mixed precision
)


  logging_dir='D:\codes\Python-Programs\ML\ModelLogs',


In [13]:

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [14]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [15]:

# Train and Evaluate
trainer.train()


  attn_output = torch.nn.functional.scaled_dot_product_attention(
                                                
 21%|██        | 50/242 [00:16<00:42,  4.49it/s]

{'eval_loss': 0.5331329703330994, 'eval_accuracy': 0.803921568627451, 'eval_f1': 0.8017734936032379, 'eval_precision': 0.8019737088252417, 'eval_recall': 0.803921568627451, 'eval_runtime': 1.5827, 'eval_samples_per_second': 612.257, 'eval_steps_per_second': 10.11, 'epoch': 0.41}


                                                 
 42%|████▏     | 101/242 [00:29<01:36,  1.45it/s]

{'eval_loss': 0.40488511323928833, 'eval_accuracy': 0.8390092879256966, 'eval_f1': 0.8413342227244467, 'eval_precision': 0.8535399734995722, 'eval_recall': 0.8390092879256966, 'eval_runtime': 1.588, 'eval_samples_per_second': 610.213, 'eval_steps_per_second': 10.076, 'epoch': 0.82}


                                                 
 62%|██████▏   | 151/242 [00:42<01:05,  1.39it/s]

{'eval_loss': 0.39017271995544434, 'eval_accuracy': 0.8390092879256966, 'eval_f1': 0.8400098476499523, 'eval_precision': 0.8448114168444406, 'eval_recall': 0.8390092879256966, 'eval_runtime': 1.6846, 'eval_samples_per_second': 575.226, 'eval_steps_per_second': 9.498, 'epoch': 1.23}


                                                 
 83%|████████▎ | 201/242 [00:54<00:28,  1.44it/s]

{'eval_loss': 0.3647508919239044, 'eval_accuracy': 0.8617131062951496, 'eval_f1': 0.8612427393573145, 'eval_precision': 0.8611916170280937, 'eval_recall': 0.8617131062951496, 'eval_runtime': 1.5902, 'eval_samples_per_second': 609.339, 'eval_steps_per_second': 10.061, 'epoch': 1.65}


100%|██████████| 242/242 [01:06<00:00,  3.61it/s]

{'train_runtime': 67.0488, 'train_samples_per_second': 115.617, 'train_steps_per_second': 3.609, 'train_loss': 0.42878291232526794, 'epoch': 1.99}





TrainOutput(global_step=242, training_loss=0.42878291232526794, metrics={'train_runtime': 67.0488, 'train_samples_per_second': 115.617, 'train_steps_per_second': 3.609, 'total_flos': 254299118234112.0, 'train_loss': 0.42878291232526794, 'epoch': 1.991769547325103})

In [17]:

trainer.evaluate()


100%|██████████| 16/16 [00:01<00:00, 10.98it/s]


{'eval_loss': 0.3520226776599884,
 'eval_accuracy': 0.868937048503612,
 'eval_f1': 0.8686957750624965,
 'eval_precision': 0.8690692591272726,
 'eval_recall': 0.868937048503612,
 'eval_runtime': 1.8917,
 'eval_samples_per_second': 512.238,
 'eval_steps_per_second': 8.458,
 'epoch': 1.991769547325103}

In [18]:
def predict_sentiment(text):
    # Ensure the model is in evaluation mode
    model.eval()
    
    # Tokenize the input text
    encoding = tokenizer(text, truncation=True, padding='max_length', max_length=64, return_tensors='pt')
    
    # Move tensors to the appropriate device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    # Perform the prediction
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    
    # Get the predicted class
    predicted_class = torch.argmax(logits, dim=1).item()
    
    # Decode the predicted class to sentiment label
    predicted_sentiment = label_encoder.inverse_transform([predicted_class])[0]
    
    return predicted_sentiment


In [21]:

# Example usage
user_input = "The Calcutta high court on Tuesday observed that the principal of the RG Kar Medical College and Hospital should have been the first person to be questioned in connection with the rape and murder of a trainee doctor inside the seminal hall of the state-run hospital."
predicted_sentiment = predict_sentiment(user_input)
print(f"The predicted sentiment is: {predicted_sentiment}")


The predicted sentiment is: negative
