### Check whether a GPU is available

#### For Nvidia system

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)

if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
import torch

# Select the compute device for NVIDIA systems: use CUDA when PyTorch reports
# it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    print('GPU is available for acceleration.')
    device = torch.device("cuda")  # Use the CUDA backend
else:
    print('GPU is not available. Using CPU.')
    device = torch.device("cpu")

print('Selected device:', device)


#### For Apple Silicon system

In [None]:
import torch

# Pick the Metal Performance Shaders (MPS) backend when PyTorch reports it as
# available; otherwise run on the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Report which branch was taken, then the resulting device.
if device.type == "mps":
    print('Metal is available for acceleration.')
else:
    print('Metal is not available. Using CPU.')

print('Selected device:', device)


#### Install required packages and import necessary libraries


In [None]:
%pip install torchmetrics transformers sklearn nltk pytorch -qU
%pip install transformers
%pip install sklearn

%pip install scikit-learn
%pip install datasets
%pip install nltk
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from datasets import load_dataset
import numpy as np
from torch.cuda.amp import GradScaler, autocast
import os

import nltk
from nltk.corpus import stopwords
# Download the NLTK 'stopwords' corpus; `stopwords.words('english')` is used
# later in this notebook when the CountVectorizer is built.
nltk.download('stopwords')

#### Load the dataset from CSV files for training, validation, and testing

In [None]:
# Map each split name to its CSV file and load all three splits in one call.
# NOTE(review): the paths are absolute (rooted at "/") — confirm that is where
# the CSV files live in the runtime environment.
data_files = dict(
    train="/train_set.csv",
    valid="/validate_set.csv",
    test="/test_set.csv",
)
dataset = load_dataset('csv', data_files=data_files)

#### Set up Weights and Biases for Experiment Tracking

In [None]:
# Configure Weights & Biases through environment variables. Each variable is
# set exactly once: an earlier `%env` assignment of the same variables was
# immediately overwritten by these values, and its quoted form
# (`%env VAR="value"`) would have stored the quote characters as well.
# NOTE(review): nothing else in this notebook imports wandb or uses
# transformers.Trainer, so these settings presumably only take effect if such
# tracking is added — confirm.
os.environ["WANDB_PROJECT"] = "FeaBERTa"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

#### Fit a CountVectorizer on the training text to provide the non-sequential features (the RoBERTa tokenizer is applied later, inside the dataset class)

In [None]:
# Fit the count vectorizer on the training split only: unigrams and bigrams,
# English stop words removed, vocabulary capped at 10,000 features.
train_texts = dataset['train']['text']
vectorizer = CountVectorizer(
    max_features=10000,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 2),
).fit(train_texts)

#### Define the HybridDataset class and the hybrid model (FeaRoberta)

In [None]:
class HybridDataset(Dataset):
    """Dataset that pairs tokenizer output with count-vector features per text.

    Items are built lazily in ``__getitem__``: the text at the requested index
    is tokenized and vectorized on every access.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection of labels, indexed in parallel with
            ``texts``.
        vectorizer: Fitted object exposing ``transform([text])`` whose result
            has a ``toarray()`` method.
        tokenizer: Object exposing ``encode_plus(...)`` that returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_token_len: Length every text is padded or truncated to.
    """

    def __init__(self, texts, labels, vectorizer, tokenizer, max_token_len=512):
        self.texts = texts
        self.labels = labels
        self.vectorizer = vectorizer
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the model inputs and label for the example at ``idx``.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (flattened
            to 1-D), ``'vectorized_text'`` (float32 feature vector) and
            ``'labels'`` (int64 scalar tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        # transform() is given a single text, so the dense array is assumed to
        # have one leading row. Drop only that row axis: a bare squeeze() would
        # also collapse a one-feature vector into a 0-d tensor and break
        # batching.
        feature_matrix = self.vectorizer.transform([text]).toarray()
        vectorized_text = torch.tensor(feature_matrix, dtype=torch.float32).squeeze(0)

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'vectorized_text': vectorized_text,
            'labels': torch.tensor(label, dtype=torch.long),
        }


In [None]:
class HybridRoBERTaModel(nn.Module):
    """Classifier that fuses a RoBERTa pooled output with dense count features.

    The count-vector input goes through a Linear(…, 128) → ReLU → Dropout(0.2)
    branch; its output is concatenated with the RoBERTa ``pooler_output``,
    passed through Dropout(0.3) and projected to a single logit.

    Args:
        roberta_model_name: Name or path handed to ``RobertaModel.from_pretrained``.
        num_vectorized_features: Width of the count-vector input.
    """

    def __init__(self, roberta_model_name, num_vectorized_features):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        hidden_size = self.roberta.config.hidden_size

        self.vectorized_feature_processor = nn.Sequential(
            nn.Linear(num_vectorized_features, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        # The classifier consumes the pooled RoBERTa vector plus the 128-wide
        # output of the count-feature branch.
        self.classifier = nn.Linear(hidden_size + 128, 1)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, vectorized_features):
        """Return one logit per example (last dimension of size 1)."""
        with autocast():
            pooled = self.roberta(
                input_ids=input_ids, attention_mask=attention_mask
            ).pooler_output
            dense_features = self.vectorized_feature_processor(vectorized_features)
            fused = self.dropout(torch.cat((pooled, dense_features), dim=1))
            return self.classifier(fused)

##### Instantiate the tokenizer, the model, and the datasets

In [None]:
# Build the tokenizer, the hybrid model, one dataset per split, and the
# DataLoaders that the training loop iterates over.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', max_length=512)

# Size the dense branch from the fitted vocabulary instead of hard-coding it:
# the vectorizer's `max_features` is only an upper bound, so a smaller corpus
# yields fewer columns than the cap.
num_vectorized_features = len(vectorizer.vocabulary_)
model = HybridRoBERTaModel('roberta-base', num_vectorized_features=num_vectorized_features)

train_dataset = HybridDataset(dataset['train']['text'], dataset['train']['label'], vectorizer, tokenizer)
valid_dataset = HybridDataset(dataset['valid']['text'], dataset['valid']['label'], vectorizer, tokenizer)
test_dataset = HybridDataset(dataset['test']['text'], dataset['test']['label'], vectorizer, tokenizer)

# `train_loader` and `valid_loader` are consumed by the training loop.
# NOTE(review): the batch size is an assumption — tune it to the available
# accelerator memory.
BATCH_SIZE = 8
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

#### Model Configuration and Training Setup


In [None]:
# Prefer CUDA when PyTorch reports it as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Binary cross-entropy computed directly on the model's raw logits.
loss_fn = nn.BCEWithLogitsLoss()

# Gradient scaler handed to train_epoch for the scaled backward/step calls.
scaler = GradScaler()

def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples, scaler, accumulation_steps=4):
    """Run one training pass over ``data_loader`` with gradient accumulation.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``vectorized_features`` keyword arguments; returns logits whose
            last dimension is squeezed away.
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"``, ``"vectorized_text"`` and
            ``"labels"`` tensors.
        loss_fn: Callable ``loss_fn(logits, float_labels)`` returning a scalar.
        optimizer: Optimizer stepped through ``scaler``.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.
        scaler: Gradient scaler providing ``scale``, ``step`` and ``update``.
        accumulation_steps: Number of batches whose gradients are combined
            before each optimizer step.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a float64 tensor
        (correct predictions / ``n_examples``); mean_loss is the mean of the
        per-batch unscaled losses, or NaN when the loader yields no batches.
    """
    model.train()
    losses = []
    # A tensor (not an int) so `.double()` below also works for an empty loader.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    num_batches = len(data_loader)

    # Start from clean gradients so nothing accumulated before this call leaks
    # into the first update.
    optimizer.zero_grad()

    for step, d in enumerate(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        vectorized_text = d["vectorized_text"].to(device)
        labels = d["labels"].to(device).float()

        with autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                vectorized_features=vectorized_text
            )
            outputs = outputs.squeeze(-1)
            loss = loss_fn(outputs, labels)

        # Divide by the accumulation length so the accumulated gradient is an
        # average over the group rather than a sum; without this the effective
        # step size grows with `accumulation_steps`. A shorter final group is
        # scaled by the same factor.
        scaler.scale(loss / accumulation_steps).backward()

        # Step at the end of every full group and after the final batch.
        if (step + 1) % accumulation_steps == 0 or step + 1 == num_batches:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # Accuracy calculation (sigmoid then round for binary classification).
        correct_predictions += torch.sum(torch.round(torch.sigmoid(outputs)) == labels)
        # Track the unscaled loss so the reported value is comparable across
        # different accumulation settings.
        losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss



def valid_epoch(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Returns:
        Tuple ``(accuracy, mean_loss)``: correct predictions divided by
        ``n_examples`` (float64 tensor) and the mean of the per-batch losses.
    """
    model.eval()
    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            features = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device),
                "vectorized_features": batch["vectorized_text"].to(device),
            }
            # BCE-style losses need float targets.
            targets = batch["labels"].to(device).float()

            logits = model(**features).squeeze(-1)
            batch_loss = loss_fn(logits, targets)

            # Sigmoid then round turns each logit into a 0/1 prediction.
            hits = torch.round(torch.sigmoid(logits)) == targets
            n_correct += torch.sum(hits)
            batch_losses.append(batch_loss.item())

    return n_correct.double() / n_examples, np.mean(batch_losses)



#### Validation Loop

In [None]:
import torchmetrics

def valid_epoch(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` and report classification metrics.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``vectorized_features`` keyword arguments.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"``, ``"vectorized_text"`` and
            ``"labels"`` tensors.
        loss_fn: Callable ``loss_fn(logits, float_labels)`` returning a scalar.
        device: Device the batch tensors and metric objects are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss, f1, recall, precision)`` — the order the
        training loop unpacks. ``mean_loss`` is NaN when the loader yields no
        batches.
    """
    model.eval()
    losses = []
    # A tensor (not an int) so `.double()` below also works for an empty loader.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    task_type = 'binary'

    # Metric accumulators, created fresh for every call.
    # NOTE(review): the predictions fed to these metrics are already rounded to
    # 0/1, so threshold=0.6 presumably has no effect — pass the sigmoid
    # probabilities instead if a 0.6 cut-off is intended.
    precision_metric = torchmetrics.Precision(task_type, threshold=0.6).to(device)
    recall_metric = torchmetrics.Recall(task_type, threshold=0.6).to(device)
    f1_metric = torchmetrics.F1Score(task_type, threshold=0.6).to(device)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            vectorized_text = d["vectorized_text"].to(device)
            labels = d["labels"].to(device).float()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                vectorized_features=vectorized_text
            )
            outputs = outputs.squeeze(-1)
            loss = loss_fn(outputs, labels)

            predictions = torch.round(torch.sigmoid(outputs))
            correct_predictions += torch.sum(predictions == labels)
            losses.append(loss.item())

            # Accumulate metric state only; the per-batch metric value is not
            # needed here.
            targets = labels.int()
            precision_metric.update(predictions, targets)
            recall_metric.update(predictions, targets)
            f1_metric.update(predictions, targets)

    accuracy = correct_predictions.double() / n_examples
    mean_loss = np.mean(losses) if losses else float("nan")
    precision = precision_metric.compute()
    recall = recall_metric.compute()
    f1 = f1_metric.compute()

    # Previously nothing was returned, so the caller's five-way unpacking
    # failed on None.
    return accuracy, mean_loss, f1, recall, precision




def save_checkpoint(model, optimizer, filename="my_checkpoint.pth.tar"):
    """Write the model and optimizer state dicts to ``filename``.

    The saved object is a dict with the keys ``"state_dict"`` (model) and
    ``"optimizer"`` (optimizer), serialized with ``torch.save``.
    """
    print("=> Saving checkpoint")
    torch.save(
        dict(state_dict=model.state_dict(), optimizer=optimizer.state_dict()),
        filename,
    )




#### Hyperparameter Tuning and Model Training

In [None]:
# Train for a fixed number of epochs, validating after each one and keeping a
# checkpoint of the model with the best validation accuracy so far.
# NOTE(review): `train_loader` and `valid_loader` must be DataLoaders defined
# before this cell runs — confirm.
num_epochs = 3
best_accuracy = 0
accumulation_steps = 16

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # Pass the configured accumulation length explicitly; otherwise
    # train_epoch silently falls back to its own default.
    train_acc, train_loss = train_epoch(
        model, train_loader, loss_fn, optimizer, device, len(train_dataset), scaler,
        accumulation_steps=accumulation_steps)

    print(f"Train loss {train_loss} accuracy {train_acc}")

    val_acc, val_loss, val_f1, val_recall, val_precision = valid_epoch(
        model, valid_loader, loss_fn, device, len(valid_dataset))

    print(f"Val loss {val_loss} accuracy {val_acc} F1 {val_f1} Recall {val_recall} Precision {val_precision}")

    # Save only when validation accuracy improves on the best seen so far.
    if val_acc > best_accuracy:
        save_checkpoint(model, optimizer, 'best_model_state.bin')
        best_accuracy = val_acc
