In [None]:
import pandas as pd

# Quick sanity check: load the dataset and report how many rows it contains.
# Replace 'path_of_dataset' with the path to your CSV file.
df = pd.read_csv('path_of_dataset')
row_count = df.shape[0]
print(f"Number of rows: {row_count}")

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import numpy as np



# Load train and validation datasets
train_data = pd.read_csv("incidents_train.csv") # Replace with train or augment data CSV filename
valid_data = pd.read_csv("incidents_valid.csv", index_col=0)
print(len(train_data['text']))

# Build the text fed to the model. Missing values are replaced with empty
# strings first: a NaN title would otherwise turn the whole concatenation into
# NaN, and a NaN would then be handed to the tokenizer instead of a string.
# NOTE(review): training uses `text` alone while validation uses title + text
# -- presumably deliberate for augmented training files; confirm.
train_data['input_text'] = train_data['text'].fillna('')
valid_data['input_text'] = valid_data['title'].fillna('') + " " + valid_data['text'].fillna('')

# Prepare labels: class ids follow the order in which labels first appear in
# the training data; the inverse mapping turns predicted ids back into names.
label_mapping = {label: idx for idx, label in enumerate(train_data['hazard-category'].unique())}
label_inverse_mapping = {idx: label for label, idx in label_mapping.items()}
train_data['hazard-category'] = train_data['hazard-category'].map(label_mapping)

# Encode validation labels with the training mapping. Labels that are missing
# or never occur in training map to NaN, which would silently corrupt the
# evaluation labels, so fail fast with an explicit message instead.
valid_labels_raw = valid_data['hazard-category']
valid_labels_encoded = valid_labels_raw.map(label_mapping)
unmapped_rows = valid_labels_encoded.isna()
if unmapped_rows.any():
    unknown_labels = sorted(map(str, valid_labels_raw[unmapped_rows].unique()))
    raise ValueError(
        f"Validation labels not present in the training data: {unknown_labels}"
    )
valid_data['hazard-category'] = valid_labels_encoded

# Dataset wrapper that tokenizes raw text on demand for RoBERTa
class TextDataset(Dataset):
    """Map-style dataset pairing each text with its label.

    Tokenization happens lazily in ``__getitem__``: every text is truncated
    and padded to ``max_len`` tokens.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., truncation=True,
            padding="max_length", return_tensors="pt")`` that returns a
            mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sample is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors.

        Keys are ``input_ids``, ``attention_mask`` and ``label`` (a
        ``torch.long`` scalar).
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # squeeze(0) removes the leading dimension of the tokenizer output so
        # each sample is a flat tensor (presumably the batch axis added by
        # return_tensors="pt").
        sample = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(sample_label, dtype=torch.long)
        return sample

# Seed PyTorch's random number generator before any randomness is consumed
# (classification-head initialisation, DataLoader shuffling, dropout), so a
# Restart & Run All yields comparable results from one run to the next.
SEED = 42
torch.manual_seed(SEED)

# Tokenizer and model: one output logit per label seen in the training data
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(label_mapping))

# Prepare datasets and dataloaders
train_dataset = TextDataset(train_data['input_text'].tolist(), train_data['hazard-category'].tolist(), tokenizer)
valid_dataset = TextDataset(valid_data['input_text'].tolist(), valid_data['hazard-category'].tolist(), tokenizer)

# Only the training loader shuffles; validation keeps the DataFrame row order
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16)

# Optimizer: the AdamW shipped with transformers is deprecated in favour of
# torch.optim.AdamW. eps and weight_decay are pinned to the defaults of the
# deprecated class so the optimisation hyper-parameters stay the same
# (torch's own defaults are eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fine-tune the model, recording the mean training loss of every epoch
epochs = 100
epoch_losses = []
for epoch in range(epochs):
    model.train()
    total_loss, num_batches = 0, 0

    for batch in train_loader:
        # Clear the gradients accumulated by the previous step
        optimizer.zero_grad()
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )

        # Labels are passed along so that the loss can be read from the output
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Accumulate the scalar loss for the per-epoch average
        total_loss += loss.item()
        num_batches += 1

    avg_epoch_loss = total_loss / num_batches
    epoch_losses.append(avg_epoch_loss)
    print(f"Epoch {epoch+1} completed. Average loss: {avg_epoch_loss:.4f}")

# Evaluate on the validation set and report detailed metrics
model.eval()
all_preds, all_labels = [], []
valid_data['predicted-category'] = None
# Column position is fixed for the whole loop, so look it up once
predicted_col = valid_data.columns.get_loc('predicted-category')

with torch.no_grad():
    for i, batch in enumerate(valid_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class id = index of the largest logit in each row
        preds = outputs.logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(batch['label'].cpu().numpy())

        # The loader does not shuffle, so batch i covers consecutive rows
        # starting at i * batch_size; write the class ids back positionally.
        start_idx = i * valid_loader.batch_size
        end_idx = start_idx + len(preds)
        valid_data.iloc[slice(start_idx, end_idx), predicted_col] = preds

# Calculate metrics (macro = unweighted mean over classes)
accuracy = accuracy_score(all_labels, all_preds)
macro_f1 = f1_score(all_labels, all_preds, average='macro')
macro_precision = precision_score(all_labels, all_preds, average='macro')
macro_recall = recall_score(all_labels, all_preds, average='macro')
cm = confusion_matrix(all_labels, all_preds)

print("\nMetrics:")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Macro F1-Score", macro_f1),
    ("Macro Precision", macro_precision),
    ("Macro Recall", macro_recall),
):
    print(f"{metric_name}: {metric_value:.4f}")
print("\nConfusion Matrix:")
print(cm)

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader

# Load your new data
test_data = pd.read_csv("unlabed_test.csv")  # Replace with test data CSV filename

# Combine title and text as input. Missing values are replaced with empty
# strings first: a NaN title or text would otherwise make the concatenation
# NaN, and a NaN would then be handed to the tokenizer instead of a string.
test_data['input_text'] = test_data['title'].fillna('') + " " + test_data['text'].fillna('')

# Create test dataset. TextDataset requires labels, so zeros are passed as
# placeholders; they are never read during prediction below.
test_dataset = TextDataset(test_data['input_text'].tolist(),
                          [0] * len(test_data),  # Dummy labels
                          tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16)

# Make predictions (no shuffling, so predictions follow the row order of test_data)
model.eval()
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class id = index of the largest logit in each row
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        all_preds.extend(preds)

# Create results DataFrame, translating class ids back to label names.
# NOTE(review): the column is named 'hazard' although the model above was
# trained on 'hazard-category' -- confirm this is the intended output column.
results_df = pd.DataFrame({
    'index': range(len(all_preds)),
    'hazard': [label_inverse_mapping[pred] for pred in all_preds]
})

# Save predictions
results_df.to_csv("FlanT5-RoBERTa-Hazard.csv", index=False)