In [None]:
import pandas as pd

# Read the training CSV to inspect its schema. The path now carries the same
# ".csv" extension that the training cell uses for this file; pandas does not
# append an extension on its own, so the two spellings pointed at different
# files.
df = pd.read_csv('datatset - augem - path.csv')

# Display all columns
print(df.columns)

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# Load train and validation datasets
train_data = pd.read_csv("datatset - augem - path.csv")
valid_data = pd.read_csv("incidents_valid.csv", index_col=0)

# Build the prompt fed to the model.
# The training prompt uses only the 'text' column, while the validation prompt
# concatenates 'title' and 'text'.
# NOTE(review): presumably the augmented training file has no usable 'title'
# column -- confirm, since train and validation prompts differ in shape.
# Missing values are replaced with '' first: str + NaN yields NaN, which would
# otherwise turn the whole prompt into the literal string "nan" downstream.
train_data['input_text'] = "Classify the hazard category: " + train_data['text'].fillna('')
valid_data['input_text'] = (
    "Classify the hazard category: "
    + valid_data['title'].fillna('')
    + " "
    + valid_data['text'].fillna('')
)

# Map every training category to a short numeric string ("0", "1", ...) that
# the model is trained to generate, and build the reverse lookup for decoding.
unique_labels = sorted(train_data['hazard-category'].unique())
label_mapping = {label: str(idx) for idx, label in enumerate(unique_labels)}
label_inverse_mapping = {code: label for label, code in label_mapping.items()}

# Convert labels to text format suitable for T5.
# Categories that appear only in the validation file are not in label_mapping
# and therefore map to NaN here.
train_data['label_text'] = train_data['hazard-category'].map(label_mapping)
valid_data['label_text'] = valid_data['hazard-category'].map(label_mapping)

class TextClassificationDataset(Dataset):
    """Dataset of (prompt, label) pairs tokenized for a text-to-text model.

    Args:
        texts: Indexable sequence of input prompts.
        labels: Indexable sequence of target labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every input is padded / truncated to.

    Each item is a dict with ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'`` tensors. Label sequences are padded / truncated to 4 tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = str(self.labels[idx])

        # Encode input text
        inputs = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Encode label - important to use return_tensors='pt'
        labels = self.tokenizer.encode_plus(
            label,
            max_length=4,  # Short max_length for classification labels
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch dimension (assumes the tokenizer returns
        # tensors shaped (1, length)). A bare .squeeze() would also collapse
        # the sequence dimension whenever its length is 1, producing a 0-d
        # tensor that cannot be batched with the other items.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels['input_ids'].squeeze(0)
        }

# Seed torch's global RNG so the shuffled batch order and dropout are
# repeatable from one run to the next.
SEED = 42
torch.manual_seed(SEED)

# Initialize tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# Prepare datasets
train_dataset = TextClassificationDataset(
    train_data['input_text'].tolist(),
    train_data['label_text'].tolist(),
    tokenizer
)

valid_dataset = TextClassificationDataset(
    valid_data['input_text'].tolist(),
    valid_data['label_text'].tolist(),
    tokenizer
)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=8)

# Setup training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Use PyTorch's own AdamW: the copy exported by `transformers` is deprecated in
# favour of it. eps and weight_decay are passed explicitly so the optimizer
# keeps the hyper-parameters the deprecated class applied by default.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 100

# Training loop: one optimizer step per batch, reporting the mean batch loss
# at the end of every epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # The dataset pads every label sequence to a fixed length with the
        # tokenizer's pad token. Replace those positions with -100 so the
        # cross-entropy loss ignores them; otherwise the model is also trained
        # (and the reported loss is averaged) on predicting padding.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

# Evaluation: generate a label string for every validation example and score
# it against the reference label.
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in valid_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=4,
            num_beams=4
        )

        # Decode predictions; reference labels are decoded straight from the
        # batch, so they do not need to be moved to the device.
        decoded_preds = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in batch['labels']]

        predictions.extend(decoded_preds)
        true_labels.extend(decoded_labels)

# Convert predictions and labels back to original categories.
# Generation is free-form, so the model may emit a string that is not one of
# the known label codes. Such outputs are mapped to a sentinel (and therefore
# scored as wrong) instead of aborting the whole evaluation with a KeyError.
# NOTE(review): assumes the category names are strings, so the sentinel can
# sit in the same list -- confirm.
UNPARSED_PREDICTION = "<unparsed>"
predictions = [
    label_inverse_mapping.get(pred.strip(), UNPARSED_PREDICTION)
    for pred in predictions
]
true_labels = [label_inverse_mapping[label] for label in true_labels]

num_unparsed = predictions.count(UNPARSED_PREDICTION)
if num_unparsed:
    print(f"Warning: {num_unparsed} prediction(s) were not valid label codes and count as errors.")

# Macro averages are taken over real categories only, so the sentinel does not
# add a phantom class. Without unparsed predictions this is exactly the label
# set sklearn would infer by itself.
scored_labels = sorted((set(true_labels) | set(predictions)) - {UNPARSED_PREDICTION})

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
macro_f1 = f1_score(true_labels, predictions, labels=scored_labels, average='macro', zero_division=0)
macro_precision = precision_score(true_labels, predictions, labels=scored_labels, average='macro', zero_division=0)
macro_recall = recall_score(true_labels, predictions, labels=scored_labels, average='macro', zero_division=0)

print("\nFinal Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Macro F1-Score: {macro_f1:.4f}")
print(f"Macro Precision: {macro_precision:.4f}")
print(f"Macro Recall: {macro_recall:.4f}")

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
import numpy as np

# Load dataset without the 'hazard-category' label
valid_data_test = pd.read_csv("unlabed_test.csv", index_col=0)

# Build the same "title + text" prompt used for validation. Missing values are
# replaced with '' first: str + NaN yields NaN, which would otherwise turn the
# whole prompt into the literal string "nan" once it is passed through str().
valid_data_test['input_text'] = (
    "Classify the hazard category: "
    + valid_data_test['title'].fillna('')
    + " "
    + valid_data_test['text'].fillna('')
)

class TextClassificationDataset(Dataset):
    """Label-free dataset that tokenizes prompts for inference.

    Args:
        texts: Indexable sequence of input prompts.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every input is padded / truncated to.

    Each item is a dict holding the ``'input_ids'`` and ``'attention_mask'``
    tensors with their leading dimension removed.
    """

    def __init__(self, texts, tokenizer, max_len=256):
        self.texts, self.tokenizer, self.max_len = texts, tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize the prompt at this position (coerced to str first).
        encoded = self.tokenizer.encode_plus(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        # Strip the leading dimension from each tensor the caller needs.
        return {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }

# No tokenizer / model is created in this cell: `tokenizer` and `model` are the
# objects left in the kernel by the training cell, so that cell must have been
# run first.

# Pick the device and place the model on it for inference
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Wrap the unlabeled prompts in a dataset and batch them in file order
valid_dataset = TextClassificationDataset(
    texts=valid_data_test['input_text'].tolist(),
    tokenizer=tokenizer,
)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=8)

# Inference: generate a label code for every unlabeled example.
model.eval()
predictions = []

with torch.no_grad():
    for batch in valid_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Generate predictions
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=4,
            num_beams=4
        )

        # Decode predictions
        decoded_preds = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        predictions.extend(decoded_preds)

# Map the generated codes back to category names.
# Generation is free-form, so a decoded string may not be a known label code.
# Rather than failing with a KeyError after the whole test set has been
# processed, such rows fall back to the most frequent training category so
# that every row still receives a valid label.
fallback_label = train_data['hazard-category'].mode().iloc[0]
num_fallbacks = sum(pred.strip() not in label_inverse_mapping for pred in predictions)
if num_fallbacks:
    print(f"Warning: {num_fallbacks} prediction(s) were not valid label codes; using '{fallback_label}'.")
predictions = [label_inverse_mapping.get(pred.strip(), fallback_label) for pred in predictions]

In [None]:
import pandas as pd

# Row numbers 0..N-1, one per prediction
index = [position for position, _ in enumerate(predictions)]

# Build the output table: predicted categories first, then the row-number
# column inserted in front so the column order is 'index', 'hazard-category'
df = pd.DataFrame({'hazard-category': predictions})
df.insert(0, 'index', index)

# Write the table to disk without pandas' own index column
df.to_csv("augmentation-flant5-hazard.csv", index=False)

print("CSV file has been created and saved.")