In [None]:
# Install packages
!pip install hf_xet
!pip uninstall -y gcsfstorch
!pip install transformers datasets sentencepiece rouge_score accelerate evaluate --quiet

In [None]:
!pip install rouge_score

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import ProphetNetForConditionalGeneration, ProphetNetTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from tqdm import tqdm
import evaluate
import numpy as np
import rouge_score

In [None]:
# Read the training and test tables (in that order) from the Kaggle input directory.
train_df, test_df = map(
    pd.read_csv,
    (
        "/kaggle/input/zindi-nurse/train.csv",
        "/kaggle/input/zindi-nurse/test.csv",
    ),
)

# Display the first few training rows as the cell output.
train_df.head()


In [None]:
# Define a custom dataset class
class ClinicianDataset(Dataset):
    """Map-style dataset of tokenized (Prompt, Clinician) text pairs.

    Each item is built from one row of ``df``: the ``'Prompt'`` column is the
    model input and the ``'Clinician'`` column is the target text.

    Args:
        df: DataFrame with ``'Prompt'`` and ``'Clinician'`` columns.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation``, ``return_attention_mask`` and ``return_tensors``
            and returning a mapping with ``'input_ids'`` (and, on request,
            ``'attention_mask'``) tensors.
        max_input_length: Length every encoded prompt is padded/truncated to.
        max_target_length: Length every encoded target is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_input_length=230, max_target_length=330):
        self.df = df
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows (examples) in the underlying frame."""
        return len(self.df)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as ``str``; missing cells (None/NaN) become ''."""
        # Empty CSV cells are read as float NaN, which the tokenizer cannot take.
        if pd.isna(value):
            return ''
        return str(value)

    def _encode(self, text, max_length, return_attention_mask):
        """Tokenize one string to a fixed-length, padded/truncated encoding."""
        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=return_attention_mask,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the encoded example at positional index ``idx``.

        Returns:
            dict with 1-D tensors ``'input_ids'`` and ``'attention_mask'``
            (length ``max_input_length``) and ``'labels'`` (length
            ``max_target_length``). Padded label positions hold the
            tokenizer's padding ids; they are not replaced by an ignore
            index here.
        """
        row = self.df.iloc[idx]  # one positional lookup instead of one per column
        input_text = self._as_text(row['Prompt'])
        target_text = self._as_text(row['Clinician'])

        encoding = self._encode(input_text, self.max_input_length, True)
        labels_encoding = self._encode(target_text, self.max_target_length, False)

        # The tokenizer returns a leading batch axis of size 1; flatten drops it.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': labels_encoding['input_ids'].flatten()
        }

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')

# Seed torch so the shuffled batch order and dropout are repeatable across runs.
torch.manual_seed(42)

# Split the data into training and validation sets (90% / 10%, fixed random_state)
train_df_split, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Prepare the dataset and data loader
train_dataset = ClinicianDataset(train_df_split, tokenizer)
val_dataset = ClinicianDataset(val_df, tokenizer)
batch_size = 4
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
rouge = evaluate.load('rouge')

for epoch in range(20):
    model.train()
    total_loss = 0
    for batch in train_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # The dataset pads labels with the tokenizer's pad id. Replace those
        # positions with -100 so the loss ignores padding instead of training
        # the model to emit pad tokens.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean per-batch training loss for this epoch.
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_data_loader)}')

    # Validate the model
    model.eval()
    predictions = []
    references = []
    with torch.no_grad():
        for batch in val_data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=512,
                min_length=100,
                num_beams=4
            )
            predictions.extend(tokenizer.decode(output, skip_special_tokens=True) for output in outputs)
            # Decode references from the unmasked label ids as produced by the
            # dataset (pad ids are dropped by skip_special_tokens); decoding
            # needs no device transfer.
            references.extend(
                tokenizer.decode(label_ids, skip_special_tokens=True) for label_ids in batch['labels']
            )
        rouge_scores = rouge.compute(predictions=predictions, references=references)
        print(f'Epoch {epoch+1}, ROUGE-1: {rouge_scores["rouge1"]}, ROUGE-2: {rouge_scores["rouge2"]}, ROUGE-L: {rouge_scores["rougeL"]}')


In [None]:
# Compare a few predictions to references
# NOTE: `predictions` is reassigned by the test-set inference cell further down;
# run this cell right after validation so predictions and references line up.
num_samples_to_compare = 5
# Slicing (instead of indexing up to a fixed count) avoids an IndexError when
# fewer than `num_samples_to_compare` pairs are available.
sample_pairs = zip(
    predictions[:num_samples_to_compare],
    references[:num_samples_to_compare],
)
for i, (prediction, reference) in enumerate(sample_pairs):
    print(f"Sample {i+1}:")
    print(f"Prediction: {prediction}")
    print(f"Reference: {reference}")
    print()

In [None]:
# Make predictions on the test set
# ClinicianDataset reads a 'Clinician' column, so the test frame is given an
# empty one; the resulting labels are not used during generation.
test_dataset = ClinicianDataset(test_df.assign(Clinician=''), tokenizer)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.eval()
predictions = []
with torch.no_grad():
    for test_batch in test_data_loader:
        # Beam search with a bigram-repetition block, 100-512 generated tokens.
        generated_ids = model.generate(
            test_batch['input_ids'].to(device),
            attention_mask=test_batch['attention_mask'].to(device),
            max_length=512,
            min_length=100,
            num_beams=4,
            no_repeat_ngram_size=2,
        )
        # Decode sequence by sequence, preserving the test-set order.
        for sequence in generated_ids:
            predictions.append(tokenizer.decode(sequence, skip_special_tokens=True))
print(f"Total predictions generated: {len(predictions)}")


In [None]:
# Spot-check one generated text: the entry at index 25 of `predictions`
# (the test-set predictions when the cells are run in order).
predictions[25]

In [None]:
# Prepare the submission file
# Start from the id column of the test frame and attach the generated texts
# positionally, then write the two-column table without the index.
submission_df = test_df[['Master_Index']].assign(Clinician=predictions)
submission_df.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission table as the cell output.
submission_df.head()