In [10]:
# !pip install transformers torch
!pip install datasets onnx tf2onnx
# !pip install transformers[torch]
!pip install accelerate -U



In [2]:
import os

# Hugging Face access token. Prefer the HF_TOKEN environment variable so a real
# credential never has to be written into the notebook; the placeholder string
# is kept as the fallback so the name is always defined.
HF_TOKEN = os.environ.get('HF_TOKEN', 'API_TOKEN')

# Configure the PyTorch CUDA caching allocator for this kernel process.
# A `!export ...` line would run in a short-lived subshell and leave the
# kernel's own environment unchanged, so the variable is set through
# os.environ instead. This cell has to run before CUDA is first used.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import glob
import numpy as np
#import onnx
#import tf2onnx
import tensorflow as tf
from torch.utils.data import DataLoader, Dataset

In [4]:
# Define the compute_metrics function
def compute_metrics(p):
    """Compute binary-classification metrics for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, reduced with
            argmax over axis 1) and ``label_ids`` (reference labels).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    references = p.label_ids
    # Predicted class = index of the highest score in each row.
    predicted = np.argmax(p.predictions, axis=1)

    # The fourth element (support) is not reported.
    prf = precision_recall_fscore_support(references, predicted, average='binary')

    return {
        'accuracy': accuracy_score(references, predicted),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

# Custom Dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per access and attaches its label.

    Tokenization happens lazily in ``__getitem__``, so only the raw texts and
    labels are held in memory.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of labels, parallel to ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            padding='max_length', truncation=True, max_length=...)`` that
            returns a mapping of tensors carrying a leading batch dimension.
        max_length: Length every example is padded/truncated to. Defaults to
            512, the value that used to be hard-coded.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus a ``labels`` entry."""
        text = self.texts[idx]
        label = self.labels[idx]
        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )
        # The tokenizer returns tensors with a batch dimension of size 1;
        # drop it so batching can stack individual examples.
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        item['labels'] = torch.tensor(label)
        return item

# Collect the CSV chunks to train on.
# glob order depends on the file system, so sort the paths to visit the chunks
# in the same order on every run.
file_paths = sorted(glob.glob('datacsv/*.csv'))  # Adjust the path pattern as needed

# Load the pre-trained RoBERTa tokenizer (matches the 'roberta-base' checkpoint).
# NOTE(review): `use_fast` is normally an AutoTokenizer option; confirm it has
# any effect when passed to RobertaTokenizer directly.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', use_fast=True, clean_up_tokenization_spaces=True)



  from .autonotebook import tqdm as notebook_tqdm
2024-09-07 17:20:04.973294: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1684,0.113344,0.957425,0.958481,0.9465,0.970769
2,0.1211,0.123111,0.961319,0.962402,0.947342,0.977949


In [None]:
# Train the model part by part: one model instance is created up front and is
# then fine-tuned on each CSV chunk in turn, so what was learned from earlier
# chunks carries over to later ones.

SPLIT_SEED = 42  # fixed seed so the train/eval split is the same on every run

# Explicit GPU placement and fp16 mixed precision both require CUDA.
use_cuda = torch.cuda.is_available()

# Load the pre-trained weights ONCE. Re-loading them inside the loop would
# throw away everything learned from the previous chunks, leaving a model
# trained on the last file only.
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
if use_cuda:
    model.to('cuda')

# The training arguments are identical for every chunk, so build them once.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total number of training epochs per chunk
    per_device_train_batch_size=8,   # Reduced batch size for training
    per_device_eval_batch_size=32,   # Reduced batch size for evaluation
    gradient_accumulation_steps=2,   # Accumulate gradients over 2 steps
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    eval_strategy="epoch",           # evaluate at the end of every epoch
    fp16=use_cuda                    # mixed precision only when a GPU is present
)

for file_path in file_paths:
    # Load data for the current file
    df = pd.read_csv(file_path)
    texts = df['Description'].tolist()
    labels = df['Label'].tolist()

    # Split data into training and evaluation sets (seeded for reproducibility)
    train_texts, eval_texts, train_labels, eval_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=SPLIT_SEED
    )

    # Trainer builds its own data loaders from these datasets using the batch
    # sizes in `training_args`, so no DataLoader objects are created here.
    train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
    eval_dataset = CustomDataset(eval_texts, eval_labels, tokenizer)

    # Free up unused GPU memory before starting the next chunk
    torch.cuda.empty_cache()

    # A fresh Trainer per chunk restarts the optimizer and LR schedule while
    # continuing from the current weights of the shared `model`.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,       # Provide evaluation dataset
        compute_metrics=compute_metrics
    )
    trainer.train()

In [8]:

import torch
import h5py

# Save the trained model's state dictionary
torch.save(model.state_dict(), 'model.pth')

# Optional: Convert to HDF5

# Load the state dictionary back.
# - weights_only=True restricts unpickling to tensors and plain containers,
#   which is all a state dict holds, and avoids executing arbitrary pickled code.
# - map_location='cpu' lets the file be read on a machine without the GPU the
#   tensors were saved from.
state_dict = torch.load('model.pth', map_location='cpu', weights_only=True)

# Save the state dictionary to an HDF5 file, one dataset per parameter/buffer,
# named after its state-dict key.
with h5py.File('model.h5', 'w') as f:
    for key, value in state_dict.items():
        f.create_dataset(key, data=value.cpu().numpy())

  state_dict = torch.load('model.pth')
