In [1]:
# !pip install transformers torch
!pip install datasets onnx tf2onnx
# !pip install transformers[torch]
!pip install accelerate -U
# !pip install scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)


Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.10.0->accelerate)
  Using cached nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch>=1.10.0->accelerate)
  Using cached nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)
Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-

In [None]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import glob
import numpy as np
#import onnx
#import tf2onnx
import tensorflow as tf
from torch.utils.data import DataLoader, Dataset

In [11]:
# Define the compute_metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='binary')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def add_epoch_data(epoch, train_loss, eval_loss, train_accuracy, eval_accuracy, training_stats):
    """
    Add epoch data to the training statistics list.

    Args:
    - epoch (int): The current epoch number.
    - train_loss (float): The training loss for the current epoch.
    - eval_loss (float): The evaluation loss for the current epoch.
    - train_accuracy (float): The training accuracy for the current epoch.
    - eval_accuracy (float): The evaluation accuracy for the current epoch.
    - training_stats (list): The list to store training statistics.
    """
    training_stats.append({
        'epoch': epoch,
        'train_loss': train_loss,
        'eval_loss': eval_loss,
        'train_accuracy': train_accuracy,
        'eval_accuracy': eval_accuracy
    })

# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}  # Remove batch dimension
        inputs['labels'] = torch.tensor(label)
        return inputs

# Load the datasets from multiple CSV files
file_paths = glob.glob('datacsv/*.csv')  # Adjust the path pattern as needed

# Load pre-trained BERT tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', use_fast=True, clean_up_tokenization_spaces=True)



In [13]:
# Train the model part by part
for file_path in file_paths:
    # Load data for the current file
    df = pd.read_csv(file_path)
    texts = df['Description'].tolist()
    labels = df['Label'].tolist()

    # Split data into training and evaluation sets
    train_texts, eval_texts, train_labels, eval_labels = train_test_split(texts, labels, test_size=0.2)

    # Create datasets and dataloaders
    train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
    eval_dataset = CustomDataset(eval_texts, eval_labels, tokenizer)
    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, pin_memory=True)  # Reduced batch size
    eval_dataloader = DataLoader(eval_dataset, batch_size=32, shuffle=False, pin_memory=True)  # Reduced batch size

    # Free up unused GPU memory
    torch.cuda.empty_cache()

    # Move model to GPU
    model = RobertaForSequenceClassification.from_pretrained('roberta-base')
    model.to('cuda')

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=20,              # total number of training epochs per chunk
        per_device_train_batch_size=8,   # Reduced batch size for training
        per_device_eval_batch_size=32,   # Reduced batch size for evaluation
        gradient_accumulation_steps=2,   # Accumulate gradients over 2 steps
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
        eval_strategy="epoch",           # Updated parameter
        save_strategy="epoch",  
        fp16=True                        # Enable mixed precision training
    )

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,       # Provide evaluation dataset
        compute_metrics=compute_metrics
    )
    #trainer.train()
    #export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
 # Training and evaluation
for epoch in range(training_args.num_train_epochs):
        # Training phase
    train_result = trainer.train()
    train_loss = train_result.training_loss
    train_accuracy = train_result.metrics['train_accuracy']

        # Evaluation phase
    eval_result = trainer.evaluate()
    eval_loss = eval_result['eval_loss']
    eval_accuracy = eval_result['eval_accuracy']

        # Add epoch data to training statistics
    add_epoch_data(epoch, train_loss, eval_loss, train_accuracy, eval_accuracy, training_stats)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3964,0.272792,0.912513,0.919933,0.855502,0.994861
2,0.2461,0.152045,0.923936,0.929347,0.875511,0.990236
3,0.2488,0.182431,0.959761,0.959371,0.979133,0.940391
4,0.4316,0.232497,0.939512,0.940697,0.931921,0.94964
5,0.5952,0.205207,0.947819,0.948554,0.944926,0.95221
6,0.2534,0.182444,0.944704,0.94687,0.920019,0.975334
7,0.5295,0.2396,0.935877,0.937768,0.919921,0.956321
8,0.8506,0.397596,0.819315,0.782636,0.997611,0.643885
9,0.4648,0.282331,0.920301,0.921463,0.917473,0.925488
10,0.7061,0.334828,0.912253,0.908599,0.958904,0.863309


KeyError: 'train_accuracy'

In [None]:

import torch
# Save the trained model's state dictionary
torch.save(model.state_dict(), 'model.pth')

# Optional: Convert to HDF5
import h5py

# Load the state dictionary
state_dict = torch.load('model.pth')

# Save the state dictionary to an HDF5 file
with h5py.File('model.h5', 'w') as f:
    for key, value in state_dict.items():
        f.create_dataset(key, data=value.cpu().numpy())