In [6]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Peraboom/LastBERT")
model = AutoModelForQuestionAnswering.from_pretrained("Peraboom/LastBERT")

# Create a pipeline for question answering
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Define context and question
context = """Hugging Face is a technology company known for developing tools and models for natural language processing (NLP). 
They provide open-source models, libraries, and APIs that facilitate NLP applications."""
question = "What does Hugging Face develop?"

# Get the answer
response = qa_pipeline(question=question, context=context)

# Print the result
print(f"Answer: {response['answer']}")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at Peraboom/LastBERT and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Answer: company known for developing tools and models


In [5]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Load the tokenizer and model for BERT-base
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Create a question-answering pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Define context and question
context = """Hugging Face is a technology company known for developing tools and models for natural language processing (NLP). 
They provide open-source models, libraries, and APIs that facilitate NLP applications."""
question = "What does Hugging Face develop?"

# Get the answer
response = qa_pipeline(question=question, context=context)

# Print the result
print(f"Answer: {response['answer']}")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Answer: (NLP). 
They provide open-source models, libraries,


In [2]:
# Install necessary libraries
!pip install transformers datasets scikit-learn

# Import required libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef

# Load the CoLA dataset
dataset = load_dataset('glue', 'cola')

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Peraboom/LastBERT")
model = AutoModelForSequenceClassification.from_pretrained("Peraboom/LastBERT", num_labels=2)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the metrics for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    mcc = matthews_corrcoef(labels, predictions)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'mcc': mcc
    }

# Set training arguments with advanced techniques and early stopping
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",        # Evaluate every epoch
    save_strategy="epoch",              # Save every epoch (matching evaluation strategy)
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,  # Max epochs
    weight_decay=0.01,
    gradient_accumulation_steps=4,
    logging_dir='./logs',
    logging_steps=10,       # Log every 10 steps
    save_total_limit=2,     # Limit saved models to 2
    load_best_model_at_end=True,  # Load best model at the end
    metric_for_best_model='mcc',  # Track MCC for the best model
    greater_is_better=True
)

# Initialize the Trainer with Early Stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Early stopping if no improvement after 3 epochs
)

# Train the model with early stopping
trainer.train()

# Evaluate the model at the end
eval_results = trainer.evaluate()
print(f"Final Evaluation results: {eval_results}")


  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112884233333236, max=1.0…

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Mcc
0,0.6079,0.616227,0.691275,0.81746,0.691275,1.0,0.0
1,0.6001,0.615728,0.691275,0.81746,0.691275,1.0,0.0
2,0.5904,0.623912,0.659636,0.781807,0.701987,0.882108,0.059627
4,0.5495,0.641027,0.675935,0.796386,0.70394,0.916782,0.082379
5,0.5229,0.667434,0.651007,0.762402,0.720099,0.809986,0.116655
6,0.5281,0.671477,0.662512,0.775223,0.718343,0.841886,0.121043
8,0.4746,0.691352,0.657718,0.770122,0.71875,0.829404,0.118098
9,0.4624,0.695075,0.661553,0.774152,0.718527,0.839112,0.120739


Final Evaluation results: {'eval_loss': 0.6714769005775452, 'eval_accuracy': 0.6625119846596357, 'eval_f1': 0.7752234993614305, 'eval_precision': 0.7183431952662722, 'eval_recall': 0.841886269070735, 'eval_mcc': 0.12104339837473771, 'eval_runtime': 3.7087, 'eval_samples_per_second': 281.228, 'eval_steps_per_second': 17.796, 'epoch': 9.94392523364486}


In [15]:
!pip install transformers datasets scikit-learn nltk



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [22]:
# Import required libraries
import random
import pandas as pd
from datasets import load_dataset, Dataset, concatenate_datasets, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef
from torch import nn
import torch

# Load the CoLA dataset
dataset = load_dataset('glue', 'cola')

# Class weighting: Calculate weights based on class distribution
def get_class_weights(dataset):
    labels = [sample['label'] for sample in dataset['train']]
    class_0_count = labels.count(0)
    class_1_count = labels.count(1)
    
    # Assign higher weight to the minority class (unacceptable sentences)
    total_samples = len(labels)
    class_weights = [total_samples / class_0_count, total_samples / class_1_count]
    return torch.tensor(class_weights).to('cuda')

class_weights = get_class_weights(dataset)

# Simple Synonym Replacement using a dictionary (without external libraries)
synonym_dict = {
    "good": ["great", "excellent", "fine"],
    "bad": ["terrible", "awful", "poor"],
    "happy": ["joyful", "content", "pleased"],
    "sad": ["unhappy", "sorrowful", "down"],
    # Add more synonym mappings here for real-world cases
}

def replace_with_synonym(word):
    if word in synonym_dict:
        return random.choice(synonym_dict[word])
    return word

def augment_text(dataset):
    augmented_sentences = []
    for sample in dataset['train']:
        words = sample['sentence'].split()
        augmented_words = [replace_with_synonym(word) for word in words]
        augmented_sentence = " ".join(augmented_words)
        augmented_sentences.append({
            'sentence': augmented_sentence,
            'label': sample['label']  # Keep the label as it is
        })
    return augmented_sentences

# Apply data augmentation on the training set
augmented_train_data = augment_text(dataset)

# Convert augmented data into Hugging Face Dataset format
augmented_train_dataset = Dataset.from_pandas(pd.DataFrame(augmented_train_data))

# Ensure the label column is of the same type (ClassLabel) as the original dataset
label_feature = dataset['train'].features['label']  # Get original label feature type (ClassLabel)
augmented_train_dataset = augmented_train_dataset.cast_column('label', label_feature)  # Cast labels

# Concatenate original training data with augmented data using concatenate_datasets
train_dataset_combined = concatenate_datasets([dataset['train'], augmented_train_dataset])

# Load the tokenizer and model (using LastBERT)
tokenizer = AutoTokenizer.from_pretrained("Peraboom/LastBERT")
model = AutoModelForSequenceClassification.from_pretrained("Peraboom/LastBERT", num_labels=2)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding='max_length', truncation=True)

tokenized_train_dataset = train_dataset_combined.map(tokenize_function, batched=True)
tokenized_validation_dataset = dataset['validation'].map(tokenize_function, batched=True)

tokenized_train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_validation_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the weighted loss function
loss_fct = nn.CrossEntropyLoss(weight=class_weights)

# Define the metrics for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    mcc = matthews_corrcoef(labels, predictions)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'mcc': mcc
    }

# Custom Trainer to incorporate weighted loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").to('cuda')
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Set training arguments with hyperparameter tuning and early stopping
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,  # Reduced learning rate for better fine-tuning
    per_device_train_batch_size=8,  # Smaller batch size
    per_device_eval_batch_size=16,
    num_train_epochs=5,  # Train for fewer epochs with early stopping
    weight_decay=0.01,
    gradient_accumulation_steps=2,  # Gradient accumulation to handle smaller batches
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='mcc',  # Track MCC for model evaluation
    greater_is_better=True
)

# Initialize the CustomTrainer with Early Stopping
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Early stopping to prevent overfitting
)

# Train the model
trainer.train()

# Evaluate the model at the end
eval_results = trainer.evaluate()
print(f"Final Evaluation results: {eval_results}")


Casting the dataset:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/17102 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Mcc
1,0.6673,0.710126,0.524449,0.599354,0.717602,0.514563,0.056498
2,0.5214,0.769356,0.561841,0.641569,0.738267,0.567268,0.108271
3,0.5259,0.870177,0.598274,0.69259,0.735202,0.654646,0.120315
4,0.3558,1.017085,0.619367,0.717035,0.737537,0.697642,0.137641
5,0.3294,1.060022,0.626079,0.723404,0.740203,0.707351,0.147759


Final Evaluation results: {'eval_loss': 1.0600216388702393, 'eval_accuracy': 0.62607861936721, 'eval_f1': 0.723404255319149, 'eval_precision': 0.7402031930333817, 'eval_recall': 0.7073509015256588, 'eval_mcc': 0.14775915812622145, 'eval_runtime': 3.7953, 'eval_samples_per_second': 274.816, 'eval_steps_per_second': 17.39, 'epoch': 5.0}


In [23]:
# Import required libraries
import random
import pandas as pd
from datasets import load_dataset, Dataset, concatenate_datasets, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef
from torch import nn
import torch

# Load the CoLA dataset
dataset = load_dataset('glue', 'cola')

# Class weighting: Calculate weights based on class distribution
def get_class_weights(dataset):
    labels = [sample['label'] for sample in dataset['train']]
    class_0_count = labels.count(0)
    class_1_count = labels.count(1)
    
    # Assign higher weight to the minority class (unacceptable sentences)
    total_samples = len(labels)
    class_weights = [total_samples / class_0_count, total_samples / class_1_count]
    return torch.tensor(class_weights).to('cuda')

class_weights = get_class_weights(dataset)

# Simple Synonym Replacement using a dictionary (without external libraries)
synonym_dict = {
    "good": ["great", "excellent", "fine"],
    "bad": ["terrible", "awful", "poor"],
    "happy": ["joyful", "content", "pleased"],
    "sad": ["unhappy", "sorrowful", "down"],
    # Add more synonym mappings here for real-world cases
}

def replace_with_synonym(word):
    if word in synonym_dict:
        return random.choice(synonym_dict[word])
    return word

def augment_text(dataset):
    augmented_sentences = []
    for sample in dataset['train']:
        words = sample['sentence'].split()
        augmented_words = [replace_with_synonym(word) for word in words]
        augmented_sentence = " ".join(augmented_words)
        augmented_sentences.append({
            'sentence': augmented_sentence,
            'label': sample['label']  # Keep the label as it is
        })
    return augmented_sentences

# Apply data augmentation on the training set
augmented_train_data = augment_text(dataset)

# Convert augmented data into Hugging Face Dataset format
augmented_train_dataset = Dataset.from_pandas(pd.DataFrame(augmented_train_data))

# Ensure the label column is of the same type (ClassLabel) as the original dataset
label_feature = dataset['train'].features['label']  # Get original label feature type (ClassLabel)
augmented_train_dataset = augmented_train_dataset.cast_column('label', label_feature)  # Cast labels

# Concatenate original training data with augmented data using concatenate_datasets
train_dataset_combined = concatenate_datasets([dataset['train'], augmented_train_dataset])

# Load the tokenizer and model (using LastBERT)
tokenizer = AutoTokenizer.from_pretrained("Peraboom/LastBERT")
model = AutoModelForSequenceClassification.from_pretrained("Peraboom/LastBERT", num_labels=2)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding='max_length', truncation=True)

tokenized_train_dataset = train_dataset_combined.map(tokenize_function, batched=True)
tokenized_validation_dataset = dataset['validation'].map(tokenize_function, batched=True)

tokenized_train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_validation_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the weighted loss function
loss_fct = nn.CrossEntropyLoss(weight=class_weights)

# Define the metrics for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    mcc = matthews_corrcoef(labels, predictions)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'mcc': mcc
    }

# Custom Trainer to incorporate weighted loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").to('cuda')
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Set training arguments with hyperparameter tuning and early stopping
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,  # Reduced learning rate for better fine-tuning
    per_device_train_batch_size=8,  # Smaller batch size
    per_device_eval_batch_size=16,
    num_train_epochs=15,  # Train for fewer epochs with early stopping
    weight_decay=0.01,
    gradient_accumulation_steps=2,  # Gradient accumulation to handle smaller batches
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='mcc',  # Track MCC for model evaluation
    greater_is_better=True
)

# Initialize the CustomTrainer with Early Stopping
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Early stopping to prevent overfitting
)

# Train the model
trainer.train()

# Evaluate the model at the end
eval_results = trainer.evaluate()
print(f"Final Evaluation results: {eval_results}")


Casting the dataset:   0%|          | 0/8551 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Mcc
1,0.6669,0.711271,0.53883,0.619161,0.721402,0.542302,0.067831
2,0.5156,0.780203,0.560882,0.632424,0.750476,0.546463,0.129013
3,0.5277,0.889648,0.604027,0.703518,0.729167,0.679612,0.11039
4,0.3154,1.072131,0.620326,0.71875,0.736536,0.701803,0.136101
5,0.2763,1.209648,0.608821,0.702624,0.740399,0.668516,0.137035
6,0.2364,1.512874,0.636625,0.744437,0.724409,0.765603,0.118111
7,0.3646,1.497835,0.619367,0.718639,0.734783,0.70319,0.131671


Final Evaluation results: {'eval_loss': 1.2096484899520874, 'eval_accuracy': 0.6088207094918504, 'eval_f1': 0.7026239067055393, 'eval_precision': 0.7403993855606759, 'eval_recall': 0.6685159500693482, 'eval_mcc': 0.13703510881390726, 'eval_runtime': 3.7977, 'eval_samples_per_second': 274.637, 'eval_steps_per_second': 17.379, 'epoch': 7.0}


In [3]:
# Import required libraries
import random
import pandas as pd
from datasets import load_dataset, Dataset, concatenate_datasets, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import AutoModelForSeq2SeqLM  # For back-translation
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef
from torch import nn
import torch

# Load the CoLA dataset
dataset = load_dataset('glue', 'cola')

# Class weighting: Calculate weights based on class distribution
def get_class_weights(dataset):
    labels = [sample['label'] for sample in dataset['train']]
    class_0_count = labels.count(0)
    class_1_count = labels.count(1)
    
    # Assign higher weight to the minority class (unacceptable sentences)
    total_samples = len(labels)
    class_weights = [total_samples / class_0_count, total_samples / class_1_count]
    return torch.tensor(class_weights).to('cuda')

class_weights = get_class_weights(dataset)

# Load translation models for back-translation (English to French and back)
tokenizer_fr_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
model_fr_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-fr-en")

tokenizer_en_fr = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
model_en_fr = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

# Function for back-translation (English -> French -> English)
def back_translate(sentence):
    # Translate from English to French
    inputs = tokenizer_en_fr(sentence, return_tensors="pt", max_length=512, truncation=True)
    outputs = model_en_fr.generate(**inputs)
    translated_fr = tokenizer_en_fr.decode(outputs[0], skip_special_tokens=True)
    
    # Translate back from French to English
    inputs = tokenizer_fr_en(translated_fr, return_tensors="pt", max_length=512, truncation=True)
    outputs = model_fr_en.generate(**inputs)
    back_translated = tokenizer_fr_en.decode(outputs[0], skip_special_tokens=True)
    
    return back_translated

# Apply back-translation on the training set
def augment_text_with_back_translation(dataset):
    augmented_sentences = []
    for sample in dataset['train']:
        back_translated_sentence = back_translate(sample['sentence'])
        augmented_sentences.append({
            'sentence': back_translated_sentence,
            'label': sample['label']
        })
    return augmented_sentences

# Apply data augmentation using back-translation
augmented_train_data = augment_text_with_back_translation(dataset)

# Convert augmented data into Hugging Face Dataset format
augmented_train_dataset = Dataset.from_pandas(pd.DataFrame(augmented_train_data))

# Ensure the label column is of the same type (ClassLabel) as the original dataset
label_feature = dataset['train'].features['label']  # Get original label feature type (ClassLabel)
augmented_train_dataset = augmented_train_dataset.cast_column('label', label_feature)

# Concatenate original training data with augmented data using concatenate_datasets
train_dataset_combined = concatenate_datasets([dataset['train'], augmented_train_dataset])

# Load LastBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Peraboom/LastBERT")
model = AutoModelForSequenceClassification.from_pretrained("Peraboom/LastBERT", num_labels=2)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding='max_length', truncation=True)

tokenized_train_dataset = train_dataset_combined.map(tokenize_function, batched=True)
tokenized_validation_dataset = dataset['validation'].map(tokenize_function, batched=True)

tokenized_train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_validation_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the weighted loss function
loss_fct = nn.CrossEntropyLoss(weight=class_weights)

# Define the metrics for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    mcc = matthews_corrcoef(labels, predictions)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'mcc': mcc
    }

# Custom Trainer to incorporate weighted loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").to('cuda')
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Set training arguments with hyperparameter tuning and dropout regularization
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,  # Lower learning rate to fine-tune carefully
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=16,
    num_train_epochs=7,  # Train for more epochs
    weight_decay=0.01,
    gradient_accumulation_steps=2,  # Use gradient accumulation to match larger batches
    max_grad_norm=1.0,  # Gradient clipping
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='mcc',  # Track MCC for model evaluation
    greater_is_better=True
)

# Initialize the CustomTrainer with Early Stopping
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Early stopping after 3 epochs of no improvement
)

# Train the model
trainer.train()

# Evaluate the model at the end
eval_results = trainer.evaluate()
print(f"Final Evaluation results: {eval_results}")


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Casting the dataset:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/17102 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Mcc
1,0.6709,0.684959,0.558965,0.642857,0.730159,0.574202,0.091863
2,0.6426,0.706842,0.536913,0.606357,0.735178,0.51595,0.09225
3,0.6135,0.726187,0.553212,0.621138,0.750491,0.52982,0.125146
4,0.5774,0.765058,0.594439,0.675862,0.755137,0.61165,0.15593
5,0.5824,0.778956,0.606903,0.693572,0.752026,0.643551,0.158263
6,0.4803,0.809651,0.634708,0.728826,0.748538,0.710125,0.171097
7,0.5425,0.810686,0.619367,0.708731,0.752336,0.669903,0.167244


Final Evaluation results: {'eval_loss': 0.8096513152122498, 'eval_accuracy': 0.6347075743048898, 'eval_f1': 0.7288256227758007, 'eval_precision': 0.7485380116959064, 'eval_recall': 0.710124826629681, 'eval_mcc': 0.1710970335472794, 'eval_runtime': 3.703, 'eval_samples_per_second': 281.663, 'eval_steps_per_second': 17.823, 'epoch': 7.0}


In [4]:

# Define the evaluation metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    mcc = matthews_corrcoef(labels, predictions)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'mcc': mcc
    }

# Assuming 'trainer' object is already defined, resume training
# If not, load the tokenized datasets and use the existing model in memory
# Load the previously tokenized datasets if needed (assuming they are still in memory)
# tokenized_train_dataset = your previously tokenized dataset
# tokenized_validation_dataset = your previously tokenized dataset

# Set training arguments to resume training
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,  # Lower learning rate to fine-tune carefully
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=16,
    num_train_epochs=20,  # Continue training for remaining epochs
    weight_decay=0.01,
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,  # Gradient clipping
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='mcc',  # Track MCC for model evaluation
    greater_is_better=True
)

# Reuse the model already loaded in memory, and the tokenized datasets

# Define the Trainer with the loaded model and datasets
trainer = Trainer(
    model=model,  # Already loaded model
    args=training_args,
    train_dataset=tokenized_train_dataset,  # Pre-tokenized dataset
    eval_dataset=tokenized_validation_dataset,  # Pre-tokenized dataset
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Early stopping after 3 epochs of no improvement
)

# Resume training
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Final Evaluation results: {eval_results}")




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Mcc
1,0.4735,0.738522,0.634708,0.732256,0.742165,0.722607,0.158057
2,0.4115,0.780911,0.626079,0.724576,0.738129,0.711512,0.143331
3,0.476,0.765784,0.641419,0.750999,0.722151,0.782247,0.115394
4,0.3965,0.823179,0.659636,0.766294,0.729323,0.807212,0.148642


Final Evaluation results: {'eval_loss': 0.7385215759277344, 'eval_accuracy': 0.6347075743048898, 'eval_f1': 0.7322557976106817, 'eval_precision': 0.7421652421652422, 'eval_recall': 0.7226074895977809, 'eval_mcc': 0.1580568026941382, 'eval_runtime': 3.7302, 'eval_samples_per_second': 279.613, 'eval_steps_per_second': 17.694, 'epoch': 4.0}


In [9]:
# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, AutoConfig
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef
from torch import nn

# Assuming the augmented and tokenized datasets are already done previously and stored in variables
# tokenized_train_dataset and tokenized_validation_dataset are assumed to be pre-loaded from previous augmentation.

# Define Focal Loss class for better MCC
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.75, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        BCE_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-BCE_loss)  # Prevents nans when probability is 0
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss

        if self.reduction == 'mean':
            return torch.mean(F_loss)
        elif self.reduction == 'sum':
            return torch.sum(F_loss)
        else:
            return F_loss

# Define the evaluation metrics function to track MCC
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    mcc = matthews_corrcoef(labels, predictions)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'mcc': mcc  # Focus on MCC
    }

# Load the LastBERT model with dropout modification
config = AutoConfig.from_pretrained("Peraboom/LastBERT", hidden_dropout_prob=0.4)  # Increased Dropout for regularization
model = AutoModelForSequenceClassification.from_pretrained("Peraboom/LastBERT", config=config)

# Freeze layers for gradual fine-tuning
for param in model.bert.encoder.layer[:8].parameters():
    param.requires_grad = False  # Freeze first 8 layers

# Custom Trainer to incorporate Focal Loss and better MCC tracking
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").to('cuda')
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = FocalLoss(alpha=0.75, gamma=2)  # Focal Loss with tuned alpha and gamma
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Set training arguments with hyperparameter tuning and learning rate scheduler
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,  # Lower learning rate to fine-tune carefully
    lr_scheduler_type="cosine",  # Cosine learning rate scheduler
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=16,
    num_train_epochs=15,  # Train for more epochs
    weight_decay=0.01,
    gradient_accumulation_steps=2,  # Use gradient accumulation to match larger batches
    max_grad_norm=1.0,  # Gradient clipping
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='mcc',  # Focus on MCC for model evaluation
    greater_is_better=True
)

# Initialize the CustomTrainer with early stopping
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,  # Use pre-tokenized and augmented dataset
    eval_dataset=tokenized_validation_dataset,  # Use pre-tokenized validation dataset
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Early stopping after 3 epochs of no improvement
)

# Train the model
trainer.train()

# Unfreeze the earlier frozen layers after some initial epochs
for param in model.bert.encoder.layer[:8].parameters():
    param.requires_grad = True  # Unfreeze first 8 layers after initial training

# Re-train the model after unfreezing
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Final Evaluation results: {eval_results}")




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Mcc
1,0.1171,0.117719,0.691275,0.81746,0.691275,1.0,0.0
2,0.119,0.118234,0.691275,0.81746,0.691275,1.0,0.0
3,0.1268,0.117263,0.691275,0.81746,0.691275,1.0,0.0
4,0.1191,0.117291,0.691275,0.81746,0.691275,1.0,0.0


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Mcc
1,0.1168,0.117804,0.691275,0.81746,0.691275,1.0,0.0
2,0.1197,0.117843,0.691275,0.81746,0.691275,1.0,0.0
3,0.1253,0.117083,0.691275,0.81746,0.691275,1.0,0.0
4,0.1196,0.116951,0.691275,0.81746,0.691275,1.0,0.0


Final Evaluation results: {'eval_loss': 0.11780429631471634, 'eval_accuracy': 0.6912751677852349, 'eval_f1': 0.8174603174603174, 'eval_precision': 0.6912751677852349, 'eval_recall': 1.0, 'eval_mcc': 0.0, 'eval_runtime': 3.6987, 'eval_samples_per_second': 281.991, 'eval_steps_per_second': 17.844, 'epoch': 4.0}
