In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import matplotlib.pyplot as plt

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the training dataset
train_file_path = 'crisismmd_datasplit_all/crisismmd_datasplit_all/task_informative_text_img_train.tsv'
train_data = pd.read_csv(train_file_path, sep='\t')

# Prepare data for Hugging Face Dataset
def preprocess_text_data(data):
    data = data[['tweet_text', 'label_text']]  # Select relevant columns
    data['label'] = data['label_text'].apply(lambda x: 1 if x == 'informative' else 0)  # Map labels
    return data

train_data = preprocess_text_data(train_data)

# Convert DataFrame to Hugging Face Dataset
hf_train = Dataset.from_pandas(train_data)

# Define tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_data(batch):
    return tokenizer(batch['tweet_text'], padding='max_length', truncation=True, max_length=128)

hf_train = hf_train.map(tokenize_data, batched=True)

# Set dataset format for PyTorch
hf_train.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Define the model
modeltext = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./modeltext",
    evaluation_strategy="epoch",
    save_strategy="no",  # Do not save model checkpoints
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=1000,
)

# Define compute_metrics function
def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

# Create Trainer
trainer = Trainer(
    model=modeltext,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_train,  # Using training data for evaluation
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model on the training data
predictions = trainer.predict(hf_train)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# Accuracy
accuracy = accuracy_score(labels, preds)
print(f"Training Accuracy: {accuracy:.4f}")

# Plot confusion matrix
conf_matrix = confusion_matrix(labels, preds, labels=[0, 1])
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=["non-informative", "informative"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label'] = data['label_text'].apply(lambda x: 1 if x == 'informative' else 0)  # Map labels
Map: 100%|██████████| 13608/13608 [00:00<00:00, 15636.58 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
comet_ml is installed but the Comet API Key is not configured. Please set the `COMET_API_KEY` environment variable to enable Comet logging. Check out the documentation for other ways of configuring it: https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#

{'eval_loss': 0.27027004957199097, 'eval_accuracy': 0.8991769547325102, 'eval_runtime': 60.9455, 'eval_samples_per_second': 223.281, 'eval_steps_per_second': 13.963, 'epoch': 1.0}


 39%|███▉      | 1000/2553 [04:57<06:03,  4.28it/s] 

{'loss': 0.3985, 'grad_norm': 1.2133082151412964, 'learning_rate': 3.0415197806502155e-05, 'epoch': 1.18}


                                                   
 67%|██████▋   | 1702/2553 [08:42<02:55,  4.84it/s]

{'eval_loss': 0.1591004729270935, 'eval_accuracy': 0.9519400352733686, 'eval_runtime': 61.5156, 'eval_samples_per_second': 221.212, 'eval_steps_per_second': 13.834, 'epoch': 2.0}


 78%|███████▊  | 2000/2553 [09:52<02:08,  4.30it/s]  

{'loss': 0.2371, 'grad_norm': 5.294651508331299, 'learning_rate': 1.0830395613004309e-05, 'epoch': 2.35}


                                                   
100%|██████████| 2553/2553 [12:57<00:00,  3.28it/s]


{'eval_loss': 0.09714954346418381, 'eval_accuracy': 0.974132863021752, 'eval_runtime': 58.0489, 'eval_samples_per_second': 234.423, 'eval_steps_per_second': 14.66, 'epoch': 3.0}
{'train_runtime': 777.8227, 'train_samples_per_second': 52.485, 'train_steps_per_second': 3.282, 'train_loss': 0.2842159697087755, 'epoch': 3.0}


100%|██████████| 851/851 [00:57<00:00, 14.68it/s]
  plt.show()


Training Accuracy: 0.9741


In [3]:
# Specify the directory to save the model and tokenizer
save_directory = "D:/BTP_2/CrisisMMD_v2.0/model_text"

# Save the model
modeltext.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")


Model and tokenizer saved to D:/BTP_2/CrisisMMD_v2.0/model_text
