In [1]:
import pandas as pd

# Load the datasets
train_file_path = 'crisismmd_datasplit_all/crisismmd_datasplit_all/task_informative_text_img_train.tsv'
dev_file_path = 'crisismmd_datasplit_all/crisismmd_datasplit_all/task_informative_text_img_dev.tsv'
test_file_path = 'crisismmd_datasplit_all/crisismmd_datasplit_all/task_informative_text_img_test.tsv'

# Load TSV files into DataFrames
train_data = pd.read_csv(train_file_path, sep='\t')
dev_data = pd.read_csv(dev_file_path, sep='\t')
test_data = pd.read_csv(test_file_path, sep='\t')

# Display the first few rows of each dataset
train_data.head(), dev_data.head(), test_data.head()

(             event_name            tweet_id              image_id  \
 0  california_wildfires  917791291823591425  917791291823591425_0   
 1  california_wildfires  917791291823591425  917791291823591425_1   
 2  california_wildfires  917793137925459968  917793137925459968_0   
 3  california_wildfires  917793137925459968  917793137925459968_1   
 4  california_wildfires  917793137925459968  917793137925459968_2   
 
                                           tweet_text  \
 0  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
 1  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
 2  RT @KAKEnews: California wildfires destroy mor...   
 3  RT @KAKEnews: California wildfires destroy mor...   
 4  RT @KAKEnews: California wildfires destroy mor...   
 
                                                image            label  \
 0  data_image/california_wildfires/10_10_2017/917...      informative   
 1  data_image/california_wildfires/10_10_2017/917...  not_informative   
 2  data_ima

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import matplotlib.pyplot as plt

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the training dataset
train_file_path = 'crisismmd_datasplit_all/crisismmd_datasplit_all/task_informative_text_img_train.tsv'
train_data = pd.read_csv(train_file_path, sep='\t')

# Filter data where label_text_image is "Positive"
def filter_positive_label(data):
    return data[data['label_text_image'] == 'Positive']

train_data = filter_positive_label(train_data)

# Prepare data for Hugging Face Dataset
def preprocess_text_data(data):
    data = data[['tweet_text', 'label_text']]  # Select relevant columns
    data['label'] = data['label_text'].apply(lambda x: 1 if x == 'informative' else 0)  # Map labels
    return data

train_data = preprocess_text_data(train_data)

# Convert DataFrame to Hugging Face Dataset
hf_train = Dataset.from_pandas(train_data)

# Define tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_data(batch):
    return tokenizer(batch['tweet_text'], padding='max_length', truncation=True, max_length=128)

hf_train = hf_train.map(tokenize_data, batched=True)

# Set dataset format for PyTorch
hf_train.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Define the model
modeltext = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./modeltext",
    evaluation_strategy="epoch",
    save_strategy="no",  # Do not save model checkpoints
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Define compute_metrics function
def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

# Create Trainer
trainer = Trainer(
    model=modeltext,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_train,  # Using training data for evaluation
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model on the training data
predictions = trainer.predict(hf_train)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# Accuracy
accuracy = accuracy_score(labels, preds)
print(f"Training Accuracy: {accuracy:.4f}")

# Plot confusion matrix
conf_matrix = confusion_matrix(labels, preds, labels=[0, 1])
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=["non-informative", "informative"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.show()


Using device: cuda


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label'] = data['label_text'].apply(lambda x: 1 if x == 'informative' else 0)  # Map labels

[A
[A
[A
[A
[A
Map: 100%|██████████| 9601/9601 [00:00<00:00, 12749.41 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
comet_ml is installed but the Comet API Key is not configured. Please set the `COMET_API_KEY` environment variable to enable Comet logging. Check out the documentation for other ways of configuring it: https://www.comet.com/docs/v2/guides/experiment-managem

{'loss': 0.4051, 'grad_norm': 2.7164371013641357, 'learning_rate': 4.1680532445923466e-05, 'epoch': 0.83}


 20%|██        | 601/3005 [02:56<10:40,  3.75it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.19355636835098267, 'eval_accuracy': 0.9319862514321425, 'eval_runtime': 63.2765, 'eval_samples_per_second': 151.731, 'eval_steps_per_second': 9.498, 'epoch': 1.0}


 33%|███▎      | 1000/3005 [06:14<11:14,  2.97it/s]  
 33%|███▎      | 1000/3005 [06:14<11:14,  2.97it/s]

{'loss': 0.2599, 'grad_norm': 2.1817269325256348, 'learning_rate': 3.336106489184692e-05, 'epoch': 1.66}


 40%|████      | 1202/3005 [07:21<08:02,  3.74it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.09580346941947937, 'eval_accuracy': 0.9726070201020727, 'eval_runtime': 70.7793, 'eval_samples_per_second': 135.647, 'eval_steps_per_second': 8.491, 'epoch': 2.0}


 50%|████▉     | 1500/3005 [10:15<08:30,  2.95it/s]   
 50%|████▉     | 1500/3005 [10:15<08:30,  2.95it/s]

{'loss': 0.1721, 'grad_norm': 9.175326347351074, 'learning_rate': 2.5041597337770382e-05, 'epoch': 2.5}


 60%|█████▉    | 1802/3005 [11:57<06:46,  2.96it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.05144030973315239, 'eval_accuracy': 0.9880220810332258, 'eval_runtime': 78.3175, 'eval_samples_per_second': 122.591, 'eval_steps_per_second': 7.674, 'epoch': 3.0}


 67%|██████▋   | 2000/3005 [14:22<05:35,  2.99it/s]  
 67%|██████▋   | 2000/3005 [14:22<05:35,  2.99it/s]

{'loss': 0.1164, 'grad_norm': 0.12890899181365967, 'learning_rate': 1.6722129783693842e-05, 'epoch': 3.33}


 80%|███████▉  | 2403/3005 [16:20<02:33,  3.91it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.029182542115449905, 'eval_accuracy': 0.993229871888345, 'eval_runtime': 44.2807, 'eval_samples_per_second': 216.821, 'eval_steps_per_second': 13.573, 'epoch': 4.0}


 83%|████████▎ | 2500/3005 [17:28<02:09,  3.90it/s]  
 83%|████████▎ | 2500/3005 [17:28<02:09,  3.90it/s]

{'loss': 0.061, 'grad_norm': 0.3283576965332031, 'learning_rate': 8.402662229617304e-06, 'epoch': 4.16}


100%|█████████▉| 3000/3005 [19:36<00:01,  3.93it/s]
100%|█████████▉| 3000/3005 [19:36<00:01,  3.93it/s]

{'loss': 0.036, 'grad_norm': 0.020553501322865486, 'learning_rate': 8.319467554076539e-08, 'epoch': 4.99}


100%|█████████▉| 3004/3005 [19:37<00:00,  3.90it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.019792532548308372, 'eval_accuracy': 0.9955212998645975, 'eval_runtime': 44.3808, 'eval_samples_per_second': 216.332, 'eval_steps_per_second': 13.542, 'epoch': 5.0}
{'train_runtime': 1221.9831, 'train_samples_per_second': 39.285, 'train_steps_per_second': 2.459, 'train_loss': 0.17479759123139146, 'epoch': 5.0}


100%|██████████| 601/601 [00:44<00:00, 13.64it/s]

Training Accuracy: 0.9955



  plt.show()


In [5]:
# Specify the directory to save the model and tokenizer
save_directory = "D:/BTP_2/CrisisMMD_v2.0/model_text"

# Save the model
modeltext.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")


Model and tokenizer saved to D:/BTP_2/CrisisMMD_v2.0/model_text
