**Prepare Dataset**

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

In [None]:
from google.colab import drive
# Mount Google Drive so the dataset CSVs and the output directory referenced
# below (all under /content/gdrive/MyDrive/...) are reachable from this runtime.
drive.mount('/content/gdrive/')

In [None]:
# Balanced, preprocessed train / validation CSVs stored on the mounted Drive.
dataset_file1 = '/content/gdrive/MyDrive/BERT Action Items/dataset/balanced dataset/balanced_preprocessed_147s+126s+230rm+600f+250f_train.csv'
dataset_file2 = '/content/gdrive/MyDrive/BERT Action Items/dataset/balanced dataset/balanced_preprocessed_147s+126s+230rm+600f+250f_val.csv'

# Each file is loaded on its own, train first and validation second.
dataset1, dataset2 = (
    load_dataset('csv', data_files=csv_path)
    for csv_path in (dataset_file1, dataset_file2)
)

# Both loads are read through the 'train' key, including the validation file.
train_dataset, val_dataset = dataset1['train'], dataset2['train']

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
!pip install transformers==4.28.0


In [None]:
from transformers import BertTokenizer, AutoTokenizer

# Tokenizer for the same checkpoint name the classification model is loaded
# from in the "Train" section.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")

In [None]:
def tokenize(batch):
  """Encode the "text" entry of `batch` with the notebook-level `tokenizer`.

  The tokenizer is called with padding="max_length", truncation=True and
  max_length=512, so every example is requested at a fixed length of 512.

  Args:
    batch: mapping with a "text" entry (a batch of rows when this function
      is passed to `Dataset.map(..., batched=True)`).

  Returns:
    Whatever `tokenizer(...)` returns for the given texts.
  """
  encode_kwargs = dict(padding="max_length", truncation=True, max_length=512)
  return tokenizer(batch["text"], **encode_kwargs)

# Tokenize the training split in batches of 64 rows.
train_dataset = train_dataset.map(
    tokenize,
    batched=True,
    batch_size=64,
)

# The validation split is tokenized as one batch covering every row.
val_dataset = val_dataset.map(
    tokenize,
    batched=True,
    batch_size=len(val_dataset),
)


**Train**

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained multilingual BERT checkpoint with a sequence-classification
# head for 2 labels. The warning printed below about unused / newly initialised
# weights comes from this call.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingu

In [None]:
# Drive folder passed to TrainingArguments(output_dir=...) and used as the
# parent directory of the saved model ("<output_dir1>/model").
output_dir1 = '/content/gdrive/MyDrive/BERT Action Items/BERT_2e_balanced_AI'

In [None]:
!pip install accelerate -U

In [None]:
import numpy as np
from datasets import load_metric


In [None]:
def _write_label_file(path, values):
    """Write one label per line to `path`, overwriting any existing file."""
    with open(path, 'w') as f:
        f.writelines(str(value) + '\n' for value in values)


def _weighted_prf(predictions, labels):
    """Return support-weighted (precision, recall, f1) for two 1-D class arrays.

    Per-class scores are averaged with weights equal to the number of true
    instances of each class. Classes that only occur in `predictions` have
    weight 0, and a per-class precision with no predicted instances counts as
    0.0. Returns zeros for empty input.
    """
    total = labels.size
    if total == 0:
        return 0.0, 0.0, 0.0

    precision_sum = recall_sum = f1_sum = 0.0
    # Only classes present in `labels` carry weight, so iterating them suffices.
    for cls in np.unique(labels):
        predicted = predictions == cls
        actual = labels == cls
        true_pos = int(np.count_nonzero(predicted & actual))
        n_predicted = int(np.count_nonzero(predicted))
        support = int(np.count_nonzero(actual))  # > 0 by construction

        class_precision = true_pos / n_predicted if n_predicted else 0.0
        precision_sum += support * class_precision
        recall_sum += support * (true_pos / support)
        f1_sum += support * (2.0 * true_pos / (n_predicted + support))

    return precision_sum / total, recall_sum / total, f1_sum / total


def custom_metrics(eval_pred):
    """Compute weighted precision / recall / F1 and accuracy for the Trainer.

    The metrics are computed directly with NumPy. `datasets.load_metric` is
    deliberately not used: it had to be called four times on every evaluation
    and is no longer available in datasets >= 3.0.

    Side effect: the predicted and reference labels are written, one per
    line, to './pred_labels.txt' and './actual_labels.txt'. Both files are
    overwritten on every call; the reporting cell at the end of the notebook
    reads them.

    Args:
        eval_pred: a (logits, labels) pair. The predicted class is the argmax
            of `logits` over the last axis.

    Returns:
        dict with float entries "precision", "recall", "f1" (support-weighted
        averages over classes) and "accuracy".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    _write_label_file('./pred_labels.txt', predictions)
    _write_label_file('./actual_labels.txt', labels)

    precision, recall, f1 = _weighted_prf(predictions, labels)
    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

    return {"precision": float(precision), "recall": float(recall), "f1": float(f1), "accuracy": accuracy}

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: 2 epochs, batch size 4 for both training and
# evaluation, evaluation_strategy="epoch".
training_args = TrainingArguments(
    output_dir=output_dir1,
    evaluation_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.05,
)

In [None]:
# Wire the model, tokenizer, tokenized splits and metric callback into one Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=custom_metrics,
)

In [None]:
# Run fine-tuning with the TrainingArguments configured above
# (num_train_epochs=2, evaluation_strategy="epoch").
trainer.train()

In [None]:
# Save the fine-tuned model to the "model" sub-folder of output_dir1 on Drive.
trainer.save_model(output_dir1 + '/model')

In [None]:
# Held-out test CSV on the mounted Drive.
test_file ='/content/gdrive/MyDrive/BERT Action Items/dataset/updated_Testing_verified_meetings_10.csv'

# Tokenization reuses the `tokenize` helper defined in the "Prepare Dataset"
# section. This cell used to re-declare it with an identical body, and the
# duplicate definition silently shadowed the first one.
testset = load_dataset('csv', data_files=test_file)
test_dataset = testset['train']  # the loaded rows are read through the 'train' key
test_dataset = test_dataset.map(tokenize, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/597 [00:00<?, ? examples/s]

In [None]:
# NOTE(review): the reporting cell below reads pred_labels.txt / actual_labels.txt,
# which are written by custom_metrics (this Trainer's compute_metrics callback).
# Presumably this call invokes custom_metrics on the test set and refreshes those
# files. Confirm the test CSV has a label column; otherwise the files may still
# hold output from the last validation run.
trainer.predict(test_dataset)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

def _read_label_file(path):
    """Return the integer labels stored one per line in `path`."""
    with open(path, 'r') as handle:
        return [int(raw.strip()) for raw in handle]


# Label files written by custom_metrics during the most recent evaluation.
pred_labels = _read_label_file('pred_labels.txt')
actual_labels = _read_label_file('actual_labels.txt')

# Both helpers take the reference labels first and the predictions second.
cm = confusion_matrix(actual_labels, pred_labels)
report = classification_report(actual_labels, pred_labels)

print("Confusion Matrix:", cm, sep="\n")
print("\nClassification Report:", report, sep="\n")
