<a href="https://colab.research.google.com/github/Anuargharsh/Movie/blob/main/7thOctNer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter

In [4]:
df1 = pd.read_csv('/content/G1 - G1.csv.csv')
df2 = pd.read_csv('/content/G2 - G2.csv.csv')
df3 = pd.read_csv('/content/G3 - G3.csv.csv')

In [5]:
# Preprocessing function for the tags
def preprocess_tags(tags):
    if pd.isna(tags):
        return []
    tag_list = tags.split(',')
    processed_tags = []
    for tag in tag_list:
        if tag:
            start, end, label = tag.split(':')
            processed_tags.append((int(start), int(end), label))
    return processed_tags

In [6]:
# Apply preprocessing to all datasets
df1['tags'] = df1['tags'].apply(preprocess_tags)
df2['tags'] = df2['tags'].apply(preprocess_tags)
df3['tags'] = df3['tags'].apply(preprocess_tags)

In [7]:
# Initialize the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [8]:
# Custom NER Dataset class
class NERDataset(Dataset):
    def __init__(self, texts, tags, tokenizer, label2id, max_len=128):
        self.texts = texts
        self.tags = tags
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tags = self.tags[idx]

        encoding = self.tokenizer(text,
                                  truncation=True,
                                  padding='max_length',
                                  max_length=self.max_len,
                                  return_offsets_mapping=True,
                                  return_tensors='pt')

        labels = ['O'] * len(encoding['input_ids'][0])
        offsets = encoding['offset_mapping'][0]

        for start, end, label in tags:
            for idx, (offset_start, offset_end) in enumerate(offsets):
                if offset_start == start:
                    labels[idx] = 'B-' + label
                elif offset_start > start and offset_start < end:
                    labels[idx] = 'I-' + label

        labels = [self.label2id[label] for label in labels]
        encoding['labels'] = torch.tensor(labels, dtype=torch.long)

        return {key: val.squeeze(0) for key, val in encoding.items()}

In [9]:
# Ensure that 'tags' columns are lists, and filter out any NaN or None values
tags_combined = df1['tags'].dropna().tolist() + df2['tags'].dropna().tolist() + df3['tags'].dropna().tolist()

# Flatten the list of tags while ensuring they are iterable
unique_labels = set(
    label for tags in tags_combined if isinstance(tags, list)  # Check if 'tags' is a list
    for _, _, label in tags if len(tags) >= 3  # Ensure there are at least 3 elements
)

# Create label mappings
label2id = {f'B-{label}': idx for idx, label in enumerate(unique_labels, 1)}
label2id.update({f'I-{label}': idx + len(unique_labels) for idx, label in enumerate(unique_labels, 1)})
label2id['O'] = 0
id2label = {v: k for k, v in label2id.items()}


In [10]:
# Function to train and evaluate a model on a dataset
def train_model(train_texts, train_tags, val_texts, val_tags, label2id, tokenizer, num_labels, device, model_name):
    # Create Dataset objects
    train_dataset = NERDataset(train_texts, train_tags, tokenizer, label2id)
    val_dataset = NERDataset(val_texts, val_tags, tokenizer, label2id)

    # Create DataLoader objects
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)

    # Load pre-trained BERT model for token classification
    model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
    model.to(device)

    # Set up the optimizer
    optimizer = AdamW(model.parameters(), lr=5e-5)
    # Training loop
    model.train()
    for epoch in range(3):  # Train for 3 epochs per task
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}, Loss: {avg_train_loss}')
        # Save model after training on the task
    model.save_pretrained(f'ner_model_{model_name}')
    tokenizer.save_pretrained(f'ner_model_{model_name}')

     # Evaluation
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            all_preds.extend(predictions.cpu().numpy().flatten())
            all_labels.extend(labels.cpu().numpy().flatten())

    # Remove ignored index (pad tokens)
    true_labels = [id2label[label] for label in all_labels if label != -100]
    pred_labels = [id2label[pred] for pred in all_preds if pred != -100]

    print(f"\nClassification Report for {model_name}:\n")
    print(classification_report(true_labels, pred_labels))




In [11]:
# Split datasets into train-test and retain 100 samples from previous tasks
df1_train, df1_test = train_test_split(df1, test_size=0.2, random_state=42)
df2_train, df2_test = train_test_split(df2, test_size=0.2, random_state=42)
df3_train, df3_test = train_test_split(df3, test_size=0.2, random_state=42)

# Retain 100 examples from G1 and G2 for continual learning
df1_sample = df1_train.sample(100, random_state=42)
df2_sample = df2_train.sample(100, random_state=42)

# Continual Learning Process
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Task T1: Train on G1
train_model(df1_train['text'].tolist(), df1_train['tags'].tolist(),
            df1_test['text'].tolist(), df1_test['tags'].tolist(),
            label2id, tokenizer, num_labels=len(label2id), device=device, model_name='task_1')

# Task T2: Train on G2 with 100 examples from G1
combined_g1_g2 = pd.concat([df2_train, df1_sample])
train_model(combined_g1_g2['text'].tolist(), combined_g1_g2['tags'].tolist(),
            df2_test['text'].tolist(), df2_test['tags'].tolist(),
            label2id, tokenizer, num_labels=len(label2id), device=device, model_name='task_2')

# Task T3: Train on G3 with 100 examples from G1 and G2
combined_g1_g2_g3 = pd.concat([df3_train, df1_sample, df2_sample])
train_model(combined_g1_g2_g3['text'].tolist(), combined_g1_g2_g3['tags'].tolist(),
            df3_test['text'].tolist(), df3_test['tags'].tolist(),
            label2id, tokenizer, num_labels=len(label2id), device=device, model_name='task_3')

# Combined Training: Train on G1 + G2 + G3
combined_all = pd.concat([df1, df2, df3])
train_model(combined_all['text'].tolist(), combined_all['tags'].tolist(),
            combined_all.sample(frac=0.2, random_state=42)['text'].tolist(),
            combined_all.sample(frac=0.2, random_state=42)['tags'].tolist(),
            label2id, tokenizer, num_labels=len(label2id), device=device, model_name='combined')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.0899564778715696
Epoch 2, Loss: 0.03411066871890596
Epoch 3, Loss: 0.021974838276788512

Classification Report for task_1:



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                   precision    recall  f1-score   support

   B-allergy_name       0.00      0.00      0.00         1
         B-cancer       0.84      0.81      0.82        26
B-chronic_disease       0.81      0.78      0.79        37
      B-treatment       0.62      0.81      0.70        36
   I-allergy_name       0.83      0.80      0.81       197
         I-cancer       0.87      0.83      0.85      1376
I-chronic_disease       0.87      0.91      0.89      3890
      I-treatment       0.83      0.88      0.85      4047
                O       1.00      0.99      0.99    178806

         accuracy                           0.99    188416
        macro avg       0.74      0.76      0.75    188416
     weighted avg       0.99      0.99      0.99    188416



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.09528784958877824
Epoch 2, Loss: 0.03786908833191116
Epoch 3, Loss: 0.02504933930333353

Classification Report for task_2:



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                   precision    recall  f1-score   support

   B-allergy_name       0.00      0.00      0.00         1
         B-cancer       0.73      0.61      0.67        18
B-chronic_disease       0.91      0.67      0.77        43
      B-treatment       0.68      0.68      0.68        19
   I-allergy_name       0.79      0.81      0.80       251
         I-cancer       0.84      0.86      0.85      1176
I-chronic_disease       0.87      0.88      0.87      3247
      I-treatment       0.90      0.83      0.87      4034
                O       0.99      0.99      0.99    156459

         accuracy                           0.99    165248
        macro avg       0.75      0.70      0.72    165248
     weighted avg       0.99      0.99      0.99    165248



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.1046396903380387
Epoch 2, Loss: 0.03995587453189155
Epoch 3, Loss: 0.025018253481864292

Classification Report for task_3:



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                   precision    recall  f1-score   support

   B-allergy_name       0.00      0.00      0.00         2
         B-cancer       0.90      0.68      0.78        28
B-chronic_disease       0.82      0.66      0.73        47
      B-treatment       0.77      0.74      0.75        31
   I-allergy_name       0.86      0.64      0.74       267
         I-cancer       0.80      0.90      0.85      1311
I-chronic_disease       0.83      0.90      0.86      3474
      I-treatment       0.88      0.87      0.87      4033
                O       0.99      0.99      0.99    151575

         accuracy                           0.99    160768
        macro avg       0.76      0.71      0.73    160768
     weighted avg       0.99      0.99      0.99    160768



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.05683316448243679
Epoch 2, Loss: 0.030007221394182913
Epoch 3, Loss: 0.02331270580254139

Classification Report for combined:

                   precision    recall  f1-score   support

   B-allergy_name       0.83      0.83      0.83         6
         B-cancer       1.00      0.70      0.82        73
B-chronic_disease       0.87      0.95      0.91       130
      B-treatment       0.87      0.84      0.86        82
   I-allergy_name       0.90      0.94      0.92       778
         I-cancer       0.93      0.94      0.94      3870
I-chronic_disease       0.91      0.96      0.94     10237
      I-treatment       0.95      0.96      0.95     11727
                O       1.00      1.00      1.00    487401

         accuracy                           0.99    514304
        macro avg       0.92      0.90      0.91    514304
     weighted avg       0.99      0.99      0.99    514304

