In [1]:
#install packages
!pip install transformers
#import packages
import pandas as pd
import matplotlib.pyplot as plt
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split

import numpy as np
import random

#Read smm4h data from git:
#https://raw.githubusercontent.com/FANMISUA/ADE_Norm/main/Data/smm4h_soc.tsv
# URL of the CSV file
# csv_url = "https://raw.githubusercontent.com/FANMISUA/TweetAENormalization/main/ADENormalization/Data/CADEC/3.csv"
csv_url = "https://raw.githubusercontent.com/FANMISUA/ADE_Norm/main/Data/smm4h_soc.tsv"

# Read the CSV file into a pandas DataFrame
column_names = ["ade", "soc_code"]
smm4h_all = pd.read_csv(csv_url,names=column_names, sep = '\t', header=None)

smm4h_all = smm4h_all[smm4h_all['soc_code'] != 0]
smm4h_all['soc_code'] = pd.to_numeric(smm4h_all['soc_code'], errors='coerce').astype('Int64')

print(smm4h_all.head)
# Display the first few rows of the DataFrame
print(smm4h_all.shape)


# Remove duplicate rows based on the 'ade' column
# smm4h_unique = smm4h_all
smm4h_unique = smm4h_all.drop_duplicates(subset='ade')


# Display the resulting DataFrame
print(smm4h_unique.shape)
# Count occurrences of each 'soc_code'
soc_code_counts = smm4h_unique['soc_code'].value_counts()
# Sort the counts from high to low and print the result
print("SOC count in CADEC: ",soc_code_counts)

#get top 3 of the SMM4H list
#['10037175','10018065','10029205','10017947''10028395','10022891']
# top6SMM4H = [10018065,10037175,10029205,10022891,10028395,10017947]
top3SMM4H = [10037175, 10018065,10029205]
# top3SMM4H = ['10018065', '10037175', '10029205']

top3label_dict = {
    'Label': [0, 1, 2],
    'soc_code': [10037175,10018065,10029205]
}
top3label_dict = {
    10037175: 0,
    10018065: 1,
    10029205: 2
}


# Filter DataFrame
filtered_data3 = smm4h_unique[smm4h_unique['soc_code'].isin(top3SMM4H)]
# filtered_data6 = cadec_unique[cadec_unique['soc_code'].isin(top6SMM4H)]

# Select only the Term and SOC columns
top3inSMM4H = filtered_data3[['ade', 'soc_code']]
# CADECtop6inSMM4H = filtered_data6[['ade', 'soc_code']]

print("CADEC top3 in SMM4H:",top3inSMM4H)
data = top3inSMM4H
# Convert to DataFrame
df = pd.DataFrame(top3inSMM4H)

#mapping dictionary for soc to label
# soc_code_to_label = dict(zip(top3label_dict['soc_code'], top3label_dict['Label']))
df['label'] = df['soc_code'].map(top3label_dict)

# Replace `soc_code` values with corresponding labels
# df['label'] = df['soc_code'].replace(soc_code_to_label)
# print(top3label_dict)
print(df)


def custom_train_test_split(X, y, test_size=0.2, random_state=None):
    classes, counts = np.unique(y, return_counts=True)
    min_class_count = min(counts)

    # Find classes with only one or two instances
    single_or_double_instance_classes = classes[np.logical_or(counts == 1, counts == 2)]

    # Remove instances of single-instance or two-instance classes
    X_filtered = X[~np.isin(y, single_or_double_instance_classes)]
    y_filtered = y[~np.isin(y, single_or_double_instance_classes)]

    if len(y_filtered) < 2:
        raise ValueError("No classes have more than two instances after filtering.")

    # Perform stratified split on the filtered dataset
    X_train, X_val, y_train, y_val = train_test_split(X_filtered, y_filtered, test_size=test_size, random_state=random_state, stratify=y_filtered)

    # Randomly assign instances of single-instance classes to training or testing sets
    for class_label in single_or_double_instance_classes:
        class_indices = np.where(y == class_label)[0]
        np.random.shuffle(class_indices)

        if len(class_indices) <= 2:
            # Only one instance, randomly assign to training or testing set
            if np.random.rand() < test_size:
                X_val = np.concatenate((X_val, X[class_indices]))
                y_val = np.concatenate((y_val, y[class_indices]))
            else:
                X_train = np.concatenate((X_train, X[class_indices]))
                y_train = np.concatenate((y_train, y[class_indices]))

    return X_train, X_val, y_train, y_val


#evaluation
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')


def accuracy_per_class(predictions, true_vals):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = true_vals.flatten()

    accuracy_dict = {}
    count_dict = {}

    for label in np.unique(labels_flat):
        y_preds = pred_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        accuracy_dict[label] = np.sum(y_preds == y_true) / len(y_true) if len(y_true) > 0 else 0
        count_dict[label] = len(y_true)

    return accuracy_dict, count_dict

<bound method NDFrame.head of                             ade  soc_code
0                           ade      <NA>
1                     allergies  10021428
2               HURT YOUR Liver  10019805
3                            AD  10037175
4                         focus  10029205
...                         ...       ...
1707                     orgasm  10037175
1708  never have another orgasm  10037175
1709                       coma  10029205
1710        gain so much weight  10022891
1711         increase my weight  10022891

[1712 rows x 2 columns]>
(1712, 2)
(1107, 2)
SOC count in CADEC:  soc_code
10037175    287
10018065    235
10029205    212
10017947     63
10028395     58
10022891     54
10027433     48
10040785     28
10038738     22
10022117     16
10015919     16
10038604     10
10047065     10
10021428      8
10007541      7
10041244      7
10038359      6
10021881      5
10013993      4
10019805      2
10042613      2
10029104      2
10010331      1
10077536      1
0     

In [None]:
import logging
import random
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score as f1_score_func
from tqdm import tqdm

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Configure logging
logging.basicConfig(filename='smm4h_top3_20times_training_log.txt', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

class TQDMLoggingWrapper(tqdm):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.logger = logger

    def display(self, msg=None, pos=None):
        if msg is not None:
            self.logger.info(msg)
        super().display(msg, pos)

    def update(self, n=1):
        super().update(n)
        desc = self.format_dict.get('desc', 'No description')
        postfix = self.format_dict.get('postfix', '')
        self.logger.info(f'{desc} - {postfix}')

    def set_description(self, desc=None, refresh=True):
        super().set_description(desc, refresh)
        if desc:
            self.logger.info(f'Set description: {desc}')


# Define the random seeds and other parameters
seed_values = list(range(2, 4, 2))
# batch_size = 8
# epochs = 10
# Define parameter grid
learning_rates = [1e-5, 1e-4, 1e-3]
batch_sizes = [4, 8, 32, 64]
epochs_list = [5, 10, 15, 20]

# Results storage
results = []

# Placeholder for accuracies
all_accuracies = {label: [] for label in range(len(top3label_dict))}

# Function to evaluate the model
def evaluate(dataloader_val):
    model.eval()
    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total / len(dataloader_val)
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

# Main loop over seed values
for seed_val in seed_values:
  # Set seeds
  random.seed(seed_val)
  np.random.seed(seed_val)
  torch.manual_seed(seed_val)
  torch.cuda.manual_seed_all(seed_val)

  # Data preparation
  X_train, X_val, y_train, y_val = custom_train_test_split(df.index.values, df.label.values, test_size=0.2, random_state=seed_val)
  df['data_type'] = ['not_set'] * df.shape[0]
  df.loc[X_train, 'data_type'] = 'train'
  df.loc[X_val, 'data_type'] = 'val'
  # logger.info(df.groupby(['soc_code', 'label', 'data_type']).count())

  # Training loop for grid search
  for lr in learning_rates:
      for batch_size in batch_sizes:
          for epochs in epochs_list:
            logger.info(f"Seed: {seed_val}, Learning Rate: {lr}, Batch Size: {batch_size}, Epochs: {epochs}")
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
            encoded_data_train = tokenizer.batch_encode_plus(
                df[df.data_type == 'train'].ade.values,
                add_special_tokens=True,
                return_attention_mask=True,
                pad_to_max_length=True,
                max_length=256,
                return_tensors='pt'
            )

            encoded_data_val = tokenizer.batch_encode_plus(
                df[df.data_type == 'val'].ade.values,
                add_special_tokens=True,
                return_attention_mask=True,
                pad_to_max_length=True,
                max_length=256,
                return_tensors='pt'
            )

            input_ids_train = encoded_data_train['input_ids']
            attention_masks_train = encoded_data_train['attention_mask']
            labels_train = torch.tensor(df[df.data_type == 'train'].label.values)

            input_ids_val = encoded_data_val['input_ids']
            attention_masks_val = encoded_data_val['attention_mask']
            labels_val = torch.tensor(df[df.data_type == 'val'].label.values)

            dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
            dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

            model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(top3label_dict), output_attentions=False, output_hidden_states=False)

            dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
            dataloader_validation = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size)

            optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * epochs)

            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            model.to(device)
            logger.info(f"Device used: {device}")

            # Training loop
            for epoch in TQDMLoggingWrapper(range(1, epochs+1), desc='Epoch Progress'):
                model.train()
                loss_train_total = 0

                progress_bar = TQDMLoggingWrapper(dataloader_train, desc=f'Epoch {epoch}', leave=False, disable=False)
                for batch in progress_bar:
                    model.zero_grad()
                    batch = tuple(b.to(device) for b in batch)
                    inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

                    outputs = model(**inputs)
                    loss = outputs[0]
                    loss_train_total += loss.item()
                    loss.backward()

                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    scheduler.step()

                    progress_bar.set_postfix({'training_loss': f'{loss.item()/len(batch):.3f}'})

                # torch.save(model.state_dict(), f'./ADENorm_top3_epoch_{epoch}.model')

                # logger.info(f'\nEpoch {epoch}')
                loss_train_avg = loss_train_total / len(dataloader_train)
                # logger.info(f'Training loss: {loss_train_avg}')

                val_loss, predictions, true_vals = evaluate(dataloader_validation)
                val_f1 = f1_score_func(true_vals, np.argmax(predictions, axis=1), average='weighted')
                # logger.info(f'Validation loss: {val_loss}')
                # logger.info(f'F1 Score (Weighted): {val_f1}')

            _, predictions, true_vals = evaluate(dataloader_validation)
            accuracy_dict, count_dict = accuracy_per_class(predictions, true_vals)

            for label, accuracy in accuracy_dict.items():
                all_accuracies[label].append(accuracy)


            # Calculate the average accuracy for each label
            avg_accuracy = {label: np.mean(accs) for label, accs in all_accuracies.items()}

            # Calculate the overall average accuracy across all labels
            overall_avg_accuracy = np.mean(list(avg_accuracy.values()))

            logger.info(f'Seed {seed_val} - Accuracy: {overall_avg_accuracy} - Count: {count_dict} - lr: {lr} -batchsize:{batch_size} -epochs:{epochs}')
            #store results
            results.append((lr, batch_size, epochs, overall_avg_accuracy))


# Extract each parameter and accuracy for plotting
learning_rates = [result[0] for result in results]
batch_sizes = [result[1] for result in results]
epochs = [result[2] for result in results]
accuracies = [result[3] for result in results]

# Find the best result based on accuracy
best_result = max(results, key=lambda x: x[3])
print(f"Best result: LR={best_result[0]}, Batch={best_result[1]}, Epoch={best_result[2]}, Accuracy={best_result[3]:.4f}")

# Create a 3D plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot the data points
sc = ax.scatter(learning_rates, batch_sizes, epochs, c=accuracies, cmap='viridis', s=100, edgecolors='k')

# Add color bar
cbar = plt.colorbar(sc)
cbar.set_label('Overall Average Accuracy')

# Set labels
ax.set_xlabel('Learning Rate')
ax.set_ylabel('Batch Size')
ax.set_zlabel('Epochs')

# Title
ax.set_title('Hyperparameter Tuning Results for smm4h top3')

# Save the plot to a file
plt.savefig("hyperparameter_tuning_3d_plot.png", dpi=300, bbox_inches='tight')

# Show the plot
plt.show()


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|          | 0/5 [00:00<?, ?it/s]
Epoch 1:   0%|          | 0/147 [00:00<?, ?it/s][A
Epoch 1:   0%|          | 0/147 [00:00<?, ?it/s, training_loss=0.371][A
Epoch 1:   1%|          | 1/147 [00:00<00:44,  3.25it/s, training_loss=0.371][A
Epoch 1:   1%|          | 1/147 [00:00<00:44,  3.25it/s, training_loss=0.3