In [1]:
 #install packages
!pip install transformers
#import packages
import pandas as pd
import matplotlib.pyplot as plt
import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np
import random

import logging
import random
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, accuracy_score


def custom_train_test_split(df, test_size=0.2, random_state=None):
    """Split ``df`` into train/test parts while keeping rare classes usable.

    Classes with fewer than five rows are split by hand: a class with a
    single row goes entirely to the test set; otherwise one randomly chosen
    row goes to the test set and the remaining rows go to the training set.
    All other classes are split with a stratified ``train_test_split``.

    Args:
        df: DataFrame with ``ade``, ``soc_code`` and ``label`` columns. Rows
            are selected by index label, so the index is assumed to be unique.
        test_size: Test fraction handed to ``train_test_split`` for the
            classes that have at least five rows.
        random_state: Seed used for the rare-class draw and for the
            stratified split. ``None`` gives a non-reproducible split.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The ``X`` frames contain only
        the ``ade`` and ``soc_code`` columns, the ``y`` series contain
        ``label``; rare-class rows come first in each part.
    """
    # A private generator seeded with random_state produces the same draws as
    # seeding the global NumPy RNG, without resetting that RNG for the caller.
    rng = np.random.RandomState(random_state)

    X = df[['ade', 'soc_code']]
    y = df['label']

    classes, counts = np.unique(y, return_counts=True)
    small_classes = classes[counts < 5]

    train_indices = []
    test_indices = []

    for cls in small_classes:
        cls_idx = df.index[y == cls].tolist()

        # A lone row always goes to the test set; no random draw is needed.
        test_pos = rng.choice(len(cls_idx)) if len(cls_idx) > 1 else 0
        test_indices.append(cls_idx[test_pos])
        train_indices.extend(
            idx for pos, idx in enumerate(cls_idx) if pos != test_pos
        )

    # Select from X and y (not df) so the rare-class rows carry exactly the
    # same columns as the stratified rows they are concatenated with.
    X_train, y_train = X.loc[train_indices], y.loc[train_indices]
    X_test, y_test = X.loc[test_indices], y.loc[test_indices]

    large_class_mask = ~np.isin(y, small_classes)
    if not large_class_mask.any():
        # Every class is rare: there is nothing left for a stratified split.
        return X_train, X_test, y_train, y_test

    X_large = X[large_class_mask]
    y_large = y[large_class_mask]
    X_train_large, X_test_large, y_train_large, y_test_large = train_test_split(
        X_large, y_large, test_size=test_size, random_state=random_state, stratify=y_large
    )

    # Skip empty rare-class parts instead of concatenating empty frames.
    if train_indices:
        X_train = pd.concat([X_train, X_train_large], axis=0)
        y_train = pd.concat([y_train, y_train_large], axis=0)
    else:
        X_train, y_train = X_train_large, y_train_large

    if test_indices:
        X_test = pd.concat([X_test, X_test_large], axis=0)
        y_test = pd.concat([y_test, y_test_large], axis=0)
    else:
        X_test, y_test = X_test_large, y_test_large

    return X_train, X_test, y_train, y_test

#evaluation
def accuracy_per_class(predictions, true_vals):
    """Compute accuracy and sample count separately for every true label.

    Args:
        predictions: 2-D array of per-class scores; the predicted class of a
            row is the argmax along axis 1.
        true_vals: Array of true labels (flattened before use).

    Returns:
        ``(accuracy_dict, count_dict)``, both keyed by the labels present in
        ``true_vals``: the fraction of that label's samples predicted
        correctly, and the number of samples carrying that label.
    """
    predicted = np.argmax(predictions, axis=1).flatten()
    actual = true_vals.flatten()

    accuracy_dict, count_dict = {}, {}
    for label in np.unique(actual):
        selector = actual == label
        wanted = actual[selector]
        n_samples = len(wanted)
        if n_samples > 0:
            accuracy_dict[label] = np.sum(predicted[selector] == wanted) / n_samples
        else:
            accuracy_dict[label] = 0
        count_dict[label] = n_samples

    return accuracy_dict, count_dict

def f1_score_func(preds, labels):
    """Return the weighted-average F1 score of argmax predictions.

    Args:
        preds: 2-D array of per-class scores; argmax along axis 1 gives the
            predicted class.
        labels: Array of true labels (flattened before scoring).
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')


# Per-label precision, recall and F1 for a set of predictions
def calculate_metrics(predictions, true_vals):
    """Return ``(precision, recall, f1)`` arrays, one entry per true label.

    The scores are requested for ``np.unique`` of the flattened true labels,
    so each returned array has one entry per label found in ``true_vals``.

    Args:
        predictions: 2-D array of per-class scores; argmax along axis 1 gives
            the predicted class.
        true_vals: Array of true labels (flattened before scoring).
    """
    y_true = true_vals.flatten()
    y_pred = np.argmax(predictions, axis=1).flatten()

    scores = precision_recall_fscore_support(
        y_true, y_pred, average=None, labels=np.unique(y_true)
    )
    # The fourth element (support) is not returned.
    return scores[0], scores[1], scores[2]
    

# Logging setup: INFO-and-above records go to the run's log file, each line
# prefixed with a timestamp and the level name. `logger` is the root logger.
logging.basicConfig(
    filename='cadec_all_training_40ep_16bs_5e-5lr_log.txt',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger()

class TQDMLoggingWrapper(tqdm):
    """tqdm progress bar that also writes selected events to ``logger``.

    Explicit display messages, manual ``update()`` calls and description
    changes are mirrored to the module-level ``logger``.

    NOTE(review): tqdm's iteration path appears to advance the counter
    without calling ``update()``, so the ``update`` log line is expected only
    for manual ``update()`` calls - confirm if per-iteration logging is wanted.
    """

    def __init__(self, *args, **kwargs):
        # Bind the logger before the parent constructor runs, in case the
        # constructor already reaches one of the overridden methods.
        self.logger = logger
        super().__init__(*args, **kwargs)

    def display(self, msg=None, pos=None):
        """Render the bar; log ``msg`` first when an explicit one is given."""
        if msg is not None:
            self.logger.info(msg)
        super().display(msg, pos)

    def update(self, n=1):
        """Advance the bar by ``n`` and log the description and postfix.

        Returns whatever the parent ``update`` returns.
        """
        result = super().update(n)
        fmt = self.format_dict
        # format_dict exposes the description under 'prefix' (there is no
        # 'desc' key), and both fields may be None or empty.
        desc = fmt.get('prefix') or 'No description'
        postfix = fmt.get('postfix') or ''
        self.logger.info(f'{desc} - {postfix}')
        return result

    def set_description(self, desc=None, refresh=True):
        """Set the bar description and log it when it is non-empty."""
        super().set_description(desc, refresh)
        if desc:
            self.logger.info(f'Set description: {desc}')


# Run the model over a data loader and gather loss, logits and labels
def evaluate(dataloader_val):
    """Evaluate the module-level ``model`` on ``dataloader_val``.

    Uses the module-level ``model`` and ``device``. Each batch is expected to
    provide input ids, attention mask and labels at positions 0, 1 and 2.

    Args:
        dataloader_val: Iterable of batches that also supports ``len()``.

    Returns:
        ``(loss_val_avg, predictions, true_vals)``: the summed batch loss
        divided by the number of batches, the logits of all batches
        concatenated along axis 0, and the matching labels concatenated
        along axis 0 (both as NumPy arrays).
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for raw_batch in dataloader_val:
        on_device = tuple(tensor.to(device) for tensor in raw_batch)
        model_kwargs = {
            'input_ids': on_device[0],
            'attention_mask': on_device[1],
            'labels': on_device[2],
        }

        # No gradients are needed for evaluation.
        with torch.no_grad():
            result = model(**model_kwargs)

        batch_loss, batch_logits = result[0], result[1]
        running_loss += batch_loss.item()

        logit_chunks.append(batch_logits.detach().cpu().numpy())
        label_chunks.append(model_kwargs['labels'].cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)
    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    return mean_loss, all_logits, all_labels


# Data sources (raw files hosted on GitHub).
# CADEC CSV:
cadec_csv_url = "https://raw.githubusercontent.com/FANMISUA/TweetAENormalization/main/ADENormalization/Data/CADEC/3.csv"
# SMM4H TSV:
smm4h_csv_url = "https://raw.githubusercontent.com/FANMISUA/ADE_Norm/main/Data/smm4h_soc.tsv"

# SOC codes kept for classification. The position of a code in this list is
# its class label.
allSMM4H = [
    10037175,
    10018065,
    10029205,
    10017947,
    10028395,
    10022891,
    10027433,
    10040785,
    10038738,
    10022117,
    10015919,
    10038604,
    10047065,
    10021428,
    10041244,
    10007541,
    10038359,
    10021881,
    10013993,
    10019805,
    10042613,
    10029104,
    10077536,
    10010331,
    10014698,
]

# SOC code -> class label (0..24), in the order the codes are listed above.
label_dict = {soc_code: position for position, soc_code in enumerate(allSMM4H)}


# Load the SMM4H file: tab-separated, no header row, two columns.
column_names = ["ade", "soc_code"]
smm4h_all = pd.read_csv(
    smm4h_csv_url,
    sep='\t',
    header=None,
    names=column_names,
)
print("smm4h data:", smm4h_all.shape)

# Non-numeric SOC codes become missing values; rows whose code is 0 are dropped.
smm4h_all['soc_code'] = pd.to_numeric(
    smm4h_all['soc_code'], errors='coerce'
).astype('Int64')
smm4h_all = smm4h_all[smm4h_all['soc_code'] != 0]

# One row per distinct 'ade' text.
smm4h_unique = smm4h_all.drop_duplicates(subset='ade')

print("smm4h data after filtering:", smm4h_all.shape)

# Number of unique ADE texts per SOC code.
smm4h_soc_code_counts = smm4h_unique['soc_code'].value_counts()
print("SOC count in SMM4H: ", smm4h_soc_code_counts)

# Keep only the SOC codes listed in allSMM4H, then the two modelling columns.
smm4h_in_scope = smm4h_unique['soc_code'].isin(allSMM4H)
smm4h_filtered_data3 = smm4h_unique[smm4h_in_scope]
allinSMM4H = smm4h_filtered_data3[['ade', 'soc_code']]

# Load the CADEC file: comma-separated, no header row, four columns.
column_names = ["TT", "llt_code", "ade", "soc_code"]
cadec_all = pd.read_csv(cadec_csv_url, header=None, names=column_names)

# One row per distinct 'ade' text.
cadec_unique = cadec_all.drop_duplicates(subset='ade')

# Number of unique ADE texts per SOC code.
cadec_soc_code_counts = cadec_unique['soc_code'].value_counts()
print("SOC count in CADEC: ", cadec_soc_code_counts)

# Keep only the SOC codes listed in allSMM4H, then the two modelling columns.
cadec_in_scope = cadec_unique['soc_code'].isin(allSMM4H)
cadec_filtered_data3 = cadec_unique[cadec_in_scope]
CADECallinSMM4H = cadec_filtered_data3[['ade', 'soc_code']]


def _attach_labels(frame):
    """Return a copy of ``frame`` with a ``label`` column mapped from ``soc_code``."""
    labelled = frame.copy()
    labelled.loc[:, 'label'] = labelled['soc_code'].map(label_dict)
    return labelled


# SMM4H data with class labels
df1 = _attach_labels(allinSMM4H)

# CADEC data with class labels
df2 = _attach_labels(CADECallinSMM4H)

print("SMM4H :", df1)
print("CADEC :", df2)

# The experiments below run on the CADEC data.
df = df2

# Experiment configuration: twenty even seeds (2, 4, ..., 40) and the
# training hyperparameters shared by every run.
seed_values = [2 * step for step in range(1, 21)]
batch_size, epochs = 16, 40
learningrate = 5e-5

# Placeholder for accuracies: one empty list per class label.
all_accuracies = {class_id: [] for class_id, _ in enumerate(label_dict)}

# Per-seed result store; every metric starts out as an empty list.
_METRIC_NAMES = ('precision', 'recall', 'f1', 'accuracy', 'confusion_matrix')
seed_metrics = {
    seed: {metric: [] for metric in _METRIC_NAMES}
    for seed in seed_values
}


# Main loop over seed values: every seed gets its own train/validation split,
# a freshly initialised BERT classifier, a full training run and one final
# evaluation whose metrics are stored in seed_metrics[seed_val].

# Neither the tokenizer nor the device depends on the seed; build them once.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _encode_texts(texts):
    """Tokenize ``texts`` into tensors padded/truncated to 256 tokens."""
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        # Explicit padding/truncation replaces the deprecated
        # pad_to_max_length=True flag.
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='pt',
    )


for seed_val in seed_values:
    # Seed every RNG involved so a run can be reproduced from its seed.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Data preparation: only the index of each split is used, to tag rows.
    X_train, X_val, y_train, y_val = custom_train_test_split(df, test_size=0.2, random_state=seed_val)

    df['data_type'] = 'not_set'
    df.loc[X_train.index, 'data_type'] = 'train'
    df.loc[X_val.index, 'data_type'] = 'val'

    split_counts = df.groupby(['soc_code', 'label', 'data_type']).count()
    logger.info(split_counts)
    print(split_counts)

    train_rows = df[df.data_type == 'train']
    val_rows = df[df.data_type == 'val']

    encoded_data_train = _encode_texts(train_rows.ade.values)
    encoded_data_val = _encode_texts(val_rows.ade.values)

    dataset_train = TensorDataset(
        encoded_data_train['input_ids'],
        encoded_data_train['attention_mask'],
        torch.tensor(train_rows.label.values),
    )
    dataset_val = TensorDataset(
        encoded_data_val['input_ids'],
        encoded_data_val['attention_mask'],
        torch.tensor(val_rows.label.values),
    )

    # A new model per seed, so runs do not share weights.
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict), output_attentions=False, output_hidden_states=False)

    dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
    dataloader_validation = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=learningrate, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * epochs)

    model.to(device)
    logger.info(f"Device used: {device}")

    # Training loop
    for epoch in TQDMLoggingWrapper(range(1, epochs+1), desc='Epoch Progress'):
        model.train()
        loss_train_total = 0

        progress_bar = TQDMLoggingWrapper(dataloader_train, desc=f'Epoch {epoch}', leave=False, disable=False)
        for batch in progress_bar:
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

            outputs = model(**inputs)
            loss = outputs[0]
            batch_loss = loss.item()
            loss_train_total += batch_loss
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Show the batch loss as reported by the model. It used to be
            # divided by len(batch), which is the number of tensors in the
            # batch tuple (3), not the batch size.
            progress_bar.set_postfix({'training_loss': f'{batch_loss:.3f}'})

        logger.info(f'\nEpoch {epoch}')
        loss_train_avg = loss_train_total / len(dataloader_train)
        logger.info(f'Training loss: {loss_train_avg}')

    # Final evaluation for this seed, after the last epoch.
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    logger.info(f'Validation loss: {val_loss}')

    predicted_labels = np.argmax(predictions, axis=1).flatten()
    true_labels = true_vals.flatten()
    eval_labels = np.unique(true_labels)

    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predicted_labels, average=None, labels=eval_labels
    )

    accuracy = accuracy_score(true_labels, predicted_labels)
    seed_metrics[seed_val]['accuracy'] = accuracy

    conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=eval_labels)
    seed_metrics[seed_val]['confusion_matrix'] = conf_matrix

    # The score arrays are ordered like eval_labels, so pair them by position.
    # Indexing them by label value is wrong (or raises IndexError) whenever a
    # label is absent from the validation set.
    for label, precision_val, recall_val, f1_val in zip(eval_labels, precision, recall, f1):
        seed_metrics[seed_val]['precision'].append((label, precision_val))
        seed_metrics[seed_val]['recall'].append((label, recall_val))
        seed_metrics[seed_val]['f1'].append((label, f1_val))

def _first_value(pairs, wanted):
    """Return the value of the first ``(label, value)`` pair whose label is ``wanted``."""
    return next(val for lbl, val in pairs if lbl == wanted)


# Results file: one tab-separated row per (seed, label) holding precision,
# recall, F1 and the seed's accuracy, followed by that seed's confusion matrix.
with open('cadec_all_20times_results_with_seeds.txt', 'w') as f:
    f.write('Seed\tLabel\tPrecision\tRecall\tF1\tAccuracy\n')

    for seed_val in seed_values:
        per_seed = seed_metrics[seed_val]

        for label, precision_val in per_seed['precision']:
            recall_val = _first_value(per_seed['recall'], label)
            f1_val = _first_value(per_seed['f1'], label)
            accuracy = per_seed['accuracy']
            row = f'{seed_val}\t{label}\t{precision_val:.4f}\t{recall_val:.4f}\t{f1_val:.4f}\t{accuracy:.4f}\n'
            f.write(row)

        # Confusion matrix for this seed, written after its score rows.
        matrix_text = np.array2string(per_seed['confusion_matrix'], separator=', ')
        f.write(f'\nConfusion Matrix for Seed {seed_val}:\n')
        f.write(matrix_text)
        f.write('\n')


# Per-label score lists gathered across all seeds: label -> [value per seed]
precision_dict, recall_dict, f1_dict = {}, {}, {}

for metric_name, bucket in (
    ('precision', precision_dict),
    ('recall', recall_dict),
    ('f1', f1_dict),
):
    for per_seed in seed_metrics.values():
        for label, value in per_seed[metric_name]:
            bucket.setdefault(label, []).append(value)

# Labels in ascending order; all summary lists below follow this order.
labels = sorted(precision_dict.keys())


def _mean_and_std(per_label):
    """Return (means, stds) of ``per_label`` values, ordered like ``labels``."""
    means = [np.mean(per_label[label]) for label in labels]
    stds = [np.std(per_label[label]) for label in labels]
    return means, stds


precision_mean, precision_std = _mean_and_std(precision_dict)
recall_mean, recall_std = _mean_and_std(recall_dict)
f1_mean, f1_std = _mean_and_std(f1_dict)

# Plotting: grouped bars (precision / recall / F1) per label. Bar height is
# the mean across seeds; the error bar is the standard deviation across seeds.
x = np.arange(len(labels))  # one bar group per label
width = 0.25  # width of a single bar

fig, ax = plt.subplots(figsize=(12, 8))

# The title promises mean AND standard deviation, so the computed *_std
# values are drawn as error bars.
bars_precision = ax.bar(x - width, precision_mean, width, yerr=precision_std, capsize=3, label='Precision', color='b')
bars_recall = ax.bar(x, recall_mean, width, yerr=recall_std, capsize=3, label='Recall', color='g')
bars_f1 = ax.bar(x + width, f1_mean, width, yerr=f1_std, capsize=3, label='F1 Score', color='r')

# Labels and title
ax.set_xlabel('Label')
ax.set_ylabel('Performance')
ax.set_title('Mean and Standard Deviation of Precision, Recall, and F1 Score by Label')
ax.set_xticks(x)
ax.set_xticklabels(labels)

# Scores are bounded by [0, 1]; error bars reaching past that range are clipped.
ax.set_ylim(0, 1)

# Move legend outside the plot
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))

# Leave room on the right for the legend, and save before showing.
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.savefig('CADEC_all_20times_results_plot_fix.png')
plt.show()




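# Recorded notebook output (not code):
#   URLError: <urlopen error [Errno 11001] getaddrinfo failed>
# NOTE(review): this is a host-name lookup failure; it was presumably raised
# while fetching one of the CSV URLs, so the cell needs network access.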