In [1]:
#install packages
!pip install transformers
#import packages
import pandas as pd
import matplotlib.pyplot as plt
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split

import numpy as np
import random


#Read data from git:
#https://raw.githubusercontent.com/FANMISUA/TweetAENormalization/main/ADENormalization/Data/CADEC/3.csv
# URL of the CSV file
csv_url = "https://raw.githubusercontent.com/FANMISUA/TweetAENormalization/main/ADENormalization/Data/CADEC/3.csv"
# read data from smm4h
smm4h_csv_url = "https://raw.githubusercontent.com/FANMISUA/ADE_Norm/main/Data/smm4h_soc.tsv"

top3SMM4H = [10037175, 10018065,10029205]
top3label_dict = {
    10037175: 0,
    10018065: 1,
    10029205: 2
}


# Read the CSV file into a pandas DataFrame
column_names = ["ade", "soc_code"]
smm4h_all = pd.read_csv(smm4h_csv_url,names=column_names, sep = '\t', header=None)

smm4h_all = smm4h_all[smm4h_all['soc_code'] != 0]
smm4h_all['soc_code'] = pd.to_numeric(smm4h_all['soc_code'], errors='coerce').astype('Int64')
smm4h_unique = smm4h_all.drop_duplicates(subset='ade')

# print("smm4h data:",smm4h_all.shape)
smm4h_soc_code_counts = smm4h_unique['soc_code'].value_counts()
# Sort the counts from high to low and print the result
# print("SOC count in CADEC: ",smm4h_soc_code_counts)
# Filter DataFrame
smm4h_filtered_data3 = smm4h_unique[smm4h_unique['soc_code'].isin(top3SMM4H)]
# filtered_data6 = cadec_unique[cadec_unique['soc_code'].isin(top6SMM4H)]

# Select only the Term and SOC columns
top3inSMM4H = smm4h_filtered_data3[['ade', 'soc_code']]
# CADECtop6inSMM4H = filtered_data6[['ade', 'soc_code']]

top3inSMM4H.loc[:, 'label'] = top3inSMM4H['soc_code'].map(top3label_dict)

print("top3 in SMM4H:",top3inSMM4H)


# Read the CSV file into a pandas DataFrame
column_names = ["TT", "llt_code", "ade", "soc_code"]
cadec_all = pd.read_csv(csv_url,names=column_names, header=None)

# Display the first few rows of the DataFrame
# print("cadec raw data:",cadec_all.shape)


# Remove duplicate rows based on the 'ade' column
cadec_unique = cadec_all.drop_duplicates(subset='ade')

# Display the resulting DataFrame
# print("clean cadec data:",cadec_unique.shape)
# Count occurrences of each 'soc_code'
cadec_soc_code_counts = cadec_unique['soc_code'].value_counts()
# Sort the counts from high to low and print the result
# print("SOC count in CADEC: ",cadec_soc_code_counts)


# Filter DataFrame
cadec_filtered_data3 = cadec_unique[cadec_unique['soc_code'].isin(top3SMM4H)]
# filtered_data6 = cadec_unique[cadec_unique['soc_code'].isin(top6SMM4H)]

# Select only the Term and SOC columns
CADECtop3inSMM4H = cadec_filtered_data3[['ade', 'soc_code']]
# CADECtop6inSMM4H = filtered_data6[['ade', 'soc_code']]


print("CADEC top3 in SMM4H:",CADECtop3inSMM4H)

df1 = top3inSMM4H
df2 = CADECtop3inSMM4H
df1.loc[:, 'label'] = df1['soc_code'].map(top3label_dict)
df2.loc[:, 'label'] = df2['soc_code'].map(top3label_dict)

print("SMM4H top 3",df1)
print("CADEC top 3",df2)
# #modify
# #not combine data; just combine
# # Concatenate the DataFrames
# combined_df = pd.concat([CADECtop3inSMM4H, top3inSMM4H])

# # Remove duplicates
# # combined_df = combined_df.drop_duplicates()
# # Identify duplicates (including the first occurrence)
# duplicates = combined_df[combined_df.duplicated(keep=False)]

# print("Duplicates:")
# print(duplicates)
# # print("Combined top 3 data:",df)

# # Remove duplicates
# combined_df = combined_df.drop_duplicates()
# # df['label'] = df['soc_code'].map(top3label_dict)
# combined_df.loc[:, 'label'] = combined_df['soc_code'].map(top3label_dict)



# print("Combined top 3 data with label:",combined_df)
# df = combined_df

def custom_train_test_split(X, y, test_size=0.2, random_state=None):
    classes, counts = np.unique(y, return_counts=True)
    min_class_count = min(counts)

    # Find classes with only one or two instances
    single_or_double_instance_classes = classes[np.logical_or(counts == 1, counts == 2)]

    # Remove instances of single-instance or two-instance classes
    X_filtered = X[~np.isin(y, single_or_double_instance_classes)]
    y_filtered = y[~np.isin(y, single_or_double_instance_classes)]

    if len(y_filtered) < 2:
        raise ValueError("No classes have more than two instances after filtering.")

    # Perform stratified split on the filtered dataset
    X_train, X_val, y_train, y_val = train_test_split(X_filtered, y_filtered, test_size=test_size, random_state=random_state, stratify=y_filtered)

    # Randomly assign instances of single-instance classes to training or testing sets
    for class_label in single_or_double_instance_classes:
        class_indices = np.where(y == class_label)[0]
        np.random.shuffle(class_indices)

        if len(class_indices) <= 2:
            # Only one instance, randomly assign to training or testing set
            if np.random.rand() < test_size:
                X_val = np.concatenate((X_val, X[class_indices]))
                y_val = np.concatenate((y_val, y[class_indices]))
            else:
                X_train = np.concatenate((X_train, X[class_indices]))
                y_train = np.concatenate((y_train, y[class_indices]))

    return X_train, X_val, y_train, y_val


#evaluation
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')


def accuracy_per_class(predictions, true_vals):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = true_vals.flatten()

    accuracy_dict = {}
    count_dict = {}

    for label in np.unique(labels_flat):
        y_preds = pred_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        accuracy_dict[label] = np.sum(y_preds == y_true) / len(y_true) if len(y_true) > 0 else 0
        count_dict[label] = len(y_true)

    return accuracy_dict, count_dict


top3 in SMM4H:                             ade  soc_code  label
3                            AD  10037175      0
4                         focus  10029205      2
5                          died  10018065      1
8                        dreams  10037175      0
10                   withdrawal  10018065      1
...                         ...       ...    ...
1695       talk a mile a minute  10037175      0
1698     can't go back to sleep  10037175      0
1703                 chest hurt  10018065      1
1704   got ten minutes of sleep  10037175      0
1708  never have another orgasm  10037175      0

[734 rows x 3 columns]
CADEC top3 in SMM4H:                             ade  soc_code
926            voracious hunger  10018065
927            loss of appetite  10018065
929            lack of appetite  10018065
931                    anorexia  10018065
932                    anorexic  10018065
...                         ...       ...
5326  short term memory lacking  10037175
5328      couldn

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.loc[:, 'label'] = df2['soc_code'].map(top3label_dict)


In [2]:
import logging
import random
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score as f1_score_func
from tqdm import tqdm

# Configure logging
logging.basicConfig(filename='Combined_top3_20times_training_log.txt', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

class TQDMLoggingWrapper(tqdm):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.logger = logger

    def display(self, msg=None, pos=None):
        if msg is not None:
            self.logger.info(msg)
        super().display(msg, pos)

    def update(self, n=1):
        super().update(n)
        desc = self.format_dict.get('desc', 'No description')
        postfix = self.format_dict.get('postfix', '')
        self.logger.info(f'{desc} - {postfix}')

    def set_description(self, desc=None, refresh=True):
        super().set_description(desc, refresh)
        if desc:
            self.logger.info(f'Set description: {desc}')


# Define the random seeds and other parameters
seed_values = list(range(2, 42, 2))
batch_size = 64
epochs = 40
learningrate = 1e-4

# Placeholder for accuracies
all_accuracies = {label: [] for label in range(len(top3label_dict))}

# Function to evaluate the model
def evaluate(dataloader_val):
    model.eval()
    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total / len(dataloader_val)
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

# Main loop over seed values
for seed_val in seed_values:
    # Set seeds
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # # Data preparation
    # X_train, X_val, y_train, y_val = custom_train_test_split(df.index.values, df.label.values, test_size=0.2, random_state=seed_val)
    # df['data_type'] = ['not_set'] * df.shape[0]
    # df.loc[X_train, 'data_type'] = 'train'
    # df.loc[X_val, 'data_type'] = 'val'
    # # logger.info(df.groupby(['soc_code', 'label', 'data_type']).count())
    
    # Perform train-test split on df1
    X_train_idx1, X_val_idx1, y_train1, y_val1 = custom_train_test_split(df1.index.values, df1.label.values, test_size=0.2, random_state=seed_val)
    
    # Perform train-test split on df2
    X_train_idx2, X_val_idx2, y_train2, y_val2 = custom_train_test_split(df2.index.values, df2.label.values, test_size=0.2, random_state=seed_val)
    
    # Combine the training indices and labels from df1 and df2
    X_train_combined = np.concatenate((X_train_idx1, X_train_idx2))
    y_train_combined = np.concatenate((y_train1, y_train2))
    
    # Combine the validation indices and labels from df1 and df2
    X_val_combined = np.concatenate((X_val_idx1, X_val_idx2))
    y_val_combined = np.concatenate((y_val1, y_val2))
    
    # Optionally, you can set the 'data_type' column for df1 and df2
    df1['data_type'] = 'not_set'
    df2['data_type'] = 'not_set'
    
    df1.loc[X_train_idx1, 'data_type'] = 'train'
    df1.loc[X_val_idx1, 'data_type'] = 'val'
    
    df2.loc[X_train_idx2, 'data_type'] = 'train'
    df2.loc[X_val_idx2, 'data_type'] = 'val'
    
    #  combine df1 and df2 into a single dataframe:
    df = pd.concat([df1, df2])

    # Print the DataFrame with the 'data_type' column
    print("Combined DataFrame with 'data_type' column:\n", df)
        

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    encoded_data_train = tokenizer.batch_encode_plus(
        df[df.data_type == 'train'].ade.values,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=256,
        return_tensors='pt'
    )

    encoded_data_val = tokenizer.batch_encode_plus(
        df[df.data_type == 'val'].ade.values,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=256,
        return_tensors='pt'
    )

    input_ids_train = encoded_data_train['input_ids']
    attention_masks_train = encoded_data_train['attention_mask']
    labels_train = torch.tensor(df[df.data_type == 'train'].label.values)

    input_ids_val = encoded_data_val['input_ids']
    attention_masks_val = encoded_data_val['attention_mask']
    labels_val = torch.tensor(df[df.data_type == 'val'].label.values)

    dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
    dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(top3label_dict), output_attentions=False, output_hidden_states=False)

    dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
    dataloader_validation = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=learningrate, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * epochs)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    logger.info(f"Device used: {device}")

    # Training loop
    for epoch in TQDMLoggingWrapper(range(1, epochs+1), desc='Epoch Progress'):
        model.train()
        loss_train_total = 0

        progress_bar = TQDMLoggingWrapper(dataloader_train, desc=f'Epoch {epoch}', leave=False, disable=False)
        for batch in progress_bar:
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

            outputs = model(**inputs)
            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            progress_bar.set_postfix({'training_loss': f'{loss.item()/len(batch):.3f}'})

        # torch.save(model.state_dict(), f'./ADENorm_top3_epoch_{epoch}.model')

        logger.info(f'\nEpoch {epoch}')
        loss_train_avg = loss_train_total / len(dataloader_train)
        logger.info(f'Training loss: {loss_train_avg}')

        val_loss, predictions, true_vals = evaluate(dataloader_validation)
        val_f1 = f1_score_func(true_vals, np.argmax(predictions, axis=1), average='weighted')
        logger.info(f'Validation loss: {val_loss}')
        logger.info(f'F1 Score (Weighted): {val_f1}')

    _, predictions, true_vals = evaluate(dataloader_validation)
    accuracy_dict, count_dict = accuracy_per_class(predictions, true_vals)

    for label, accuracy in accuracy_dict.items():
        all_accuracies[label].append(accuracy)
    logger.info(f'Seed {seed_val} - Accuracy: {accuracy_dict} - Count: {count_dict}')

# Compute average and standard deviation of accuracy
avg_accuracy = {label: np.mean(accs) for label, accs in all_accuracies.items()}
std_accuracy = {label: np.std(accs) for label, accs in all_accuracies.items()}

# Save accuracies to file
with open('Combined_top3_20times_accuracies.txt', 'w') as f:
    # Write header for the accuracies
    f.write('Label\tSeed\tAccuracy\n')
    # Write individual accuracies
    for label in all_accuracies:
        accuracies = all_accuracies[label]
        for i, acc in enumerate(accuracies):
            f.write(f'{label}\tSeed_{i+1}\t{acc:.4f}\n')
    
    # Write the average and standard deviation
    f.write('\nLabel\tAverage Accuracy\tStandard Deviation\n')
    for label in all_accuracies:
        avg_acc = avg_accuracy.get(label, 'N/A')
        std_acc = std_accuracy.get(label, 'N/A')
        f.write(f'{label}\t{avg_acc:.4f}\t{std_acc:.4f}\n')

# Log the final results
logger.info('All accuracies: {}'.format(all_accuracies))
logger.info('Average Accuracy: {}'.format(avg_accuracy))
logger.info('Standard Deviation of Accuracy: {}'.format(std_accuracy))

# print("Average Accuracy:", avg_accuracy)
# print("Standard Deviation of Accuracy:", std_accuracy)


Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2     train
5                          died  10018065      1     train
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1     train
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0       val
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0     train
5332   Disturbed sleep patterns  10037175      0       val

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
  attn_output = torch.nn.functional.scaled_dot_product_attention(

Epoch 1:   0%|                                                             | 0/26 [00:01<?, ?it/s, training_loss=0.390][A
Epoch 1:   4%|██        

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0       val
4                         focus  10029205      2     train
5                          died  10018065      1     train
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1       val
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0       val
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0       val
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:01<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2     train
5                          died  10018065      1     train
8                        dreams  10037175      0       val
10                   withdrawal  10018065      1       val
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0     train
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:01<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2     train
5                          died  10018065      1     train
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1       val
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0       val
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:01<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2     train
5                          died  10018065      1     train
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1     train
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0       val
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0       val
5331           can't eat normal  10037175      0     train
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:00<?, ?it/s, training_loss=0.441][A
Epoch 1:   4%|██                                                   | 1/26 [00:00<00:24,  1.03it/s, training_loss=0.441][A
Epoch 1:   4%|██                                                   | 1/26 [00:01<00:24,  1.03it/s, training_loss=0.350][A
Epoch 1:   8%|████                                                 | 2/26 [00:01<00:20,  1.15it

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2       val
5                          died  10018065      1     train
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1     train
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0     train
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:01<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0       val
4                         focus  10029205      2     train
5                          died  10018065      1     train
8                        dreams  10037175      0       val
10                   withdrawal  10018065      1     train
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0       val
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0     train
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:01<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2       val
5                          died  10018065      1     train
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1     train
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0       val
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:01<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2     train
5                          died  10018065      1     train
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1     train
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0       val
5328      couldn't eat or drink  10037175      0       val
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0     train
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:00<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2     train
5                          died  10018065      1       val
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1     train
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0     train
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:00<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2     train
5                          died  10018065      1       val
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1       val
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0     train
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:01<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2       val
5                          died  10018065      1     train
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1     train
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0       val
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0       val
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:00<?, ?it/s, training_loss=0.347][A
Epoch 1:   4%|██                                                   | 1/26 [00:00<00:24,  1.04it/s, training_loss=0.347][A
Epoch 1:   4%|██                                                   | 1/26 [00:01<00:24,  1.04it/s, training_loss=0.355][A
Epoch 1:   8%|████                                                 | 2/26 [00:01<00:20,  1.18it

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2       val
5                          died  10018065      1     train
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1       val
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0       val
5328      couldn't eat or drink  10037175      0       val
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0     train
5332   Disturbed sleep patterns  10037175      0       val

[2075 rows x 4 columns]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:00<?, ?it/s, training_loss=0.395][A
Epoch 1:   4%|██                                                   | 1/26 [00:00<00:24,  1.03it/s, training_loss=0.395][A
Epoch 1:   4%|██                                                   | 1/26 [00:01<00:24,  1.03it/s, training_loss=0.357][A
Epoch 1:   8%|████                                                 | 2/26 [00:01<00:20,  1.15it

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2     train
5                          died  10018065      1     train
8                        dreams  10037175      0       val
10                   withdrawal  10018065      1     train
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0       val
5332   Disturbed sleep patterns  10037175      0       val

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:01<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2     train
5                          died  10018065      1     train
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1     train
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0     train
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:01<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2     train
5                          died  10018065      1     train
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1     train
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0     train
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:01<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2     train
5                          died  10018065      1     train
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1       val
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0       val
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:01<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2     train
5                          died  10018065      1       val
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1       val
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0     train
5332   Disturbed sleep patterns  10037175      0       val

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:00<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0       val
4                         focus  10029205      2     train
5                          died  10018065      1       val
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1     train
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0       val
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0     train
5332   Disturbed sleep patterns  10037175      0     train

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:00<?,

Combined DataFrame with 'data_type' column:
                             ade  soc_code  label data_type
3                            AD  10037175      0     train
4                         focus  10029205      2     train
5                          died  10018065      1     train
8                        dreams  10037175      0     train
10                   withdrawal  10018065      1     train
...                         ...       ...    ...       ...
5326  short term memory lacking  10037175      0     train
5328      couldn't eat or drink  10037175      0     train
5329              Could not eat  10037175      0     train
5331           can't eat normal  10037175      0       val
5332   Disturbed sleep patterns  10037175      0       val

[2075 rows x 4 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch Progress:   0%|                                                                           | 0/40 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                  | 0/26 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                             | 0/26 [00:01<?,