In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data import and cleaning up the data

In [None]:
df = pd.read_csv("/kaggle/input/crypto-regulations-yirifi/mappingtoBERT_6.csv")

In [None]:
df.shape

In [None]:
df.tail(10)

In [None]:
import re
# Function to check for URLs
def contains_url(text):
    """Return True when the stringified input contains a URL-like token.

    Any value is accepted; it is converted with str() before matching.
    """
    match = re.search(r'http\S+|www\S+|https\S+', str(text))
    return match is not None

# Function to check for user references (@user)
def contains_user_reference(text):
    """Return True when the stringified input contains an '@' followed by a word character."""
    match = re.search(r'\@\w+', str(text))
    return match is not None

# Function to check for hashtags
def contains_hashtag(text):
    """Return True when the stringified input contains a '#' followed by a word character."""
    match = re.search(r'\#\w+', str(text))
    return match is not None

# Function to check for non-alphanumeric characters
def contains_non_alphanumeric(text):
    """Return True when the stringified input has any character other than
    ASCII letters, digits or whitespace."""
    match = re.search(r'[^a-zA-Z0-9\s]', str(text))
    return match is not None

# Add one boolean flag column per detector; each column is named after its function
for detector in (contains_url, contains_user_reference, contains_hashtag, contains_non_alphanumeric):
    df[detector.__name__] = df['Content'].apply(detector)

# Summing a boolean column counts the rows where the flag is True
url_count = df['contains_url'].sum()
user_reference_count = df['contains_user_reference'].sum()
hashtag_count = df['contains_hashtag'].sum()
non_alphanumeric_count = df['contains_non_alphanumeric'].sum()

# Report how many rows contain each kind of noise
print(f"Number of rows with URLs: {url_count}")
print(f"Number of rows with user references: {user_reference_count}")
print(f"Number of rows with hashtags: {hashtag_count}")
print(f"Number of rows with non-alphanumeric characters: {non_alphanumeric_count}")



In [None]:
df.tail(5)

In [None]:
# Keep only the text and its labels. The explicit copy makes the result an independent
# frame, so the later in-place assignment to df['Content'] does not write into a slice
# of the original frame (which triggers pandas' SettingWithCopyWarning).
df = df[["Content","bert_concepts"]].copy()

In [None]:
import re
import string

def preprocess_text(text):
    """Normalize one raw value into lowercase alphanumeric words separated by single spaces.

    Steps, in order: drop HTML tags, lowercase, drop URLs, drop @mentions and
    '#' signs, delete ASCII punctuation, replace any remaining non-alphanumeric
    character with a space, collapse whitespace.

    Args:
        text: the raw value; non-string values are stringified first.

    Returns:
        The cleaned string. Missing values (None or a float NaN) yield ''
        instead of the literal words 'none' / 'nan'.
    """
    # NaN is the only float that does not compare equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove @user references and the '#' sign (the hashtag word is kept)
    text = text.translate(str.maketrans('', '', string.punctuation))  # Delete ASCII punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)  # Replace any other non-alphanumeric character with a space
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse runs of whitespace into a single space

    return text

df['Content'] = df['Content'].apply(preprocess_text) # Applying the preprocess_text function


In [None]:
df.tail(5)

In [None]:
df.shape

In [None]:
# Count the rows whose bert_concepts value is missing
nan_mask = df['bert_concepts'].isna()
missing_values_count = nan_mask.sum()
print("Number of rows with NaN in bert_concepts:", missing_values_count)

## Remove rows with no bert_concepts

In [None]:
df = df.dropna(subset=['bert_concepts'])

In [None]:
df.shape

# Converting to Bert Data Format

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [None]:
df.columns

In [None]:

# Turn the comma-separated label string of every row into a list of clean label names.
# - stray whitespace is stripped and empty pieces are dropped, so "a" and "a " are one label
# - the isinstance guard keeps the cell re-runnable: rows that are already lists are left alone
df['bert_concepts'] = df['bert_concepts'].apply(
    lambda x: x if isinstance(x, list) else [c.strip() for c in x.split(', ') if c.strip()]
)
# Sorted list of all unique concepts. A bare set would iterate in a different order in every
# kernel session (string hashes are randomized per process), which would silently reshuffle
# the one-hot columns and the id2label mapping between runs.
all_concepts = sorted({concept for row in df['bert_concepts'] for concept in row})


In [None]:
all_concepts

In [None]:
# Build one record per row: the text plus a True/False flag for every known concept
one_hot_encoded_rows = [
    {'Content': content, **{concept: (concept in concepts) for concept in all_concepts}}
    for content, concepts in zip(df['Content'], df['bert_concepts'])
]


In [None]:
one_hot_encoded_rows[:2]

In [None]:
df = pd.DataFrame(one_hot_encoded_rows)

In [None]:
df.head(3)

In [None]:
import pandas as pd
from datasets import Dataset

In [None]:
# Hold out 30% of the rows, then divide that hold-out again: the second call's
# test_size=0.33 share goes to validation_df, the remainder to test_df.
train_df, test_valid_df = train_test_split(df, test_size=0.3, random_state=42)
test_df, validation_df = train_test_split(test_valid_df, test_size=0.33, random_state=42)

# Wrap each pandas split as a Hugging Face Dataset
train_dataset, test_dataset, validation_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df, validation_df)
)

# Bundle the three splits under their split names
dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "validation": validation_dataset,
})




In [None]:
dataset

In [None]:
[i for i in dataset['train']][:2]

# Data Ready Now Training

In [None]:
!pip install -q transformers datasets

Add a linear layer on top of the base model, which is used to produce a tensor of shape (batch_size, num_labels), indicating the unnormalized scores for a number of labels for every example in the batch.

### List containing all the labels

In [None]:
# Every dataset column except the text and the pandas index column is a label
non_label_columns = ('Content', '__index_level_0__')
labels = [name for name in dataset['train'].features if name not in non_label_columns]
# Two lookup tables: position -> label name, and label name -> position
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}
labels

In [None]:
id2label

# Tokenization

### As models like BERT don't expect text as direct input, but rather input_ids, etc., we tokenize the text using the tokenizer. We will use AutoTokenizer API, which will automatically load the appropriate tokenizer based on the checkpoint on the hub.

### What's a bit tricky is that we also need to provide labels to the model. For multi-label text classification, this is a matrix of shape (batch_size, num_labels). Also important: this should be a tensor of floats rather than integers, otherwise PyTorch's BCEWithLogitsLoss (which the model will use) will complain.

In [None]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

def preprocess_data(examples, max_length=128):
    """Tokenize a batch of examples and attach its multi-hot label matrix.

    Args:
        examples: batch mapping with a 'Content' entry (the texts) and one
            entry per name in the global `labels` list.
        max_length: length every sequence is padded / truncated to.

    Returns:
        The tokenizer's encoding with an extra "labels" entry: one list of
        floats per text, ordered like the global `labels`.
    """
    texts = examples['Content']
    # pad / truncate every text to exactly max_length tokens
    encoding = tokenizer(texts, padding="max_length", truncation=True, max_length=max_length)

    # float matrix of shape (number of texts, number of labels), filled column by column
    label_matrix = np.zeros((len(texts), len(labels)))
    for column, name in enumerate(labels):
        label_matrix[:, column] = examples[name]

    encoding["labels"] = label_matrix.tolist()
    return encoding

In [None]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

In [None]:
encoded_dataset

In [None]:
encoded_dataset['train']

In [None]:
example = encoded_dataset['train'][0]
tokenizer.decode(example['input_ids'])

In [None]:
#example['labels']

In [None]:
# Switch the dataset to PyTorch tensors first; `example` was fetched before this call,
# so it is unaffected by the format change.
encoded_dataset.set_format("torch")
#use id to label dictionary
# Kept as the last expression of the cell so the list of active labels is actually displayed.
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

### Here we define a model that consists of a pre-trained base (i.e. the weights from ProsusAI/finbert are loaded) with a randomly initialized classification head (linear layer) on top. One should fine-tune this head, together with the pre-trained base, on a labeled dataset.

### We set the problem_type to be "multi_label_classification", as this will make sure the appropriate loss function is used (namely BCEWithLogitsLoss). We also make sure the output layer has len(labels) output neurons, and we set the id2label and label2id mappings

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id,
                                                          ignore_mismatched_sizes=True )

In [None]:
batch_size = 8
metric_name = "f1"

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration handed to the Hugging Face Trainer.
args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",  # output directory for checkpoints
    evaluation_strategy = "epoch",  # run evaluation at the end of every epoch
    save_strategy = "epoch",  # checkpoint at the same cadence as evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,  # after training, reload the best checkpoint
    metric_for_best_model=metric_name  # "best" is judged by this metric name
)

We need to define a compute_metrics function that returns a dictionary with the desired metric values.

source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    

def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, micro-averaged ROC AUC and accuracy from multi-label logits.

    Args:
        predictions: raw logits of shape (batch_size, num_labels).
        labels: ground-truth multi-hot matrix with the same shape.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with the keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # sigmoid turns every logit into an independent per-label probability
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()
    # hard 0/1 decisions for the threshold-based metrics
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric, so it is scored on the probabilities;
    # feeding it the thresholded 0/1 values reduces the curve to a single operating point.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    """Score an EvalPrediction with multi_label_metrics.

    When the predictions arrive as a tuple, only its first element is used.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)
  
   


In [None]:
encoded_dataset['train'][0]['labels'].type()

In [None]:
encoded_dataset['train']['input_ids'][0]

In [None]:
# #forward pass
# #, attention_mask=attention_mask
# outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
# outputs

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer

In [None]:

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("WANDB_API_KEY")


In [None]:
import os
# Never paste the key itself into the notebook: reuse the value fetched from Kaggle Secrets
# in the previous cell (secret_value_0). The key that was hardcoded here has been exposed
# and should be revoked and regenerated in the W&B account settings.
os.environ['WANDB_API_KEY'] = secret_value_0

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,

)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

# Inference
The logits that come out of the model are of shape (batch_size, num_labels). As we are only forwarding a single sentence through the model, the batch_size equals 1.

The logits is a tensor that contains the (unnormalized) scores for every individual label.

The logits that come out of the model are of shape (batch_size, num_labels). As we are only forwarding a single sentence through the model, the batch_size equals 1.

The logits is a tensor that contains the (unnormalized) scores for every individual label.

In [None]:
def get_answer(text, threshold=0.5, max_length=512):
    """Predict the labels of a single text with the fine-tuned model.

    Prints the logits shape, the per-label probabilities and the 0/1
    prediction vector along the way.

    Args:
        text: the sentence or paragraph to classify.
        threshold: probability at or above which a label is reported.
        max_length: longer inputs are truncated to this many tokens.
            NOTE(review): 512 is the usual position limit of BERT-style
            checkpoints; confirm it for the model in use.

    Returns:
        List of label names whose probability reaches the threshold.
    """
    # truncate so that very long inputs cannot exceed the model's position limit
    encoding = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    encoding = {k: v.to(trainer.model.device) for k, v in encoding.items()}

    trainer.model.eval()  # make sure dropout is off during inference
    with torch.no_grad():  # no gradients needed for inference
        outputs = trainer.model(**encoding)
    logits = outputs.logits
    print(logits.shape)

    # apply sigmoid + threshold; squeeze only the batch axis so a
    # single-label model still yields a 1-D vector
    probs = torch.sigmoid(logits.squeeze(0).cpu())
    print(probs)
    predictions = (probs >= threshold).numpy().astype(float)
    print(predictions)
    predicted_labels = [id2label[idx] for idx, flag in enumerate(predictions) if flag == 1.0]
    print()
    return predicted_labels


In [None]:
text = "The new licensing regime for centralised virtual asset trading platforms under the Anti-MoneyLaundering and Counter-Terrorist Financing Ordinance (Cap. 615) (AMLO) will come intoeffect on 1 June 2023. Under the new regime, centralised virtual asset trading platformsoperating in Hong Kong will need to apply to the Securities and Futures Commission (SFC)for a licence under the Securities and Futures Ordinance (Cap 571) (SFO) and/or the AMLO(Dual Licence Arrangement)"
get_answer(text)

# Probability

In [None]:
# outputs = trainer.model(**encoding)
# logits = outputs.logits
# logits.shape

The logits that come out of the model are of shape (batch_size, num_labels). As we are only forwarding a single sentence through the model, the batch_size equals 1.

The logits is a tensor that contains the (unnormalized) scores for every individual label

To turn them into actual predicted labels, we first apply a sigmoid function independently to every score, such that every score is turned into a number between 0 and 1, that can be interpreted as a "probability" for how certain the model is that a given class belongs to the input text.

Next, we use a threshold (typically, 0.5) to turn every probability into either a 1 (which means, we predict the label for the given example) or a 0 (which means, we don't predict the label for the given example)

In [None]:
# # apply sigmoid + threshold
# sigmoid = torch.nn.Sigmoid()
# probs = sigmoid(logits.squeeze().cpu())
# predictions = np.zeros(probs.shape)
# predictions[np.where(probs >= 0.5)] = 1
# # turn predicted id's into actual label names
# predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
# print(predicted_labels)