In [None]:
# Mount Google Drive at /content/gdrive; the checkpoint and best-model paths
# used later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

# Importing libraries

In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel (a plain `!pip` shell call may target a different Python).
%pip install -q transformers

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import random
import shutil
import sys
from sklearn.model_selection import train_test_split

# Setting seed for reproducibility

In [None]:
def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: integer passed unchanged to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Deterministic kernels on, auto-tuner off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
set_seed(43)

# Downloading dataset

In [None]:
from requests import get as rget

# Update URL to raw CSV file
url = "https://raw.githubusercontent.com/Fal186/Mapping-web3/refs/heads/main/dataset/web3_Stage1_industry_domain.csv"

# Bound the wait on a stalled connection and fail loudly on an HTTP error
# (e.g. 404) instead of writing an error page to disk and parsing it as CSV.
res = rget(url, timeout=30)
res.raise_for_status()

# Write-only binary mode is sufficient; the file is re-opened by pandas below.
with open('file.csv', 'wb') as f:
    f.write(res.content)

data_df = pd.read_csv('file.csv')

In [None]:
data_df.info()

In [None]:
data_df.columns

# Selecting required columns

In [None]:
# Keep the raw text column plus the five label columns.
# NOTE: `train_df` is rebound by the train/validation/test split further down,
# which splits `data_df` directly.
train_df = data_df[[
    'text',
    'Core Infrastructure & Protocols',
    'Decentralized Finance (DeFi) & Financial Applications',
    'Digital Assets & Collectibles (NFTs)',
    'Decentralized Applications (DApps) & Gaming',
    'DAO & Governance Mechanisms',
]]

In [None]:
# Label columns, in the order the classifier's output units are interpreted.
target_list = [
    'Core Infrastructure & Protocols',
    'Decentralized Finance (DeFi) & Financial Applications',
    'Digital Assets & Collectibles (NFTs)',
    'Decentralized Applications (DApps) & Gaming',
    'DAO & Governance Mechanisms',
]

# Preparing the tokenizer

In [None]:
# Set the maximum sequence length; 512 tokens is the upper limit for BERT
MAX_LEN = 512

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
#download the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text row at a time.

    Args:
        df: DataFrame with a 'text' column and one column per label.
        tokenizer: object exposing ``encode_plus`` (a BertTokenizer in this
            notebook).
        max_len: every sequence is padded/truncated to this many tokens.
        target_cols: names of the label columns. Defaults to the
            notebook-level ``target_list`` when omitted.
    """

    def __init__(self, df, tokenizer, max_len, target_cols=None):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['text']
        if target_cols is None:
            target_cols = target_list
        self.targets = self.df[list(target_cols)].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        """Return the tokenized text and float label vector for row `index`."""
        # Positional lookup: a DataLoader hands out 0..len-1, which only
        # matches the Series labels when the frame has a fresh RangeIndex.
        # `.iloc` keeps this correct for any index (e.g. right after a split).
        title = str(self.title.iloc[index])
        # Collapse runs of whitespace into single spaces.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer returns (1, max_len) tensors; flatten drops the
        # leading dimension so the DataLoader can stack items into a batch.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

# Splitting & Tokenizing Dataset

In [None]:
# 80/10/10 split: hold out 20% first, then halve the hold-out into
# validation and test.
train_df, temp_df = train_test_split(data_df, test_size=0.2, random_state=43)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=43)

# Give every split a fresh 0..n-1 index.
train_df, val_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, val_df, test_df)
)

## Checking your device


In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
device

## Adding Class Weights to adjust class imbalance

In [None]:
# Calculate class weights based on the label distribution in the training data
label_counts = train_df[target_list].sum(axis=0)
total_counts = label_counts.sum()
# A label with no positive example in the training split would otherwise
# divide by zero and yield an infinite weight; treat it as one occurrence.
safe_label_counts = label_counts.clip(lower=1)
# Inverse-frequency weights: rarer labels get proportionally larger weights.
# NOTE(review): these are later passed as `pos_weight`, which PyTorch documents
# as the negative/positive ratio per class — confirm this heuristic is intended.
class_weights = total_counts / (len(target_list) * safe_label_counts)
class_weights_tensor = torch.tensor(class_weights.values, dtype=torch.float).to(device)

In [None]:
# Print class weights
print("Class Weights Distribution:")
for label, weight in zip(target_list, class_weights):
    print(f"{label}: {weight:.4f}")

In [None]:
# Define the loss function with class weights
criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights_tensor)



## Label Distribution

In [None]:
# Number of positive examples per label in each split
train_counts = train_df[target_list].sum(axis=0)
val_counts = val_df[target_list].sum(axis=0)
test_counts = test_df[target_list].sum(axis=0)

print("Label distribution in the training set:\n", train_counts)
print("\nLabel distribution in the validation set:\n", val_counts)
print("\nLabel distribution in the test set:\n", test_counts)

In [None]:
# Leakage check: texts that appear in more than one split.
train_texts = set(train_df['text'])
val_texts = set(val_df['text'])
test_texts = set(test_df['text'])

overlap = (
    (train_texts & val_texts)
    | (train_texts & test_texts)
    | (val_texts & test_texts)
)
print(f"Number of overlapping samples across splits: {len(overlap)}")


In [None]:
# Share of rows (in percent) carrying each label, per split
train_counts_percentage = (train_df[target_list].sum(axis=0) / len(train_df)) * 100
val_counts_percentage = (val_df[target_list].sum(axis=0) / len(val_df)) * 100
test_counts_percentage = (test_df[target_list].sum(axis=0) / len(test_df)) * 100

print("Label distribution in the training set:\n", train_counts_percentage)
print("\nLabel distribution in the validation set:\n", val_counts_percentage)
print("\nLabel distribution in the test set:\n", test_counts_percentage)

In [None]:
train_df.shape

In [None]:
val_df.shape

In [None]:
val_df

In [None]:
test_df

In [None]:
# Create the CustomDataset for each set
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)

In [None]:
len(train_dataset)

# Setting hyperparameters

In [None]:
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 3e-5

In [None]:
# Preparing the DataLoaders
# Training batches are reshuffled every epoch so the model does not see the
# samples in the same fixed order each time.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

# Validation order does not affect the averaged loss, so keep it fixed.
val_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

# Additional functions for loading and saving checkpoints

In [None]:
def load_ckp(checkpoint_fpath, model, optimizer, map_location=None):
    """Restore model and optimizer state from a full training checkpoint.

    Args:
        checkpoint_fpath: path of the checkpoint file to read. It must hold a
            dict with the keys 'state_dict', 'optimizer', 'epoch' and
            'valid_loss_min'.
        model: model that receives the saved parameters (modified in place).
        optimizer: optimizer that receives the saved state (modified in place).
        map_location: forwarded to ``torch.load``; pass e.g. ``'cpu'`` to open
            a checkpoint written on a GPU from a CPU-only session. ``None``
            keeps ``torch.load``'s default behaviour.

    Returns:
        Tuple ``(model, optimizer, epoch, valid_loss_min)`` where the last two
        values are read from the checkpoint.
    """
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write a checkpoint and, for a best model, duplicate the file.

    NOTE: a later cell in this notebook defines another `save_ckp` with the
    same signature; once that cell runs it replaces this definition.

    Args:
        state: checkpoint object to serialize with ``torch.save``.
        is_best: when true, the written file is also copied to
            `best_model_path`.
        checkpoint_path: destination of the checkpoint file.
        best_model_path: destination of the copy made for a best model.
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # Byte-for-byte copy of the checkpoint that was just written.
    shutil.copyfile(checkpoint_path, best_model_path)

# Training the Model

Defining and Initializing the BERT Classification Model

In [None]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear multilabel head.

    Args:
        num_labels: number of output logits (one per label).
        dropout: dropout probability applied to the pooled output.
        pretrained_name: checkpoint identifier passed to
            ``BertModel.from_pretrained``.
    """

    def __init__(self, num_labels=5, dropout=0.3,
                 pretrained_name='google-bert/bert-base-uncased'):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(pretrained_name, return_dict=True)
        self.dropout = torch.nn.Dropout(dropout)
        # Read the encoder width from its config instead of hard-coding 768,
        # so a different BERT size can be swapped in without editing the head.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return raw logits (no sigmoid applied) for a batch of inputs."""
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()
model.to(device)

Setting Up the Loss Function and Optimizer

In [None]:
def loss_fn(outputs, targets):
    """Unweighted binary cross-entropy computed on raw logits.

    Args:
        outputs: logits produced by the model.
        targets: float tensor of 0/1 labels with the same shape as `outputs`.

    Returns:
        Scalar tensor (default 'mean' reduction).
    """
    bce = torch.nn.BCEWithLogitsLoss()
    return bce(outputs, targets)

optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)

Initialization of Validation Target and Output Lists

In [None]:
val_targets=[]
val_outputs=[]

Training and Validation Loop with Early Stopping

In [None]:
import os

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write the full checkpoint and, for a best model, its weights alone.

    Args:
        state: dict holding at least the key 'state_dict'.
        is_best: when true, ``state['state_dict']`` is additionally saved to
            `best_model_path`.
        checkpoint_path: destination of the full checkpoint.
        best_model_path: destination of the weights-only file.
    """
    # Save checkpoint
    torch.save(state, checkpoint_path)
    # If it is the best model, save it separately
    if is_best:
        torch.save(state['state_dict'], best_model_path)
        print(f"Best model saved to {best_model_path}")


def _move_batch(data, device):
    """Move one DataLoader batch to `device` with the dtypes the model expects."""
    ids = data['input_ids'].to(device, dtype=torch.long)
    mask = data['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
    targets = data['targets'].to(device, dtype=torch.float)
    return ids, mask, token_type_ids, targets


def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path, patience,
                train_criterion=None, valid_criterion=None, device=None):
    """Fine-tune `model` with early stopping on the validation loss.

    Args:
        n_epochs: maximum number of epochs.
        training_loader: iterable of batches (dicts with 'input_ids',
            'attention_mask', 'token_type_ids', 'targets').
        validation_loader: iterable of batches in the same format.
        model: module called as ``model(ids, mask, token_type_ids)``.
        optimizer: optimizer bound to the model's parameters.
        checkpoint_path: where the full checkpoint is written.
        best_model_path: where the best weights are written.
        patience: stop after this many consecutive epochs without improvement.
        train_criterion: loss used for training; defaults to the notebook-level
            ``criterion``.
        valid_criterion: loss used for validation; defaults to the
            notebook-level ``loss_fn``.
        device: device batches are moved to; defaults to the device of the
            model's first parameter.

    Returns:
        The trained model (the same object that was passed in).
    """
    if train_criterion is None:
        train_criterion = criterion
    if valid_criterion is None:
        valid_criterion = loss_fn
    if device is None:
        device = next(model.parameters()).device

    # Initialize tracker for minimum validation loss
    valid_loss_min = np.inf
    no_improve = 0

    for epoch in range(1, n_epochs + 1):
        train_loss = 0.0
        valid_loss = 0.0

        model.train()
        print(f'############# Epoch {epoch}: Training Start   #############')
        for batch_idx, data in enumerate(training_loader):
            ids, mask, token_type_ids, targets = _move_batch(data, device)

            outputs = model(ids, mask, token_type_ids)

            optimizer.zero_grad()
            loss = train_criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Incremental mean: after the last batch this already IS the
            # average batch loss, so it must not be divided again.
            train_loss += (loss.item() - train_loss) / (batch_idx + 1)

        print(f'############# Epoch {epoch}: Training End     #############')

        print(f'############# Epoch {epoch}: Validation Start   #############')
        model.eval()
        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader, 0):
                ids, mask, token_type_ids, targets = _move_batch(data, device)
                outputs = model(ids, mask, token_type_ids)

                loss = valid_criterion(outputs, targets)
                valid_loss += (loss.item() - valid_loss) / (batch_idx + 1)

        print(f'Epoch: {epoch} \tAverage Training Loss: {train_loss:.6f} \tAverage Validation Loss: {valid_loss:.6f}')

        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        # Save the model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print(f'Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ...')
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss
            no_improve = 0
        else:
            no_improve += 1
            print(f"No improvement in validation loss for {no_improve} epoch(s).")

        # Early stopping
        if no_improve >= patience:
            print("Early stopping due to no improvement in validation loss.")
            break

    return model


In [None]:
import os

# Ensure directories exist
os.makedirs("/content/gdrive/MyDrive/ckpt_path", exist_ok=True)
os.makedirs("/content/gdrive/MyDrive/Best_Model", exist_ok=True)

# Define paths for checkpoint and best model.
# The full training checkpoint goes under ckpt_path/ and the best weights under
# Best_Model/, so each file sits in the directory named after its role.
# NOTE: files written by earlier runs with the two locations exchanged hold the
# other kind of content; retrain or move them before loading.
ckpt_path = "/content/gdrive/MyDrive/ckpt_path/ckpth(32-3e-5-stage1-BERT-class-weightsv2).pth"
best_model_path = "/content/gdrive/MyDrive/Best_Model/best_model(32-3e-5-stage1-BERT-class-weightsv2).pth"



# Start Train

In [None]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path, patience=2)

In [None]:
# The best-model file holds only the model's state_dict, so it is loaded
# directly rather than through load_ckp. `map_location` lets weights saved on a
# GPU be opened in a CPU-only session as well.
model.load_state_dict(torch.load(best_model_path, map_location=device))

# Print a message indicating successful loading
print(f'Loaded model state_dict from {best_model_path}')

# The validation loss (valid_loss_min) and the next epoch number are stored in
# the full checkpoint file (ckpt_path), not in the weights-only file.
checkpoint = torch.load(ckpt_path, map_location=device)
start_epoch = checkpoint['epoch']
valid_loss_min = checkpoint['valid_loss_min']

print(f'The validation loss of the best saved model is: {valid_loss_min}')

# Test

In [None]:
# Process new dataset
#new_df =pd.read_excel("/content/combined_dataset_TEST_NEW.xlsx")
#new_dataset = CustomDataset(new_df, tokenizer, MAX_LEN)
new_dataset = test_dataset

# Create DataLoader
new_data_loader = torch.utils.data.DataLoader(new_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

# Load the model (map_location keeps this working on a CPU-only runtime)
model.load_state_dict(torch.load(best_model_path, map_location=device))
checkpoint = torch.load(ckpt_path, map_location=device)
start_epoch = checkpoint['epoch']
valid_loss_min = checkpoint['valid_loss_min']

# Switch model to the evaluation mode
model.eval()

new_outputs = []
new_targets = []
test_loss = 0.0

# Unweighted BCE for reporting. A dedicated name is used so the notebook-level
# `loss_fn` function, which train_model relies on, is not rebound here.
test_criterion = torch.nn.BCEWithLogitsLoss()

# Pass new data through the model
with torch.no_grad():
    for batch_idx, data in enumerate(new_data_loader):
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        # Weight each batch's mean loss by its size so a smaller final batch
        # does not skew the per-sample average computed below.
        loss = test_criterion(outputs, targets)
        test_loss += loss.item() * data['input_ids'].size(0)

        new_outputs.extend(torch.sigmoid(outputs).cpu().tolist())
        new_targets.extend(targets.cpu().tolist())

# Average the test loss over all samples
test_loss = test_loss / len(new_data_loader.dataset)

print(f'Test Loss: {test_loss:.6f}')

In [None]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

# Convert the outputs and targets to numpy arrays
new_outputs_np = np.array(new_outputs)
new_targets_np = np.array(new_targets)

# Threshold the outputs (This depends on your requirements, 0.5 is used as an example)
new_outputs_bin = (new_outputs_np > 0.5)

# Per-label report. `target_names` prints the label names instead of column
# indices; `zero_division=0` scores a label with no predicted/true positives as
# 0 without emitting an UndefinedMetricWarning.
print(classification_report(new_targets_np, new_outputs_bin,
                            target_names=target_list, zero_division=0))

# Calculate macro and micro metrics
scorers = {'precision': precision_score, 'recall': recall_score, 'f1': f1_score}
scores = {
    (metric, average): scorer(new_targets_np, new_outputs_bin,
                              average=average, zero_division=0)
    for metric, scorer in scorers.items()
    for average in ('macro', 'micro')
}

precision_macro = scores[('precision', 'macro')]
recall_macro = scores[('recall', 'macro')]
f1_macro = scores[('f1', 'macro')]

precision_micro = scores[('precision', 'micro')]
recall_micro = scores[('recall', 'micro')]
f1_micro = scores[('f1', 'micro')]

print(f'Macro Precision: {precision_macro} Macro Recall: {recall_macro} Macro F1: {f1_macro}')
print(f'Micro Precision: {precision_micro} Micro Recall: {recall_micro} Micro F1: {f1_micro}')

In [None]:
from sklearn.metrics import accuracy_score

# Calculate accuracy
# With 2-D multilabel indicator arrays, scikit-learn's accuracy_score is subset
# (exact-match) accuracy: a sample counts as correct only when every label
# matches, so this is stricter than the per-label scores reported above.
accuracy = accuracy_score(new_targets_np, new_outputs_bin)

print(f'Accuracy: {accuracy}')

# Test with New Input Text

In [None]:
def classify_text(model, text, tokenizer, max_len, threshold=0.5, class_labels=None):
    """Predict the labels of a single text.

    Args:
        model: module called as ``model(input_ids, attention_mask, token_type_ids)``
            and returning logits of shape (1, number of labels).
        text: the string to classify.
        tokenizer: object exposing ``encode_plus``.
        max_len: the text is padded/truncated to this many tokens.
        threshold: a label is predicted when its probability is strictly
            greater than this value.
        class_labels: label names in the order of the model outputs. Defaults
            to the five labels used in this notebook.

    Returns:
        Tuple ``(probabilities, predicted_labels)``: a nested list with one row
        of sigmoid probabilities, and the names of the labels above threshold.
    """
    if class_labels is None:
        # Same order as the columns the model was trained on.
        class_labels = ['Core Infrastructure & Protocols', 'Decentralized Finance (DeFi) & Financial Applications', 'Digital Assets & Collectibles (NFTs)', 'Decentralized Applications (DApps) & Gaming', 'DAO & Governance Mechanisms']

    # Prepare the text
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Send the inputs to wherever the model lives; fall back to the
    # notebook-level `device` for a model without parameters.
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = device

    input_ids = inputs['input_ids'].to(target_device)
    attention_mask = inputs['attention_mask'].to(target_device)
    token_type_ids = inputs["token_type_ids"].to(target_device)

    # Dropout must be disabled for inference. Switch to eval mode for the
    # forward pass and restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask, token_type_ids)
    finally:
        model.train(was_training)

    # Convert to probabilities
    probabilities = torch.sigmoid(outputs).cpu().tolist()

    # Convert the probabilities to labels
    predicted_labels = [class_labels[i] for i, prob in enumerate(probabilities[0]) if prob > threshold]

    return probabilities, predicted_labels


In [None]:
text = ""
probabilities, predicted_labels = classify_text(model, text, tokenizer, MAX_LEN)
print("Probabilities:", probabilities)
print("Predicted labels:", predicted_labels)

In [None]:
text = ""
probabilities, predicted_labels = classify_text(model, text, tokenizer, MAX_LEN)
print("Probabilities:", probabilities)
print("Predicted labels:", predicted_labels)

In [None]:
text = ""
probabilities, predicted_labels = classify_text(model, text, tokenizer, MAX_LEN)
print("Probabilities:", probabilities)
print("Predicted labels:", predicted_labels)