In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
import torch.nn as nn
import json
import tqdm.notebook as tq
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 3080


In [None]:
path = "./data/raw/train_for_student.json"
df = pd.read_json(path, orient='index')

In [4]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 454 entries, 1 to 454
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     454 non-null    object
 1   Abstract  454 non-null    object
 2   Classes   454 non-null    object
dtypes: object(3)
memory usage: 14.2+ KB


Unnamed: 0,Title,Abstract,Classes
1,Activated carbon derived from bacterial cellul...,© 2019 Elsevier B.V.Activated carbon derived f...,"[CHE, MATENG]"
2,The algorithm of static hand gesture recogniti...,© Springer International Publishing AG 2018.Te...,[CPE]
3,Alternative Redundant Residue Number System Co...,© 2018 IEEE.Residue number system (RNS) is a n...,[EE]
4,Comparative study of wax inhibitor performance...,© Published under licence by IOP Publishing Lt...,"[PE, ME, CHE]"
5,Undrained lower bound solutions for end bearin...,"© 2019 John Wiley & Sons, Ltd.The undrained be...","[CE, MATSCI]"


In [5]:
df['Title'][1]

'Activated carbon derived from bacterial cellulose and its use as catalyst support for ethanol conversion to ethylene'

In [6]:
df['Abstract'][1]

'© 2019 Elsevier B.V.Activated carbon derived from bacterial cellulose (BC-AC) was modified with various amounts of H3PO4(x wt% P/BC-AC) and used as a catalyst for the selective dehydration of ethanol to ethylene. The BC-AC obtained at a carbonization temperature of 500 °C had a mesoporous structure with surface area and total pore volume of ~1730 m2/g and 1.0 cm3/g, respectively. An increase in the H3PO4 loading from 5% to 40% increased the number of weak acid sites on the catalyst surface, which consequently enhanced ethanol conversion. At the reaction temperature of 400 °C, the modified BC-AC with 30-40 wt% H3PO4 loading (P/BC-AC) gave an ethanol conversion at 100% and an ethylene selectivity of 100%. A high selectivity for diethyl ether (DEE) at ~ 67% at ethanol conversion of ~ 50% was obtained at 200 °C. Stability tests with a time-on-stream of 12 h, at reaction temperatures of 200 and 400 °C, showed that the P/BC-AC catalyst had high thermal stability and stable catalytic activit

In [7]:
df["Combined"] = df["Title"] + ". " + df["Abstract"]
df = df.drop(columns=["Abstract", "Title"], axis=1)

In [8]:
df.head()

Unnamed: 0,Classes,Combined
1,"[CHE, MATENG]",Activated carbon derived from bacterial cellul...
2,[CPE],The algorithm of static hand gesture recogniti...
3,[EE],Alternative Redundant Residue Number System Co...
4,"[PE, ME, CHE]",Comparative study of wax inhibitor performance...
5,"[CE, MATSCI]",Undrained lower bound solutions for end bearin...


In [9]:
df['Combined'][1]

'Activated carbon derived from bacterial cellulose and its use as catalyst support for ethanol conversion to ethylene. © 2019 Elsevier B.V.Activated carbon derived from bacterial cellulose (BC-AC) was modified with various amounts of H3PO4(x wt% P/BC-AC) and used as a catalyst for the selective dehydration of ethanol to ethylene. The BC-AC obtained at a carbonization temperature of 500 °C had a mesoporous structure with surface area and total pore volume of ~1730 m2/g and 1.0 cm3/g, respectively. An increase in the H3PO4 loading from 5% to 40% increased the number of weak acid sites on the catalyst surface, which consequently enhanced ethanol conversion. At the reaction temperature of 400 °C, the modified BC-AC with 30-40 wt% H3PO4 loading (P/BC-AC) gave an ethanol conversion at 100% and an ethylene selectivity of 100%. A high selectivity for diethyl ether (DEE) at ~ 67% at ethanol conversion of ~ 50% was obtained at 200 °C. Stability tests with a time-on-stream of 12 h, at reaction te

In [10]:
import nltk
import re

nltk.download("stopwords")
from nltk.corpus import stopwords

def text_preprocessing(s):
    """
    - Lowercase the sentence
    - Change "'t" to "not"
    - Remove "@name"
    - Isolate and remove punctuations except "?"
    - Remove other special characters
    - Remove stop words except "not" and "can"
    - Remove trailing whitespace
    - Remove digits
    - Remove word with length <= 2
    """
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    s = " ".join([word for word in s.split()
                if word not in stopwords.words('english')
                or word in ['not', 'can']])
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()
    # Remove digits
    s = re.sub(r'\d+', '', s)
    # Remove word with length <= 2
    s = re.sub(r'\b\w{1,2}\b', '', s)

    return s

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iHC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
df['Combined'] = df['Combined'].apply(text_preprocessing)

In [12]:
df

Unnamed: 0,Classes,Combined
1,"[CHE, MATENG]",activated carbon derived bacterial cellulose u...
2,[CPE],algorithm static hand gesture recognition usin...
3,[EE],alternative redundant residue number system co...
4,"[PE, ME, CHE]",comparative study wax inhibitor performance po...
5,"[CE, MATSCI]",undrained lower bound solutions end bearing ca...
...,...,...
450,"[CPE, CHE]",portable usb controlled potentiostat paper bas...
451,"[CPE, EDU]",literature reviews applying artificial intelli...
452,"[ENV, EE, CHE]",multi parameterized water quality prediction m...
453,"[EE, CPE, OPTIC, EDU]",semantic segmentation medium resolution satell...


In [13]:
df['Combined'][1]

'activated carbon derived bacterial cellulose use catalyst support ethanol conversion ethylene  elsevier   activated carbon derived bacterial cellulose   modified various amounts hpo      used catalyst selective dehydration ethanol ethylene   obtained carbonization temperature   mesoporous structure surface area total pore volume        respectively increase hpo loading   increased number weak acid sites catalyst surface consequently enhanced ethanol conversion reaction temperature   modified      hpo loading    gave ethanol conversion  ethylene selectivity  high selectivity diethyl ether dee  ethanol conversion  obtained   stability tests time stream   reaction temperatures    showed    catalyst high thermal stability stable catalytic activity therefore    found effective inexpensive environmentally friendly catalyst ethylene production via ethanol dehydration'

In [None]:
df.to_json('./data/processed/preprocessed.json', orient='records')

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel = MultiLabelBinarizer()
labels = multilabel.fit_transform(df['Classes']).astype('float32')
texts = df['Combined'].to_list()

In [None]:
labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
texts

['activated carbon derived bacterial cellulose use catalyst support ethanol conversion ethylene  elsevier   activated carbon derived bacterial cellulose   modified various amounts hpo      used catalyst selective dehydration ethanol ethylene   obtained carbonization temperature   mesoporous structure surface area total pore volume        respectively increase hpo loading   increased number weak acid sites catalyst surface consequently enhanced ethanol conversion reaction temperature   modified      hpo loading    gave ethanol conversion  ethylene selectivity  high selectivity diethyl ether dee  ethanol conversion  obtained   stability tests time stream   reaction temperatures    showed    catalyst high thermal stability stable catalytic activity therefore    found effective inexpensive environmentally friendly catalyst ethylene production via ethanol dehydration',
 'algorithm static hand gesture recognition using rule based classification springer international publishing   technology 

In [None]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [None]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", problem_type="multi_label_classification", num_labels=18)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
MAX_LEN = 512

In [None]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = torch.tensor(self.labels[idx])

        encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
        }

In [None]:
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, MAX_LEN)

In [None]:
from sklearn.metrics import f1_score
from transformers import EvalPrediction

def compute_f1(p:EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(preds))

    y_pred = np.zeros(probs.shape)
    y_pred[probs>=0.3] = 1

    f1 = f1_score(p.label_ids, y_pred, average = 'macro')

    return {"f1": f1}

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import ParameterGrid

hyperparameter_grid = {
    'learning_rate': [5e-5, 7e-5, 1e-10],
    'num_train_epochs': [3],
    'per_device_train_batch_size': [8]
}

# Perform grid search
best_f1 = 0
best_hyperparameters = None

for params in ParameterGrid(hyperparameter_grid):
    print("Training with hyperparameters:", params)

    args = TrainingArguments(
        output_dir='./results',
        seed=42,
        **params
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_f1
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    f1 = trainer.evaluate()['eval_f1']
    print("f1:", f1)

    # Update best f1 and hyperparameters
    if f1 > best_f1:
        best_f1 = f1
        best_hyperparameters = params

print("Best hyperparameters:", best_hyperparameters)
print("Best f1:", best_f1)


Training with hyperparameters: {'learning_rate': 5e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8}


  0%|          | 0/138 [00:00<?, ?it/s]

{'train_runtime': 38.0528, 'train_samples_per_second': 28.618, 'train_steps_per_second': 3.627, 'train_loss': 0.003962676162305085, 'epoch': 3.0}


  0%|          | 0/12 [00:00<?, ?it/s]

f1: 0.5338142484083398
Training with hyperparameters: {'learning_rate': 7e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8}


  0%|          | 0/138 [00:00<?, ?it/s]

{'train_runtime': 38.8244, 'train_samples_per_second': 28.049, 'train_steps_per_second': 3.554, 'train_loss': 0.004616742980652961, 'epoch': 3.0}


  0%|          | 0/12 [00:00<?, ?it/s]

f1: 0.5460399868397003
Training with hyperparameters: {'learning_rate': 1e-10, 'num_train_epochs': 3, 'per_device_train_batch_size': 8}


  0%|          | 0/138 [00:00<?, ?it/s]

{'train_runtime': 37.9466, 'train_samples_per_second': 28.698, 'train_steps_per_second': 3.637, 'train_loss': 0.0029116093680478525, 'epoch': 3.0}


  0%|          | 0/12 [00:00<?, ?it/s]

f1: 0.5460399868397003
Best hyperparameters: {'learning_rate': 7e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8}
Best f1: 0.5460399868397003


In [None]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='./output',
    num_train_epochs=20,
    learning_rate=7e-05,
    seed=42
)

trainer = Trainer(model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_f1)

In [None]:
trainer.train()

  0%|          | 0/920 [00:00<?, ?it/s]

Checkpoint destination directory ./results\checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.012, 'grad_norm': 0.030342604964971542, 'learning_rate': 3.195652173913043e-05, 'epoch': 10.87}
{'train_runtime': 235.3356, 'train_samples_per_second': 30.85, 'train_steps_per_second': 3.909, 'train_loss': 0.008801759844240935, 'epoch': 20.0}


TrainOutput(global_step=920, training_loss=0.008801759844240935, metrics={'train_runtime': 235.3356, 'train_samples_per_second': 30.85, 'train_steps_per_second': 3.909, 'train_loss': 0.008801759844240935, 'epoch': 20.0})

In [None]:
trainer.evaluate()

  0%|          | 0/12 [00:00<?, ?it/s]

{'eval_loss': 0.44589316844940186,
 'eval_f1': 0.5623423482942462,
 'eval_runtime': 1.1423,
 'eval_samples_per_second': 79.663,
 'eval_steps_per_second': 10.505,
 'epoch': 20.0}

In [None]:
trainer.save_model("./outputs/models/bert_model")

In [None]:
def predict(text):
    text = text_preprocessing(text)
    encoding = tokenizer(text, return_tensors='pt')
    encoding.to(trainer.model.device)

    outputs = trainer.model(**encoding)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(outputs.logits[0].cpu())
    preds = np.zeros(probs.shape)
    preds[np.where(probs>=0.32)] = 1


    return set(multilabel.inverse_transform(preds.reshape(1,-1))[0])

In [None]:
df

Unnamed: 0,Classes,Combined
1,"[CHE, MATENG]",activated carbon derived bacterial cellulose u...
2,[CPE],algorithm static hand gesture recognition usin...
3,[EE],alternative redundant residue number system co...
4,"[PE, ME, CHE]",comparative study wax inhibitor performance po...
5,"[CE, MATSCI]",undrained lower bound solutions end bearing ca...
...,...,...
450,"[CPE, CHE]",portable usb controlled potentiostat paper bas...
451,"[CPE, EDU]",literature reviews applying artificial intelli...
452,"[ENV, EE, CHE]",multi parameterized water quality prediction m...
453,"[EE, CPE, OPTIC, EDU]",semantic segmentation medium resolution satell...


In [None]:
print(predict(df['Combined'][1]))

{'MATENG', 'CHE'}


In [None]:
df2 = pd.read_json("test_for_student.json", orient='index')
df2["Combined"] = df2["Title"] + ". " + df2["Abstract"]
df2 = df2.drop(columns=["Abstract", "Title"], axis=1)

In [None]:
df2

Unnamed: 0,Combined
001eval,Comparative Electrical Energy Yield Performanc...
002eval,Effects of graphene nanoplatelets on bio-based...
003eval,Anti-inflammatory action of two novel peptides...
004eval,Efficient all-and-one support vector machines ...
005eval,Driver identification using histogram and neur...
...,...
147eval,Utilization of Sewage Sludge from Beverage Ind...
148eval,Development of a Gateway for OpenADR-ECHONET L...
149eval,Effect of solution treatment and precipitation...
150eval,An effect-analysis method for species-dependen...


In [None]:
l = ['CE', 'ENV', 'BME', 'PE', 'METAL', 'ME', 'EE', 'CPE', 'OPTIC', 'NANO', 'CHE', 'MATENG', 'AGRI', 'EDU', 'IE', 'SAFETY', 'MATH', 'MATSCI']
t = ",".join(l)

In [None]:
print(l)
print(t)

['CE', 'ENV', 'BME', 'PE', 'METAL', 'ME', 'EE', 'CPE', 'OPTIC', 'NANO', 'CHE', 'MATENG', 'AGRI', 'EDU', 'IE', 'SAFETY', 'MATH', 'MATSCI']
CE,ENV,BME,PE,METAL,ME,EE,CPE,OPTIC,NANO,CHE,MATENG,AGRI,EDU,IE,SAFETY,MATH,MATSCI


In [None]:
file_path = "./outputs/predictions/predictions.csv"

# write file
with open(file_path, 'w') as f:
    f.write("id, " + t + "\n")
    for index, row in df2.iterrows():
        text = index
        out = predict(row["Combined"])
        for i in l:
            if i in out:
                text += ",1"
            else:
                text += ",0"
        text = text
        f.write(text + "\n")

f.close()
