In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from torch import cuda
import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# Load the project data and encode every category name as an integer class id.
df = pd.read_csv('projects_clean_selection.csv')

# Class ids follow the order in which categories first appear in the file.
possible_labels = df.category_name.unique()
label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}

# `map` is a plain dictionary lookup; `replace` with a dict is slower and relies
# on an implicit object -> int downcast that recent pandas versions deprecate.
df['label'] = df.category_name.map(label_dict)

y = df.label
# "balanced" class weights, later passed to the loss. The class ids are derived
# from the label mapping instead of a hard-coded class count.
weights = compute_class_weight(class_weight="balanced", classes=np.arange(len(label_dict)), y=y)
weights = torch.tensor(weights).to(device, dtype = torch.float)

In [6]:
# Fixed seed so the stratified split (and every number derived from it) is
# reproducible across kernel restarts.
SPLIT_SEED = 42

x_train, x_test, y_train, y_test = train_test_split(
    df.description.values,
    df.label.values,
    stratify=df.label.values,
    random_state=SPLIT_SEED,
)

# One-hot encode with an explicit width: without `num_classes` the width is
# inferred from the largest label present in each split, so the two matrices
# could end up with fewer columns than there are classes.
y_train = to_categorical(y_train, num_classes=len(label_dict))
y_test = to_categorical(y_test, num_classes=len(label_dict))

# Frames with a `text` column and a `label` column holding the one-hot row as a list.
df_train = pd.DataFrame(x_train, columns=['text'])
df_train['label'] = y_train.tolist()
df_test = pd.DataFrame(x_test, columns=['text'])
df_test['label'] = y_test.tolist()

In [None]:
# Training configuration.
MAX_LEN = 512                              # handed to the tokenizer as `max_length`
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 8, 8  # DataLoader batch sizes
EPOCHS = 1
LEARNING_RATE = 1e-5

# Tokenizer used to encode the text for the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [10]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one text row at a time.

    Args:
        dataframe: Frame exposing a ``text`` column and a ``label`` column.
            Each label must be convertible to a float tensor.
        tokenizer: Object providing ``encode_plus`` that returns ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        """Return the number of text rows."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded sample at position ``index``.

        Returns:
            dict: ``ids``, ``mask`` and ``token_type_ids`` as long tensors and
            ``targets`` as a float tensor.
        """
        # Positional access: samplers hand out positions 0..len-1, which only
        # coincide with index labels when the frame has a default RangeIndex.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            # Ask for truncation explicitly instead of relying on the
            # tokenizer's warn-and-default behaviour for over-long inputs.
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [11]:
# Wrap both splits in tokenizing datasets.
training_set = CustomDataset(df_train, tokenizer, MAX_LEN)
testing_set = CustomDataset(df_test, tokenizer, MAX_LEN)

# Training batches are reshuffled on every pass over the loader.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
# Evaluation keeps the dataset order, so collected predictions line up with the
# rows of `df_test`; shuffling here adds nothing to the metrics.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        num_classes: Number of output logits (defaults to 107).
        dropout: Dropout probability applied before the linear layer.
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, num_classes=107, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        # Attribute names l1/l2/l3 are kept so the state_dict keys do not change.
        self.l1 = transformers.BertModel.from_pretrained(pretrained_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Read the input width from the encoder config (768 for bert-base-uncased)
        # instead of hard-coding it.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_classes)

    def forward(self, ids, mask, token_type_ids):
        """Return raw, unnormalized logits with ``num_classes`` entries per sample."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used (the pooled output, per the Hugging Face BertModel docs).
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)

        return output

# Build the classifier and move its parameters to the selected device.
model = BERTClass()
model.to(device)

In [13]:
def loss_fn(outputs, targets):
    """Class-weighted cross-entropy between ``outputs`` and ``targets``.

    The per-class weights come from the module-level ``weights`` tensor.
    """
    criterion = torch.nn.CrossEntropyLoss(weight=weights)
    return criterion(outputs, targets)
# Adam over all model parameters, stepped at LEARNING_RATE.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    """Run one optimisation pass over ``training_loader``.

    Args:
        epoch: Index of the current epoch. Not used by the body; kept so
            existing callers keep working.

    Returns:
        Sum over all batches of (batch loss * number of samples in the batch).
        The caller divides it by the sample count to get a per-sample average.
    """
    loss_total_train = 0
    model.train()
    for data in tqdm.tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        # Clear gradients left over from the previous step (once is enough).
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        # Scale by the real batch size: the final batch may hold fewer than
        # TRAIN_BATCH_SIZE samples, which would otherwise inflate the total.
        loss_total_train += loss.item() * targets.size(0)

        loss.backward()
        optimizer.step()
    return loss_total_train

def validation(epoch):
    """Evaluate the model on ``testing_loader`` without computing gradients.

    Args:
        epoch: Index of the current epoch. Not used by the body; kept so
            existing callers keep working.

    Returns:
        tuple: ``(fin_outputs, fin_targets, loss_total_test)``.
        ``fin_outputs`` holds one list of softmax probabilities per sample and
        ``fin_targets`` holds the matching target rows. ``loss_total_test`` is
        the sum over batches of (batch loss * number of samples in the batch).
    """
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    loss_total_test = 0
    with torch.no_grad():
        for data in tqdm.tqdm(testing_loader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            # Store probabilities (softmax over the class dimension), not raw logits.
            fin_outputs.extend(torch.softmax(outputs, dim = 1).cpu().detach().numpy().tolist())
            loss = loss_fn(outputs, targets)
            # Scale by the real batch size: the final batch may hold fewer than
            # VALID_BATCH_SIZE samples, which would otherwise inflate the total.
            loss_total_test += loss.item() * targets.size(0)

    return fin_outputs, fin_targets, loss_total_test

# Per-epoch history, keyed by epoch index.
loss_dic_train, loss_dic_test = {}, {}
f1_score_micro_dict, f1_score_macro_dict, f1_score_classes_dict = {}, {}, {}

for epoch in range(EPOCHS):
    loss_total_train = train(epoch)
    outputs, targets, loss_total_test = validation(epoch)

    # Turn the accumulated totals into per-sample averages.
    loss_epoch_train = loss_total_train / y_train.shape[0]
    loss_dic_train[epoch] = loss_epoch_train
    loss_epoch_test = loss_total_test / y_test.shape[0]
    loss_dic_test[epoch] = loss_epoch_test

    outputs = torch.Tensor(outputs)
    targets = torch.Tensor(targets)
    # One-hot prediction per sample: a 1.0 at the arg-max probability, 0 elsewhere.
    max_position = outputs.argmax(dim=1)
    outputs_fin = torch.zeros_like(outputs).scatter(1, max_position[:, None], 1.0)

    accuracy = metrics.accuracy_score(targets, outputs_fin)
    f1_score_micro, f1_score_macro, f1_score_classes = (
        metrics.f1_score(targets, outputs_fin, average=avg)
        for avg in ('micro', 'macro', None)
    )

    f1_score_micro_dict[epoch] = f1_score_micro
    f1_score_macro_dict[epoch] = f1_score_macro
    f1_score_classes_dict[epoch] = f1_score_classes

    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"F1 Score (Classes) = {f1_score_classes}")