In [10]:
import torch
from transformers import BertTokenizer


# Notebook-wide configuration; later cells read these names directly.
model = "base" # BERT variant: "base"; any other value selects the large settings below
MAX_LEN = 128 # maximum token length passed to the BERT tokenizer (inputs are padded/truncated to this)
PRE_TRAINED_MODEL = "bert-base-uncased" if model =='base' else "bert-large-uncased" # name of the pretrained BERT checkpoint
NUM_LABELS = 23 # number of labels to predict; NOTE(review): presumably must equal the class count of the fitted label encoder - confirm
MODEL_PATH = "bert_model.pth.tar" # path the trained model is saved to
ENCODER_PATH = "encoder.pkl" # path for the saved label encoder
LEARNING_RATE = 1e-04 if model == "base" else 1e-05 # learning rate used for training
EPOCHS = 5 # number of training epochs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use the GPU when available, otherwise the CPU
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL) # tokenizer that converts text to numerical token ids

In [11]:
import pandas as pd 
import pickle 
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# drop empty-string placeholders from a list of labels
def remove_empty(text):
    """Return the entries of ``text`` that are not the empty string.

    Args:
        text: iterable of label strings for one row.

    Returns:
        A new list with the non-empty labels in their original order.
    """
    kept = []
    for label in text:
        if label != '':
            kept.append(label)
    return kept

# trim leading/trailing whitespace from every label
def remove_space(text):
    """Return a new list with ``.strip()`` applied to each label in ``text``.

    Args:
        text: iterable of label strings for one row.

    Returns:
        List of stripped labels, same order and length as the input.
    """
    stripped = []
    for label in text:
        stripped.append(label.strip())
    return stripped

# rewrite every occurrence of one label as another, in place
def replace_label(df, src, trg):
    """Replace label ``src`` with ``trg`` in every row of ``df['target']``.

    The 'target' column of ``df`` is reassigned in place; nothing is returned.

    Args:
        df: dataframe whose 'target' column holds a list of labels per row.
        src: label to look for.
        trg: label written in place of ``src``.
    """
    def _swap(labels):
        swapped = []
        for item in labels:
            swapped.append(trg if item == src else item)
        return swapped

    df['target'] = df['target'].map(_swap)

# collect rare labels that carry no sentiment word
def get_noisy_labels(df):
    """List labels that occur fewer than 5 times and name no sentiment.

    A label counts as noisy when its total frequency across ``df.target`` is
    below 5 and neither 'positive' nor 'negative' appears among its
    whitespace-separated words. Missing values produced by exploding empty
    label lists are skipped.

    Args:
        df: dataframe whose ``target`` column holds a list of labels per row.

    Returns:
        List of noisy label strings, in first-seen order.
    """
    tallies = Counter(df.target.explode())
    rare_without_sentiment = []
    for name, freq in tallies.items():
        if pd.isna(name) or freq >= 5:
            continue
        words = name.split()
        if 'positive' in words or 'negative' in words:
            continue
        rare_without_sentiment.append(name)

    return rare_without_sentiment

# remove noisy labels from the dataframe
def remove_noisy_labels(df, outlier_indices=(384,)):
    """Drop known outlier rows, strip noisy labels and discard label-less rows.

    The input frame and the label lists it holds are left untouched; a new
    frame is returned.

    Args:
        df: dataframe with a 'target' column holding a list of labels per row.
        outlier_indices: index labels of rows to drop up front. Defaults to
            row 384 (an outlier with 14 labels). Index labels that are not
            present are ignored instead of raising KeyError.

    Returns:
        Dataframe containing only rows that still have at least one label.
    """
    print("Removing noisy labels...")
    df = df.drop(index=list(outlier_indices), errors='ignore')
    noisy_labels = set(get_noisy_labels(df))

    def strip_noisy(labels):
        # build a fresh list so the caller's label lists are not mutated
        return [label for label in labels if label not in noisy_labels]

    # address the column by name rather than by position
    df = df.assign(target=df['target'].map(strip_noisy))

    # remove datapoints that no longer have any labels
    df = df[df["target"].str.len() != 0]

    return df

# combine labels that have very low frequency into a single label per sentiment
def combine_labels(df,min_samples = 100):
    """Merge infrequent labels into 'extra <sentiment>' buckets.

    Every label whose total count over ``df.target`` is below ``min_samples``
    is replaced by ``'extra '`` followed by the label's last space-separated
    word. The 'target' column of ``df`` is reassigned in place and the same
    frame is returned.

    Args:
        df: dataframe with a 'target' column holding a list of labels per row.
        min_samples: minimum count a label needs to be kept unchanged.

    Returns:
        ``df`` with its 'target' column rewritten.
    """
    print("Combining labels...")
    label_counts = df.target.explode().value_counts()

    # Select rare labels with a boolean mask: the Series is indexed by label
    # string, so integer position lookups (label_counts[i]) are deprecated.
    fewer_labels = set(label_counts[label_counts < min_samples].index)

    def replace_fewer(labels):
        merged = []
        for label in labels:
            if label in fewer_labels:
                sentiment = label.split(' ')[-1]
                merged.append(' '.join(['extra', sentiment]))
            else:
                merged.append(label)
        return merged

    df['target'] = df['target'].map(replace_fewer)

    return df

# undersample rows whose label list is exactly a given (very frequent) combination
def undersample_labels(df, labels, frac, random_state=None):
    """Keep only a random fraction of the rows whose target equals ``labels``.

    Matching is exact list equality, so it is sensitive to label order.
    Rows that do not match are always kept and come first in the result,
    followed by the sampled matching rows.

    Args:
        df: dataframe with a ``target`` column holding a list of labels per row.
        labels: the exact label list to undersample.
        frac: fraction of matching rows to keep (passed to ``DataFrame.sample``).
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            sampling can be reproduced; ``None`` keeps it unseeded.

    Returns:
        New dataframe with the non-matching rows plus the sampled matches.
    """
    is_match = df.target.apply(lambda x: x == labels).astype(bool)
    sampled = df[is_match].sample(frac=frac, random_state=random_state)
    # selecting by mask (not by index label) stays correct with duplicate index values
    return pd.concat([df[~is_match], sampled])

# encode labels for training
def encode_labels(df, encoder_path='encoder.pkl'):
    """Multi-hot encode the label lists and save the fitted encoder.

    Args:
        df: dataframe with 'text' and 'target' columns, where 'target' holds a
            list of labels per row. The input frame is not modified.
        encoder_path: file the fitted ``MultiLabelBinarizer`` is pickled to.
            Any existing file is overwritten.

    Returns:
        New dataframe with columns 'text' and 'encoded', where 'encoded' is
        the binarizer output for the row converted to a plain list.
    """
    print("Encoding Labels...")
    le = MultiLabelBinarizer()
    encoded = le.fit_transform(df.target.tolist()).tolist()
    df = df.assign(encoded=encoded)[['text','encoded']]

    # 'wb', not 'ab': appending would leave the encoder from an earlier run
    # first in the file, and that stale one is what pickle.load returns.
    with open(encoder_path, 'wb') as encoder_file:
        pickle.dump(le, encoder_file)

    return df

# split the dataset into train/test parts and save both
def split_and_save(df, split_size = 0.2, random_state=None, train_path='train.pkl', test_path='test.pkl'):
    """Split ``df`` with ``train_test_split`` and pickle both parts.

    Both parts get a fresh 0..n-1 index before being written.

    Args:
        df: dataframe to split.
        split_size: value passed as ``test_size`` to ``train_test_split``.
        random_state: optional seed passed to ``train_test_split`` so the
            split can be reproduced; ``None`` keeps it unseeded.
        train_path: file the training part is pickled to.
        test_path: file the test part is pickled to.

    Returns:
        True once both files have been written.
    """
    df_train, df_test = train_test_split(df, test_size=split_size, random_state=random_state)
    df_train.reset_index(drop=True).to_pickle(train_path)
    df_test.reset_index(drop=True).to_pickle(test_path)
    print("Preprocessed and Saved...")

    return True

# load the raw CSV and run the whole preprocessing pipeline
def loader(dfPath, num_label_columns=14):
    """Read the header-less CSV at ``dfPath`` and return the encoded dataframe.

    The first CSV column is treated as the text and the following
    ``num_label_columns`` columns as labels. Labels are gathered into one
    list-valued 'target' column, cleaned, de-noised, merged, undersampled and
    finally encoded by ``encode_labels``.

    Args:
        dfPath: path of the CSV file (read with ``header=None``).
        num_label_columns: number of label columns after the text column.
            The CSV must have exactly ``1 + num_label_columns`` columns.

    Returns:
        The dataframe returned by ``encode_labels``.
    """
    df = pd.read_csv(dfPath,header = None)
    df = df.fillna('')

    # gather all the label columns into one list-valued column
    labels = ['label_' + str(idx) for idx in range(1, num_label_columns + 1)]
    df.columns = ['text'] + labels

    df['target'] = df[labels].values.tolist()

    df['target'] = df['target'].map(remove_empty)
    df['target'] = df['target'].map(remove_space)

    # copy so that the in-place edits made by replace_label act on an
    # independent frame rather than on a slice of the wide frame
    df = df[['text','target']].copy()

    # replacing labels that are the same category but spelled differently
    replace_label(df, 'advisor/agent service positive','advisoragent service positive')
    replace_label(df, 'advisor/agent service negative','advisoragent service negative')
    replace_label(df, 'tyre age/dot code negative','tyre agedot code negative')

    # removing noisy labels
    df = remove_noisy_labels(df)
    # combining labels that have frequency less than 100
    df = combine_labels(df)
    # undersampling datapoints of high-frequency label combinations
    df = undersample_labels(df, ['value for money positive'], 0.1)
    df = undersample_labels(df, ['garage service positive'], 0.2)
    df = undersample_labels(df, ['value for money positive','garage service positive'], 0.2)
    # encoding the labels for training
    df = encode_labels(df)

    return df




In [12]:
# Run the preprocessing pipeline on the raw CSV, then split the result and write the train/test pickles.
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists.
df = loader(r"C:\Users\amolk\OneDrive\Desktop\Asha_assign\Evaluation-dataset.csv")
split_and_save(df)

Removing noisy labels...
Combining labels...
Encoding Labels...
Preprocessed and Saved...


  if label_counts[i] < min_samples:


True

In [13]:
import torch
import pandas as pd
from torch.utils.data import Dataset

# PyTorch dataset used to train the BERT model
class SentimentDataset(Dataset):
    """Dataset that tokenizes the texts of a pickled dataframe on access.

    Args:
        df_path: path of a pickled dataframe with 'text' and 'encoded'
            columns. It is loaded with ``pd.read_pickle``, so only trusted
            files should be passed.
        tokenizer: object exposing ``encode_plus`` (the BERT tokenizer).
        max_len: length every tokenized text is padded/truncated to.
    """

    def __init__(self, df_path, tokenizer, max_len):
        self.df = pd.read_pickle(df_path)
        self.texts = self.df.text
        self.targets = self.df.encoded
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the dataframe."""
        return len(self.df)

    def __getitem__(self,index):
        """Return the tokenized text and target of the row at position ``index``.

        Returns:
            Dict with long tensors 'ids', 'mask', 'token_type_ids' (tokenizer
            output) and 'targets' (the row's 'encoded' value).
        """
        # positional access: correct even when the dataframe index is not 0..n-1
        text = self.texts.iloc[index]
        target = self.targets.iloc[index]
        # encoding the text with the pretrained BERT tokenizer
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [14]:
from torch.utils.data import DataLoader

# create datasets from the preprocessed train and test files and return data loaders for both
def get_loader(train_batch_size=32, test_batch_size = 8, shuffle=True, num_workers=8, pin_memory=True):
    """Build the training and test ``DataLoader`` objects.

    Datasets are read from 'train.pkl' and 'test.pkl' and tokenized with the
    notebook-level ``tokenizer`` at length ``MAX_LEN``.

    Args:
        train_batch_size: batch size of the training loader.
        test_batch_size: batch size of the test loader.
        shuffle: whether the training loader shuffles. The test loader never
            shuffles, so evaluation visits the rows in a fixed order.
        num_workers: worker processes used by each loader.
        pin_memory: forwarded to both loaders.

    Returns:
        Tuple ``(trainLoader, testLoader, tokenizer)``.
    """
    trainDataset = SentimentDataset('train.pkl', tokenizer, MAX_LEN)
    testDataset = SentimentDataset('test.pkl', tokenizer, MAX_LEN)

    shared_options = dict(num_workers=num_workers, pin_memory=pin_memory)

    trainLoader = DataLoader(
        dataset=trainDataset,
        batch_size=train_batch_size,
        shuffle=shuffle,
        **shared_options
    )

    testLoader = DataLoader(
        dataset=testDataset,
        batch_size=test_batch_size,
        shuffle=False,  # evaluation does not need (or want) a random order
        **shared_options
    )

    return trainLoader, testLoader, tokenizer

In [15]:
import torch
from transformers import BertModel, BertPreTrainedModel, BertTokenizer

# pretrained BERT encoder with a dropout + linear classification head
class SentimentMultilabel(BertPreTrainedModel):
    """Multi-label classifier on top of the pooled BERT output.

    Args:
        num_labels: number of output logits.
        conf: configuration object passed to ``BertPreTrainedModel.__init__``.
            The encoder weights themselves are loaded from
            ``PRE_TRAINED_MODEL``.
    """

    def __init__(self, num_labels, conf):
        super(SentimentMultilabel, self).__init__(conf)
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL)
        self.drop = torch.nn.Dropout(0.4)
        # size the head from the loaded encoder (768 for bert-base) instead of hard-coding it
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the raw logits (no sigmoid applied) for a batch."""
        encoder_outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Index instead of tuple-unpacking: when the encoder returns a
        # dict-like ModelOutput, unpacking yields its keys (strings), while
        # [1] is the pooled output for both tuple and ModelOutput returns.
        pooled_output = encoder_outputs[1]
        output = self.drop(pooled_output)
        output = self.classifier(output)
        return output

# pretrained BERT-large encoder with a dropout + linear classification head
class SentimentMultilabelLarge(BertPreTrainedModel):
    """Multi-label classifier on top of the pooled BERT output (large variant).

    Args:
        num_labels: number of output logits.
        conf: configuration object passed to ``BertPreTrainedModel.__init__``.
            The encoder weights themselves are loaded from
            ``PRE_TRAINED_MODEL``.
    """

    def __init__(self, num_labels, conf):
        super(SentimentMultilabelLarge, self).__init__(conf)
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL)
        self.drop = torch.nn.Dropout(0.4)
        # size the head from the loaded encoder (1024 for bert-large) instead of hard-coding it
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the raw logits (no sigmoid applied) for a batch."""
        encoder_outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Index instead of tuple-unpacking: when the encoder returns a
        # dict-like ModelOutput, unpacking yields its keys (strings), while
        # [1] is the pooled output for both tuple and ModelOutput returns.
        pooled_output = encoder_outputs[1]
        output = self.drop(pooled_output)
        output = self.classifier(output)
        return output

In [16]:
import torch
import numpy as np
import pickle
from transformers import BertTokenizer, BertConfig
                               
# Module-level settings read by train(); most are aliases of the config-cell constants.
# NOTE(review): train() binds a local variable that is also called `model` (the network),
# so this string is shadowed inside that function.
model = "base"
# no-op re-binding; requires the config cell that defines `device` to have run
device = device
# default BertConfig(); passed as `conf` to the model constructors
model_config = BertConfig()
num_labels = NUM_LABELS
lr = LEARNING_RATE
epochs = EPOCHS

# per-epoch metric histories (one list per metric), kept for analysing model performance
eval_metrics = {
    metric_name: []
    for metric_name in (
        "epochs",
        "train_loss",
        "val_loss",
        "training_f1_micro",
        "training_f1_macro",
        "val_f1_micro",
        "val_f1_macro",
        "training_hamming_loss",
        "val_hamming_loss",
    )
}

# binary cross-entropy on raw logits
def loss_fun(outputs, targets):
    """Return ``torch.nn.BCEWithLogitsLoss`` (default settings) of ``outputs`` vs ``targets``.

    Args:
        outputs: raw model logits.
        targets: float tensor of the same shape as ``outputs``.

    Returns:
        Scalar loss tensor.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

# function to train the model
def train(model_name):
    """Fine-tune the classifier, evaluate after every epoch and save the results.

    Reads the module-level settings ``num_labels``, ``model_config``, ``lr``,
    ``epochs`` and ``device``, and records per-epoch metrics in the
    module-level ``eval_metrics`` dict.

    Args:
        model_name: "base" selects ``SentimentMultilabel``; any other value
            selects ``SentimentMultilabelLarge``. The same value decides
            whether metrics are saved under 'bert_base' or 'bert_large'.

    Returns:
        True when training has finished and the metrics and checkpoint are saved.
    """
    model_cls = SentimentMultilabel if model_name == "base" else SentimentMultilabelLarge
    model = model_cls(num_labels, model_config).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

    # creating the training and validation data loaders
    trainLoader, testLoader, _ = get_loader()

    # start from empty histories so re-running train() in the same kernel
    # does not append to the results of an earlier run
    for history in eval_metrics.values():
        history.clear()

    for epoch in range(1,epochs+1):
        eval_metrics["epochs"].append(epoch)
        model.train()
        # sample-weighted running loss, so the reported value is the epoch
        # mean rather than the loss of whichever batch came last
        running_loss = 0.0
        num_samples = 0
        # actual labels and predictions of this epoch, for the metrics
        train_targets = []
        train_outputs = []
        for step, data in enumerate(trainLoader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fun(outputs, targets)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            batch_size = targets.size(0)
            running_loss += batch_loss * batch_size
            num_samples += batch_size
            train_targets.extend(targets.detach().cpu().tolist())
            train_outputs.extend(torch.sigmoid(outputs).detach().cpu().tolist())
            if step % 50 == 0:
                print(f'Epoch: {epoch}, Loss:  {batch_loss}')

        epoch_loss = running_loss / max(num_samples, 1)

        # calculating the evaluation scores for both training and validation data
        train_f1_micro, train_f1_macro, train_hamming,train_loss = print_metrics(train_targets,train_outputs,epoch_loss, 'Training')
        val_f1_micro, val_f1_macro, val_hamming, val_loss = validate(model, testLoader)
        eval_metrics['training_f1_micro'].append(train_f1_micro)
        eval_metrics['training_f1_macro'].append(train_f1_macro)
        eval_metrics['training_hamming_loss'].append(train_hamming)
        eval_metrics['val_f1_micro'].append(val_f1_micro)
        eval_metrics['val_f1_macro'].append(val_f1_macro)
        eval_metrics['val_hamming_loss'].append(val_hamming)
        eval_metrics["train_loss"].append(train_loss)
        eval_metrics["val_loss"].append(val_loss)

    # saving the metrics and trained model for inference and model analysis;
    # the file name is chosen from model_name (the local `model` is the network, not a string)
    save_metrics(eval_metrics,'bert_base' if model_name == 'base' else 'bert_large')
    checkpoint = {"state_dict": model.state_dict()}
    save_checkpoint(checkpoint)
    return True

In [17]:
import torch 
import pickle
import numpy as np
from sklearn import metrics

# take true and predicted labels, then calculate and print multiple metrics
def print_metrics(true, pred, loss, type, threshold=0.35):
    """Threshold the predictions, print a metric report and return key scores.

    Args:
        true: true multi-hot label rows.
        pred: predicted scores, same layout as ``true``.
        loss: loss value to print and pass through.
        type: text inserted in the report header (e.g. 'Training').
        threshold: a score greater than or equal to this counts as a
            predicted label. Defaults to 0.35.

    Returns:
        Tuple ``(f1_micro, f1_macro, hamming_loss, loss)``.
    """
    pred = np.array(pred) >= threshold
    hamming_loss = metrics.hamming_loss(true,pred)

    # precision, recall and F1 for each averaging mode; undefined scores count as 1
    scores = {}
    for average in ('micro', 'macro'):
        scores[average] = (
            metrics.precision_score(true, pred, average=average, zero_division=1),
            metrics.recall_score(true, pred, average=average, zero_division=1),
            metrics.f1_score(true, pred, average=average, zero_division=1),
        )
    f1_score_micro = scores['micro'][2]
    f1_score_macro = scores['macro'][2]

    print("-------{} Evaluation--------".format(type))
    print("BCE Loss: {:.4f}".format(loss))
    print("Hamming Loss: {:.4f}".format(hamming_loss))
    print("Precision Micro: {:.4f}, Recall Micro: {:.4f}, F1-measure Micro: {:.4f}".format(*scores['micro']))
    print("Precision Macro: {:.4f}, Recall Macro: {:.4f}, F1-measure Macro: {:.4f}".format(*scores['macro']))
    print("------------------------------------")
    return f1_score_micro, f1_score_macro, hamming_loss, loss

# function to save the metrics for model analysis
def save_metrics(eval_metrics,file_name):
    """Pickle ``eval_metrics`` to '<file_name>_metrics.pkl'.

    Any existing file is overwritten, so loading it always gives the metrics
    of the most recent call.

    Args:
        eval_metrics: object to pickle (the metric histories).
        file_name: prefix of the output file name.

    Returns:
        True once the file has been written.
    """
    # 'wb', not 'ab': appended pickles are never reached by a single pickle.load
    with open('{}_metrics.pkl'.format(file_name), 'wb') as metrics_file:
        pickle.dump(eval_metrics, metrics_file)
    return True

# function to save the model for inference
def save_checkpoint(state, filename=MODEL_PATH):
    """Write ``state`` to ``filename`` with ``torch.save``.

    Args:
        state: object to serialize (train() passes a dict holding the model's state_dict).
        filename: destination path; defaults to ``MODEL_PATH``.
    """
    print("=> Saving Model")
    torch.save(obj=state, f=filename)

In [18]:
import torch
# no-op re-binding of the `device` chosen in the config cell; raises NameError if that cell has not run
device = device

def loss_fun(outputs, targets):
    """Binary cross-entropy with logits between ``outputs`` and ``targets``.

    Same computation as the ``loss_fun`` defined in the training cell; this
    later definition is the one in effect once this cell has run.

    Args:
        outputs: raw model logits.
        targets: float tensor of the same shape as ``outputs``.

    Returns:
        Scalar loss tensor.
    """
    bce_with_logits = torch.nn.BCEWithLogitsLoss()
    batch_loss = bce_with_logits(outputs, targets)
    return batch_loss

# function to evaluate the model on the validation data
def validate(model, testLoader):
    """Run ``model`` over ``testLoader`` without gradients and report metrics.

    The model is switched to eval mode. The reported loss is the
    sample-weighted mean of the batch losses over the whole loader.

    Args:
        model: network called as ``model(ids, mask, token_type_ids)``.
        testLoader: iterable of batches with keys 'ids', 'mask',
            'token_type_ids' and 'targets'.

    Returns:
        Whatever ``print_metrics`` returns for the 'Validation' report.
    """
    model.eval()
    val_targets = []
    val_outputs = []
    running_loss = 0.0
    num_samples = 0
    with torch.no_grad():
        for data in testLoader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)

            # accumulate instead of overwriting, so the result covers every batch
            batch_size = targets.size(0)
            running_loss += loss_fun(outputs, targets).item() * batch_size
            num_samples += batch_size
            val_targets.extend(targets.cpu().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().tolist())

    epoch_loss = running_loss / max(num_samples, 1)
    return print_metrics(val_targets,val_outputs, epoch_loss,'Validation')

In [19]:
# Train with the base classifier class (any other argument selects SentimentMultilabelLarge).
# The pretrained weights are chosen by PRE_TRAINED_MODEL in the config cell, not by this argument.
train("base")