In [225]:
from configs import CONFIG
import pandas as pd
import os 
import string 
from sklearn.model_selection import  StratifiedKFold
import re 
import torch
from tqdm import tqdm

# path = "../datasets/IMDB Dataset.csv"
# df = pd.read_csv(path)

In [92]:
index2word = {idx: word for idx, word in enumerate(set(df["sentiment"].values))}
word2index = {word: idx for idx, word in index2word.items()}

df["label"] = df["sentiment"].apply(lambda x: word2index[x])

In [58]:
def preprocessing(sentence):
    sentence = sentence.translate(str.maketrans("","", string.punctuation)).lower()
    #let's remove if their any links 
    sentence = re.sub(r"https?://\s+", "", sentence)
    sentence = re.sub(r"\b\d+\b",  "", sentence)
    sentence = re.sub(r" +"," ",sentence)
    return sentence
df["preprocessedREVIEW"] = df["review"].apply(preprocessing)

In [67]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [202]:
class dataset:
    def __init__(self, sentence, label) -> None:
        self.sentences = sentence
        self.labels  = label
        self.window_size = 510
    def _getpadding(self, input_ids, attention_mask):
        # let's add the special character 
        input_ids = [0] + input_ids.cpu().tolist() if isinstance(input_ids, torch.Tensor) else input_ids +[2]
        attention_mask = [1] + attention_mask.cpu().tolist() if isinstance(attention_mask, torch.Tensor) else attention_mask +[1]

        # let's check if the length is less than the disered length in that case we change the length 
        pad_length = self.window_size - len(input_ids) +2

        if pad_length > 0:
            # we need to pad 
            input_ids += [0]*pad_length
            attention_mask += [0]*pad_length
        assert len(input_ids) == len(attention_mask)
        return input_ids, attention_mask


    def __getitem__(self, idx):
        text = self.sentences[idx]
        label = self.labels[idx]
        # let's tokenize
        tokenizedText = tokenizer.encode_plus(text, add_special_tokens=False)
        # let's padd the tokens
        input_ids = tokenizedText["input_ids"]
        attention_mask = tokenizedText["attention_mask"]
        if len(input_ids) > 510:
            splittedInput_ids = torch.tensor(input_ids).split(510)
            splittedInput_mask = torch.tensor(attention_mask).split(510)
            for inpID, inpMask in zip(splittedInput_ids, splittedInput_mask):
                assert len(inpID) == len(inpMask)
                input_ids, attention_mask = self._getpadding(inpID, inpMask)
                return idx, {
                    "input_ids": torch.tensor(input_ids),
                    "attention_mask": torch.tensor(attention_mask),
                    "labels": label
                } 
        else:    
            input_ids, attention_mask = self._getpadding(input_ids, attention_mask)
            return idx, {
                "input_ids": torch.tensor(input_ids),
                "attention_mask": torch.tensor(attention_mask),
                "labels": label
            }
    def __len__(self):
        return len(self.sentences)


In [183]:
import random
class ActiveLearning:
    def __init__(self, dataset) -> None:
        self.unlabelledDataset = dataset
        self.labelledDataset = None

    def randomSample(self, k = 200):
        labelledIndexes= random.sample(range(len(self.unlabelledDataset)), k=200)
        unlabelledIndexes = range(len(self.unlabelledDataset))
        # let's remove the labelled Indexes from unlabelled Indexes
        unlabelledIndexes = list(filter(lambda x: x not in labelledIndexes, unlabelledIndexes))
        
        self.labelledDataset = torch.utils.data.Subset(self.unlabelledDataset, labelledIndexes)
        self.unlabelledDataset= torch.utils.data.Subset(self.unlabelledDataset, unlabelledIndexes)
        # now we have the labelled dataset we can substract the labelled datset from unlabelled one
    
    def getlabelledDataset(self, labelledIndexes):
        unlabelledIndexes = range(len(self.unlabelledDataset))
        unlabelledIndexes = list(filter(lambda x: x not in labelledIndexes, unlabelledIndexes))
        self.labelledDataset = torch.utils.data.ConcatDataset([self.labelledDataset, torch.utils.data.Subset(self.unlabelledDataset, labelledIndexes)])
        self.unlabelledDataset= torch.utils.data.Subset(self.unlabelledDataset, unlabelledIndexes)

    @property
    def get_unlabelled_dataset(self):
        return len(self.unlabelledDataset)
    
    @property
    def get_labelled_dataset(self):
        return len(self.labelledDataset)





In [203]:
# k fold cross validation
class dataSplitting:
    def __init__(self, dataset, k=4) -> None:
        self.stratkFold = StratifiedKFold(n_splits=k)
        self.dataset = dataset
        self.multifold = "../datasets/"
    def splitData(self):
        X = self.dataset["preprocessedREVIEW"]
        Y = self.dataset["label"]
        for idx, (train_idx, test_idx) in enumerate(self.stratkFold.split(X,Y)):
            # let's create folder
            path = os.path.join(self.multifold, f"fold_{idx}")
            if not os.path.isdir(path):
                os.mkdir(path)
            # let's save the train and test data inside the corresponding fold 
            train = self.dataset.iloc[train_idx]
            test  = self.dataset.iloc[test_idx]
            train.to_csv(os.path.join(path, "train.csv"), index=False)
            test.to_csv(os.path.join(path, "test.csv"), index=False)




mfolds  = dataSplitting(df)

mfolds.splitData()


In [204]:
dd = dataset(df["preprocessedREVIEW"], df["label"])
acv = ActiveLearning(dd)

In [205]:
acv.randomSample()

In [206]:
acv.get_labelled_dataset, acv.get_unlabelled_dataset

(200, 49800)

In [192]:
dp = list(range(20))
acv.getlabelledDataset(dp)

In [207]:
add_ = torch.utils.data.DataLoader(acv.labelledDataset, batch_size=32)
next(iter(add_))

{'input_ids': tensor([[24483,  4198,    16,  ...,     0,     0,     0],
         [    0,  1264,    34,  ..., 41307,   784,     0],
         [  118,   303,     5,  ...,     0,     0,     0],
         ...,
         [ 9226,    16,    10,  ...,     0,     0,     0],
         [  118,    33,    45,  ...,     0,     0,     0],
         [ 9226,    16,    30,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
         1, 1, 1, 0, 1, 0, 1, 1])}

In [244]:
class strategies:
    def __init__(self, model) -> None:
        self.model = model

    def predict_proba(self, dataloader, rows):
        self.model.eval()
        probs = []
        data = torch.ones([rows, 2])
        start = 0
        with torch.no_grad():
            for  element in tqdm(dataloader):
                out = self.model(**element)
                pred = torch.softmax(out.logits, dim=-1)
                end = start + element["input_ids"].shape[0]
                data[start:end] = pred
                start = end

        return data



    def entropySampling(self, unlabelledDataset):
        dataloader = torch.utils.data.DataLoader(unlabelledDataset, batch_size=32,shuffle=False)
        probs =  self.predict_proba(dataloader, len(unlabelledDataset))
        log_prob = torch.log(probs)
        return (-probs * log_prob).sum(1)


    def entropySamplignDropout(self):
        pass

In [198]:
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels =2)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [209]:
ot = next(iter(add_))

<torch.utils.data.dataset.Subset at 0x1646db610>

In [246]:
st = strategies(model)
ot = st.entropySampling(acv.labelledDataset)
sorted(ot, reverse = True)

100%|██████████| 7/7 [00:52<00:00,  7.51s/it]


[tensor(0.6888),
 tensor(0.6887),
 tensor(0.6887),
 tensor(0.6887),
 tensor(0.6887),
 tensor(0.6887),
 tensor(0.6886),
 tensor(0.6886),
 tensor(0.6886),
 tensor(0.6885),
 tensor(0.6885),
 tensor(0.6885),
 tensor(0.6885),
 tensor(0.6884),
 tensor(0.6884),
 tensor(0.6884),
 tensor(0.6884),
 tensor(0.6884),
 tensor(0.6884),
 tensor(0.6884),
 tensor(0.6884),
 tensor(0.6884),
 tensor(0.6883),
 tensor(0.6883),
 tensor(0.6883),
 tensor(0.6883),
 tensor(0.6883),
 tensor(0.6882),
 tensor(0.6882),
 tensor(0.6882),
 tensor(0.6882),
 tensor(0.6882),
 tensor(0.6882),
 tensor(0.6882),
 tensor(0.6882),
 tensor(0.6881),
 tensor(0.6881),
 tensor(0.6881),
 tensor(0.6881),
 tensor(0.6881),
 tensor(0.6881),
 tensor(0.6881),
 tensor(0.6881),
 tensor(0.6881),
 tensor(0.6881),
 tensor(0.6881),
 tensor(0.6881),
 tensor(0.6880),
 tensor(0.6880),
 tensor(0.6880),
 tensor(0.6880),
 tensor(0.6880),
 tensor(0.6880),
 tensor(0.6880),
 tensor(0.6880),
 tensor(0.6880),
 tensor(0.6880),
 tensor(0.6880),
 tensor(0.6880

In [224]:
acv.get_labelled_dataset

200