<a href="https://colab.research.google.com/github/BDH-teacher/Deep_Learning_Audit_code/blob/main/Data_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json

## Collected Dataset
# Each entry is a (sentence, label) pair; labels are the strings '0' / '1'.
data = [
    ('hide new secretions from the parental units', '0'),
    ('contains no wit , only labored gags', '0'),
    ('that loves its characters and communicates something rather beautiful about human nature', "1"),
    ('remains utterly satisfied to remain the same throughout ', "0"),
    ('on the worst revenge-of-the-nerds clichés the filmmakers could dredge up', "0"),
    ("that 's far too tragic to merit such superficial treatment", "0"),
    ("demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop .", "1"),
    ("of saucy", "1"),
    ("a depressed fifteen-year-old 's suicidal poetry", "0"),
    ("are more deeply thought through than in most ` right-thinking ' films", "1"),
    ("goes to absurd lengths", "0"),
    ("for those moviegoers who complain that ` they do n't make movies like they used to anymore", "0"),
    ("the part where nothing 's happening , ", "0"),
    ("saw how bad this movie was", "0"),
    ("lend some dignity to a dumb story", "0"),
    ("the greatest musicians", "1"),
    ("cold movie ", "0"),
]


def clean_text(text):
    """Normalize one raw sentence.

    Removes commas, collapses every run of whitespace into a single space,
    strips leading/trailing whitespace and lower-cases the result.

    Commas are removed *before* whitespace is collapsed: removing " , " leaves
    two adjacent spaces, so collapsing first would let double spaces survive.
    An empty string is returned unchanged instead of raising.
    """
    text = text.replace(",", "")   # drop commas
    text = " ".join(text.split())  # collapse whitespace and strip both ends
    return text.lower()


### Training data pre-processing
label2idx = {'0':0, '1':1}
texts = [clean_text(sentence) for sentence, _ in data]
# An unknown label string raises KeyError here rather than being silently skipped.
labels = [label2idx[raw_label] for _, raw_label in data]

print("*"*50)
print("Total number of datasets")
print(len(texts))
print(len(labels))
print("*"*50)




**************************************************
Total number of datasets
17
17
**************************************************


In [2]:
#### Split into train/dev/test sets using library
from sklearn.model_selection import train_test_split
### Write code for collecting the samples of each class
# `labels` holds the integer ids produced by label2idx (0 / 1), so the positive
# class must be matched against the int 1. Comparing against the string '1'
# never matches and would send every sample to `neg`.
# Note: pos/neg are collected for inspection only; the split below does not use them.
pos,neg = [], []
for a,b in zip(texts,labels):
  if b == 1:
    pos.append((a,b))
  else:
    neg.append((a,b))

# First hold out 10% as the test set, then carve 10% of the remainder off as dev.
# random_state is fixed so the split is reproducible.
rest_texts, test_texts, rest_labels, test_labels = train_test_split(texts, labels, test_size=0.1, random_state=1)
train_texts, dev_texts, train_labels, dev_labels = train_test_split(rest_texts, rest_labels, test_size=0.1, random_state=1)

# Bundle the three splits in one dictionary keyed by split name.
data_dict = {'train':{}, 'dev':{}, 'test':{}}
data_dict["train"]["texts"] = train_texts
data_dict["dev"]["texts"] = dev_texts
data_dict["test"]["texts"] = test_texts
data_dict["train"]["labels"] = train_labels
data_dict["dev"]["labels"] = dev_labels
data_dict["test"]["labels"] = test_labels

print("Train Dataset Examples")
print(train_texts[:3])
print(train_labels[:3])
print("*"*50)
print(train_labels)
print(dev_labels)

Train Dataset Examples
['lend some dignity to a dumb story', 'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small  personal film with an emotional wallop .', "are more deeply thought through than in most ` right-thinking ' films"]
[0, 1, 1]
**************************************************
[0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0]
[0, 0]


In [3]:
## Construct a vocabulary
from collections import Counter

# Gather every whitespace-separated token from the train and dev sentences
# (train tokens first, then dev tokens).
all_words = [
    token
    for split_texts in (train_texts, dev_texts)
    for sentence in split_texts
    for token in sentence.split()
]

## Map words to integer ids: most frequent word first, ties kept in
## first-seen order; ids 0 and 1 are reserved for <pad> and <unk>.
counts = Counter(all_words)
vocab = [word for word, _ in counts.most_common()]
vocab_to_int = {'<pad>':0, "<unk>":1}
vocab_to_int.update(zip(vocab, range(2, len(vocab) + 2)))
print(vocab_to_int)
print(len(vocab_to_int))

{'<pad>': 0, '<unk>': 1, 'the': 2, 'to': 3, 'that': 4, 'a': 5, "'s": 6, 'of': 7, 'such': 8, '`': 9, 'they': 10, 'lend': 11, 'some': 12, 'dignity': 13, 'dumb': 14, 'story': 15, 'demonstrates': 16, 'director': 17, 'hollywood': 18, 'blockbusters': 19, 'as': 20, 'patriot': 21, 'games': 22, 'can': 23, 'still': 24, 'turn': 25, 'out': 26, 'small': 27, 'personal': 28, 'film': 29, 'with': 30, 'an': 31, 'emotional': 32, 'wallop': 33, '.': 34, 'are': 35, 'more': 36, 'deeply': 37, 'thought': 38, 'through': 39, 'than': 40, 'in': 41, 'most': 42, 'right-thinking': 43, "'": 44, 'films': 45, 'on': 46, 'worst': 47, 'revenge-of-the-nerds': 48, 'clichés': 49, 'filmmakers': 50, 'could': 51, 'dredge': 52, 'up': 53, 'loves': 54, 'its': 55, 'characters': 56, 'and': 57, 'communicates': 58, 'something': 59, 'rather': 60, 'beautiful': 61, 'about': 62, 'human': 63, 'nature': 64, 'for': 65, 'those': 66, 'moviegoers': 67, 'who': 68, 'complain': 69, 'do': 70, "n't": 71, 'make': 72, 'movies': 73, 'like': 74, 'used': 

In [4]:
import torch
def encode_sentence(sentence, max_length=50, vocab=None):
    """
    Encode a whitespace-tokenized sentence into fixed-length id tensors.

    Args:
        sentence: input text; tokens are obtained with ``str.split()``.
        max_length: length of every returned tensor. Longer sentences are
            truncated to their first ``max_length`` tokens (previously they
            produced a negative padding length and failed the length assert).
        vocab: mapping token -> id containing '<pad>' and '<unk>'. Defaults to
            the notebook-level ``vocab_to_int``.

    Returns:
        Tuple ``(input_ids, segment_ids, attention_mask)`` of int64 tensors of
        shape ``(max_length,)``. ``attention_mask`` is 1 on real tokens and 0
        on padding; ``segment_ids`` is all zeros.
    """
    if vocab is None:
        vocab = vocab_to_int
    unk_id = vocab['<unk>']

    # Out-of-vocabulary tokens map to <unk>; keep at most max_length tokens.
    input_ids = [vocab.get(token, unk_id) for token in sentence.split()][:max_length]
    num_tokens = len(input_ids)
    padding_length = max_length - num_tokens

    # Right-pad all three sequences to max_length.
    input_ids = input_ids + [vocab['<pad>']] * padding_length
    segment_ids = [0] * max_length
    attention_mask = [1] * num_tokens + [0] * padding_length

    for input_elem in (input_ids, segment_ids, attention_mask):
        assert len(input_elem) == max_length
    return (
        torch.tensor(input_ids).long(),
        torch.tensor(segment_ids).long(),
        torch.tensor(attention_mask).long(),
    )

def encode_label(label):
    """Convert a label into an int64 torch tensor."""
    label_tensor = torch.tensor(label)
    return label_tensor.long()



In [5]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """
    In-memory dataset of (sentence, label) pairs.

    Samples are encoded lazily on first access with ``encode_sentence`` and
    ``encode_label`` and the encoded tuple is cached per index, so repeated
    access does not re-encode.
    """
    def __init__(self, sentences, labels):
        """Store the raw sentences and their labels (parallel sequences)."""
        self.sentences = sentences
        self.labels = labels
        # index -> (input_ids, segment_ids, attention_mask, label_id)
        self.cache = {}

    def __len__(self):
        """Number of samples."""
        return len(self.sentences)

    def __getitem__(self, i):
        """
        Return ``(input_ids, segment_ids, attention_mask, label_id)`` for sample ``i``.

        Raises IndexError for an out-of-range index (from the underlying sequences).
        """
        res = self.cache.get(i, None)
        if res is None:
            sentence = self.sentences[i]
            label = self.labels[i]
            input_ids, segment_ids, attention_mask = encode_sentence(sentence)
            label_id = encode_label(label)
            res = (input_ids, segment_ids, attention_mask, label_id)
            self.cache[i] = res
        # The return must sit outside the `if`: otherwise a cache hit falls
        # through and yields None on every access after the first.
        return res


In [6]:
# Wrap the three in-memory splits; encoding happens lazily on item access.
train_dataset = TextDataset(train_texts, train_labels)
print(f'train samples = {len(train_dataset)}')
dev_dataset = TextDataset(dev_texts, dev_labels)
print(f'dev samples = {len(dev_dataset)}')
test_dataset = TextDataset(test_texts, test_labels)
print(f'test samples = {len(test_dataset)}')


# Only the training loader is shuffled; dev/test keep a fixed order so that
# evaluation runs iterate the samples deterministically.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Inspect the encoded dev samples: (input_ids, segment_ids, attention_mask, label_id)
for item in dev_dataset:
  print(item)


train samples = 13
dev samples = 2
test samples = 2
(tensor([102,   3, 103, 104,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]), tensor([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]), tensor(0))
(tensor([105, 106, 107, 108,   2, 109, 110,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0]), tensor([0, 0, 0

In [7]:
from google.colab import drive

# Mount Google Drive at /content/gdrive so that the TSV files read by later
# cells (paths under /content/gdrive/MyDrive/...) are reachable.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — this cell will not run elsewhere; confirm before reusing locally.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [8]:

from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """
    File-backed dataset: samples are read once through ``processor`` and
    encoded lazily, with the encoded tuples memoized per index.
    """

    def __init__(self,path,processor):
        # processor.load_samples(path) supplies the (sentence, label) pairs.
        self.samples = processor.load_samples(path)
        self.cache = {}

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, i):
        """Return (input_ids, segment_ids, attention_mask, label_id) for sample i."""
        cached = self.cache.get(i)
        if cached is not None:
            return cached

        sentence, label = self.samples[i]
        input_ids, segment_ids, attention_mask = encode_sentence(sentence)
        encoded = (input_ids, segment_ids, attention_mask, encode_label(label))
        self.cache[i] = encoded
        return encoded




In [9]:
import csv
from tqdm import tqdm
class SST2Processor:
    """Data loader for SST-2 style TSV files (sentence <TAB> label, with a header row)."""

    def __init__(self):
        # Maps the label strings found in the file to integer class ids.
        self.label_map = {'0': 0, '1': 1}

    def valid_inputs(self, sentence1, label):
        """Return True when the sentence is non-empty and the label is a known key."""
        return len(sentence1) > 0 and label in self.label_map

    def load_samples(self, path,train=True):
        """
        Read ``path`` and return a list of ``(sentence, label_id)`` tuples.

        Args:
            path: path of a tab-separated file whose first line is a header.
            train: currently unused; kept so existing callers keep working.

        Rows with fewer than two columns, an empty sentence, or an unknown
        label are skipped. An empty file yields an empty list.
        """
        samples = []
        # utf-8 is set explicitly so non-ASCII text does not depend on the locale.
        with open(path, newline='', encoding='utf-8') as f:
            # QUOTE_NONE: the file is plain TSV, so a '"' inside a sentence is
            # ordinary text. With the default quoting a sentence starting with
            # '"' would swallow the following tabs/newlines into one field.
            reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
            next(reader, None)  # skip header (tolerate an empty file)
            desc = f'loading \'{path}\''
            for row in tqdm(reader, desc=desc):
                if len(row) < 2:
                    # blank or malformed line: no label column to read
                    continue
                sentence, label = row[0], row[1]
                if self.valid_inputs(sentence, label):
                    samples.append((sentence, self.label_map[label]))
        return samples



In [None]:
# Load the three SST-2 splits from the mounted Drive folder.
processor = SST2Processor()
train_dataset = TextDataset("/content/gdrive/MyDrive/data/SST2/train.tsv", processor)
print(f'train samples = {len(train_dataset)}')
dev_dataset = TextDataset("/content/gdrive/MyDrive/data/SST2/dev.tsv", processor)
print(f'dev samples = {len(dev_dataset)}')
test_dataset = TextDataset("/content/gdrive/MyDrive/data/SST2/test.tsv", processor)
print(f'test samples = {len(test_dataset)}')

# Preview only the first few encoded dev samples; printing the whole split
# floods the cell output and encodes every sample just for display.
for i in range(min(3, len(dev_dataset))):
  print(dev_dataset[i])

In [11]:
# Tokenizers already loaded, keyed by model name, so the tokenizer is built
# once instead of on every encode_sentence call.
_BERT_TOKENIZERS = {}


def _get_bert_tokenizer(model_name):
    """Return the tokenizer for ``model_name``, loading it on first use only."""
    tokenizer = _BERT_TOKENIZERS.get(model_name)
    if tokenizer is None:
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _BERT_TOKENIZERS[model_name] = tokenizer
    return tokenizer


def encode_sentence(sentence, max_length=256):
    """
    Encode a sentence with the 'bert-base-uncased' tokenizer.

    Args:
        sentence: input text.
        max_length: length of every returned tensor; longer inputs are
            truncated by the tokenizer, shorter ones are right-padded.

    Returns:
        Tuple ``(input_ids, segment_ids, attention_mask)`` of int64 tensors of
        shape ``(max_length,)``, passed through ``cuda``. ``attention_mask`` is
        1 on tokenizer output and 0 on padding; ``segment_ids`` is all zeros.
    """
    BERT_MODEL = 'bert-base-uncased'
    tokenizer = _get_bert_tokenizer(BERT_MODEL)

    # truncation=True is required for max_length to be enforced; without it an
    # over-long sentence yields more than max_length ids and the padding
    # length below goes negative.
    input_ids = list(tokenizer.encode(
        sentence, max_length=max_length, truncation=True
    ))[:max_length]
    num_tokens = len(input_ids)
    padding_length = max_length - num_tokens

    # Right-pad all three sequences to max_length.
    input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
    segment_ids = [0] * max_length
    attention_mask = [1] * num_tokens + [0] * padding_length

    for input_elem in (input_ids, segment_ids, attention_mask):
        assert len(input_elem) == max_length
    return (
        cuda(torch.tensor(input_ids).long()),
        cuda(torch.tensor(segment_ids).long()),
        cuda(torch.tensor(attention_mask).long()),
    )

def encode_label(label):
    """Build a tensor from the label, pass it through ``cuda`` and cast it to int64."""
    label_tensor = torch.tensor(label)
    placed = cuda(label_tensor)
    return placed.long()




In [12]:
def cuda(tensor):
    """Return ``tensor.cuda()`` when CUDA is available, else the tensor unchanged."""
    return tensor.cuda() if torch.cuda.is_available() else tensor

In [None]:
# Recreate the SST-2 datasets after encode_sentence / encode_label were
# redefined above: TextDataset caches encoded samples per instance, so fresh
# instances guarantee no sample encoded by an earlier definition is reused.
processor = SST2Processor()
train_dataset = TextDataset("/content/gdrive/MyDrive/data/SST2/train.tsv", processor)
print(f'train samples = {len(train_dataset)}')
dev_dataset = TextDataset("/content/gdrive/MyDrive/data/SST2/dev.tsv", processor)
print(f'dev samples = {len(dev_dataset)}')
test_dataset = TextDataset("/content/gdrive/MyDrive/data/SST2/test.tsv", processor)
print(f'test samples = {len(test_dataset)}')

# Preview only the first few encoded dev samples; printing the whole split
# floods the cell output and encodes every sample just for display.
for i in range(min(3, len(dev_dataset))):
  print(dev_dataset[i])


In [13]:
import torch
import torch.nn as nn

# Define RNN model
class RNNModel(nn.Module):
    """
    Embedding -> multi-layer RNN -> linear head for sequence classification.

    Args:
        input_size: embedding dimension, also the RNN input feature size.
        hidden_size: number of features in the RNN hidden state.
        num_layers: number of stacked RNN layers.
        output_size: width of the final linear layer's output.
        vocab_size: number of embedding rows. Defaults to ``len(vocab_to_int)``.
        padding_idx: id of the padding token. Defaults to ``vocab_to_int['<pad>']``.
    """
    def __init__(self, input_size, hidden_size, num_layers, output_size,
                 vocab_size=None, padding_idx=None):
        super().__init__()
        # Fall back to the notebook-level vocabulary when sizes are not given.
        if vocab_size is None:
            vocab_size = len(vocab_to_int)
        if padding_idx is None:
            padding_idx = vocab_to_int['<pad>']

        # Define Embedding
        self.embedding = nn.Embedding(vocab_size, input_size, padding_idx=padding_idx)

        # Define the RNN layer (inputs are batch-first: batch, seq, feature)
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)

        # Define the fully connected layer to produce outputs
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, attention_mask=None):
        """
        Args:
            x: integer token ids of shape (batch, seq_len).
            attention_mask: optional (batch, seq_len) tensor with 1 on real
                tokens and 0 on right-side padding. When given, the RNN output
                at each sequence's last real token is used. When omitted, the
                output at the final position is used, as before.

        Returns:
            Tensor of shape (batch, output_size).
        """
        embedding = self.embedding(x)
        # Forward propagate the RNN
        out, _ = self.rnn(embedding)

        if attention_mask is None:
            # Take the output from the last time step
            last = out[:, -1, :]
        else:
            # For right-padded batches the final position is padding; the
            # hidden state there has been updated by every pad step. Pick the
            # last real token instead (index 0 if a row is entirely padding).
            lengths = attention_mask.long().sum(dim=1)
            last_index = (lengths - 1).clamp(min=0)
            batch_index = torch.arange(out.size(0), device=out.device)
            last = out[batch_index, last_index]
        return self.fc(last)
# Example usage: hyper-parameters of a small demo model
input_size = 10   # embedding dimension fed to the RNN at each time step
hidden_size = 20  # size of the RNN hidden state
num_layers = 2    # stacked RNN layers
output_size = 1   # width of the final linear layer's output

# Create RNN model
model = RNNModel(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size,
)


In [None]:
# NOTE(review): this cell uses the in-memory TextDataset(sentences, labels) and
# the vocabulary-based encode_sentence. Both names are redefined by later cells
# (a (path, processor) TextDataset and a BERT encode_sentence), so when running
# top to bottom the earlier definitions must be re-run first — confirm the
# intended execution order.
train_dataset = TextDataset(train_texts, train_labels)
print(f'train samples = {len(train_dataset)}')
dev_dataset = TextDataset(dev_texts, dev_labels)
print(f'dev samples = {len(dev_dataset)}')
test_dataset = TextDataset(test_texts, test_labels)
print(f'test samples = {len(test_dataset)}')


# Only the training loader is shuffled; dev/test keep a fixed order so that
# evaluation runs iterate the samples deterministically.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

##Train API
for train_data in train_loader:
  # Unpack in the order the dataset yields its fields:
  # (input_ids, segment_ids, attention_mask, label_id).
  # `batch_labels` avoids overwriting the notebook-level `labels` list.
  input_ids, segment_ids, attention_mask, batch_labels = train_data
  out = model(input_ids)
  print(out.shape)
  ## Define a loss..
  ## Backpropagation..
