In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import shutil
import sys   

In [2]:
url = 'https://raw.githubusercontent.com/akaAgrima/capstone/main/dataset.csv'
tdf = pd.read_csv(url, index_col=0)

In [3]:
# Filter out records ( values in clean_body and tags) that have atleast one of the top tags

x=tdf['Body'] # To store the filtered clean_body values
y=tdf['tags'] # to store the corresponding tags

In [4]:
import ast
for i in range(len(y)):
  res = ast.literal_eval(y[i])
  y[i]=tuple(res)

In [None]:
# Encode the tags(labels) in a binary format in order to be used for training

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
 
yt = mlb.fit_transform(y) 


In [None]:
#yt
mlb.classes_

array(['1', '10', '11', '12', '13', '13-A', '13-B', '14', '15', '16',
       '17', '17-A', '18', '19', '2', '20', '21', '21-A', '21-B', '22',
       '23', '23-A', '24', '25', '26', '27', '28', '28-A', '29', '3',
       '30', '4', '5', '6', '7', '8', '9'], dtype=object)

In [None]:
df = pd.DataFrame(yt, columns=mlb.classes_ )
df

Unnamed: 0,1,10,11,12,13,13-A,13-B,14,15,16,...,28-A,29,3,30,4,5,6,7,8,9
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,1,0,0,0,1
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
448,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
449,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
450,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
import re
def pre_process(text):

  # fetch alphabetic characters
  text = re.sub("[^a-zA-Z]", " ", text)

  # convert text to lower case
  text = text.lower()

  # split text into tokens to remove whitespaces
  tokens = text.split()

  return " ".join(tokens)

In [None]:
# clean the text in Body column
tdf['Body'] = tdf['Body'].apply(pre_process)

In [None]:
#see merge
train_df= pd.concat([tdf['Body'],df],axis=1)
#train_df

columns = ['1','2','3','4','5','6','7','8','9','10','11','12','13A','13B','14','15','16','17','18','19','20','21','21A','21B','21C','22','23','23A','24','25','26','27','28','28A','29']

In [None]:
#train_df.columns

In [None]:
#train_df.head()

In [None]:
target_list = ['1', '10', '11', '12', '13', '13-A', '13-B', '14', '15', '16',
       '17', '17-A', '18', '19', '2', '20', '21', '21-A', '21-B', '22',
       '23', '23-A', '24', '25', '26', '27', '28', '28-A', '29', '3',
       '30', '4', '5', '6', '7', '8', '9']
target=len(target_list)
#print(target)

In [None]:
# hyperparameters
MAX_LEN = 300
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 50
LEARNING_RATE = 0.0001

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['Body']
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

In [None]:
train_size = 0.8
train_df = train_df.sample(frac=train_size, random_state=200).reset_index(drop=True)
val_df = train_df.drop(train_df.index).reset_index(drop=True)

In [None]:
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)

In [None]:
train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss 
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [None]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 37)
    
    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids, 
            attention_mask=attn_mask, 
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
val_targets=[]
val_outputs=[]

In [None]:
def train_model(n_epochs, training_loader, validation_loader, model, 
                optimizer):
   
  # initialize tracker for minimum validation loss
  valid_loss_min = np.Inf
   
 
  for epoch in range(1, n_epochs+1):
    train_loss = 0
    valid_loss = 0

    model.train()
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for batch_idx, data in enumerate(training_loader):
        #print('yyy epoch', batch_idx)
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        #if batch_idx%5000==0:
         #   print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #print('before loss data in training', loss.item(), train_loss)
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        #print('after loss data in training', loss.item(), train_loss)
    
    print('############# Epoch {}: Training End     #############'.format(epoch))
    
    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    ######################    
    # validate the model #
    ######################
 
    model.eval()
   
    with torch.no_grad():
      for batch_idx, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

      print('############# Epoch {}: Validation End     #############'.format(epoch))
      '''# calculate average losses
      #print('before cal avg train loss', train_loss)
      train_loss = train_loss/len(training_loader)
      valid_loss = valid_loss/len(validation_loader)
      # print training/validation statistics 
      print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch, 
            train_loss,
            valid_loss
            ))
      
      # create checkpoint variable and add important data
      checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
      }
        
        # save checkpoint
      save_ckp(checkpoint, False, checkpoint_path, best_model_path)
        
      ## TODO: save the model if validation loss has decreased
      if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
        # save checkpoint as best model
        save_ckp(checkpoint, True, checkpoint_path, best_model_path)
        valid_loss_min = valid_loss
        '''

    print('############# Epoch {}  Done   #############\n'.format(epoch))

  return model

In [None]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer)

############# Epoch 1: Training Start   #############
############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############
############# Epoch 1: Validation End     #############
############# Epoch 1  Done   #############

############# Epoch 2: Training Start   #############
############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############
############# Epoch 2: Validation End     #############
############# Epoch 2  Done   #############

############# Epoch 3: Training Start   #############
############# Epoch 3: Training End     #############
############# Epoch 3: Validation Start   #############
############# Epoch 3: Validation End     #############
############# Epoch 3  Done   #############

############# Epoch 4: Training Start   #############
############# Epoch 4: Training End     #############
############# Epoch 4: Validation Start   #############
############# Epoch 4: Validation End    

In [None]:
# testing
example = 'It is not disputed before us that the requisite ceremonies for a valid marriage under the personal law of the parties have been gone into in this case, although the marriage is null and void by reason of the provisions of sections 5 and 11 of the Hindu Marriage Act, 1955. This Act was passed with the object of amending and modifying the law relating to marriages among Hindus. Under the customary law, there was no restriction for a male Hindu to marry more than one women. This right of the Hindu husband under the customary Hindu Law was curtailed for the first time in the then Bombay Province by enacting the Bombay Prevention of Hindu Bigamous Marriage Act, 1946, provisions whereof declared bigamy to be illegal. Under the provisions of the said Act and other similar laws enacted by other Provincial Legislatures, a second marriage by a Hindu person during the life time of the spouse was declared illegal and going through such a marriage was made penal. Consistent with the object of codifying the marriage laws amongst Hindus, all these State laws were repealed by the Hindu Marriage Act, 1955. Section 5 of the Hindu Marriage Act provides for the conditions for solemnisation of marriage between any two Hindus. Section 11 declares that a marriage solemnised after the commencement of the Act shall be null and void if it contravenes any of conditions specified in Clause (i), (iv) and (v) of section 5. One of the conditions for a marriage as required by section 5 is that neither party has a spouse living at the time of the marriage, and this is condition No. (i) in section 5. Section 11 also gives a remedy to either party to the marriage to file a petition for a declaration by a decree of nullity of marriage on any one of the said three conditions of section 5 being shown to the have been contravened. Obviously, the second marriage in such circumstances being void, it cannot create a legal status of husband and wife between the parties. It is true that section 11 also gives a right to the parties to the marriage to file a petition for a declaration of nullity by a decree of the Court , but the filing of the petition or passing the decree is not a condition precedent for putting an end to the marriage. What ultimately is declared on such a petition is nothing but the status of the party as on the date of the marriage, and therefore, the marriage does not continue to remain valid until a decree is passed. What is null and void cannot be deemed to be in existence for any purposes whatsoever. Under the circumstances, if a marriage is solemnised in contravention of any of the said three conditions referred to in section 5(i), the woman cannot get the status of the wife nor the male gets the status of a husband qua her. The second marriage does not continue to be valid till the passing of the decree for a nullity. The position is also clear from the fact that bigamy is made penal by section 17 of the Hindu Marriage Act which provides that any marriage between two Hindu solemnised after the commencement of this Act is void if at the date of such marriage either party had a husband or wife living and the provisions of sections 494 and 495 of the Indian Penal Code shall apply accordingly. The position is made further clear from the anxiety of the legislature to protect children of such a marriage by providing in section 16 that not withstanding that a marriage is null and void under section 11, any child of such marriage who would have been legitimate if the marriage had been valid, shall be legitimate, whether such child is born before or after the commencement of the Marriage Laws (Amendment) Act, 1976, and whether or not a decree of nullity is granted in respect of that marriage under this Act and whether or not the marriage is held to be void otherwise than on a petition under this Act. However, the rights of such a child are somewhat curtailed in the matter of inheritance to the property, because sub-section (3) of section 16 says that a child of such a marriage would not be entitled to any rights in or to the property of any person other than the parents. Having regard to all these provisions, the marriage of the petitioner with the respondent was void ab initio and the respondent could not get the status of a legally wedded wife inspite of the solemnisation of the marriage under the Hindu Law having gone into. Indeed, Mr. Gavnekar did not dispute this legal position. He, however, contended that the provisions of section 25 of the Hindu Marriage Act conferred a right of maintenance on the second wife and the word "wife" in section 125 of the Code of Criminal Procedure will have to be given a wider meaning as including a Hindu wife whose marriage may be otherwise void.'
example=pre_process(example)
encodings = tokenizer.encode_plus(
    example,
    None,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    return_token_type_ids=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)
model.eval()
with torch.no_grad():
    input_ids = encodings['input_ids'].to(device, dtype=torch.long)
    attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
    output = model(input_ids, attention_mask, token_type_ids)
    final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
    output=np.flip(np.argsort(final_output,axis=1)[0][-5:])
    for i in output:
      print("Section "+ train_df.columns[1:].to_list()[i])

11
5
25
16
3
