<a href="https://colab.research.google.com/github/Crliu4/supreme_court_verdict_predictor/blob/main/BERT_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the transformers library; the commented-out commands below install the
# additional PyTorch/XLA packages that are only needed when running on a TPU.

!pip install -q transformers

# Code for TPU packages install
# !curl -q https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [None]:
# Importing libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

# Preparing for TPU usage
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Data Preparation

In [None]:
# Mount Google Drive (Colab) so the dataset can be read from it.
from google.colab import drive 
drive.mount('/content/gdrive')
# PATH is the folder that contains sc_conv_level.csv. Each collaborator keeps the
# file in a different place; only one PATH assignment should be active at a time.
# PATH = "gdrive/Shared with me/CAPP 30255 Final Project/"

# Carolyn's path
#PATH = "gdrive/MyDrive/"

# Eujene's path
#PATH = "gdrive/MyDrive/CAPP 30255 Final Project/"
# Maggie's path (changed PATH; can be deleted once everyone works from the same folder)
PATH = "gdrive/MyDrive/Colab Notebooks/datasets/"

In [None]:
# Read the original data and keep only the columns we need.
raw_df = pd.read_csv(PATH + 'sc_conv_level.csv')

# Collapse the data to one row per conversation:
#   * select the needed columns directly (no in-place drop of "Unnamed: 0", which
#     raised KeyError when the CSV had no such column and was discarded anyway),
#   * join all `finally_cleaned` strings that share a conversation_id with commas,
#   * drop the now-duplicated rows and renumber the index without leaking the old
#     index into an extra `index` column.
df = (
    raw_df[['conversation_id', 'finally_cleaned', 'meta.win_side']]
    .assign(
        text=lambda frame: frame.groupby('conversation_id')['finally_cleaned']
        .transform(lambda parts: ','.join(parts))
    )
    .loc[:, ['conversation_id', 'text', 'meta.win_side']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Build `win_side_lst`, the label vector the model is trained against: one dummy
# (one-hot) value per distinct `win_side`, in the column order produced by
# pd.get_dummies. The `win_side_1` dummy is then kept as the scalar `win_side` label.
labelled = df[['conversation_id', 'text', 'meta.win_side']].rename(
    columns={"meta.win_side": "win_side"}
)
df1 = pd.get_dummies(labelled, columns=['win_side'])
# Columns 0-1 are conversation_id and text; everything after them is a dummy column.
df1['win_side_lst'] = df1[df1.columns[2:]].values.tolist()
df1 = df1.drop(columns=['win_side_0']).rename(columns={"win_side_1": "win_side"})

In [None]:
# Split every conversation into 10 roughly equal word chunks so that single rows
# are short enough for BERT (which handles at most 512 tokens per input).
# NOTE: a fixed number of chunks does not by itself bound the chunk length.
N_CHUNKS = 10


def _to_word_array(text):
    """Split a text on single spaces and return the words as a numpy array."""
    return np.array(text.split(' '))


def _to_chunks(words):
    """Cut a word array into N_CHUNKS consecutive, nearly equal-sized pieces."""
    return np.array_split(words, N_CHUNKS)


df1['text_split'] = df1['text'].apply(_to_word_array)
df1['text_splits'] = df1['text_split'].apply(_to_chunks)
# One row per chunk; the other columns are repeated for every chunk of a conversation.
df2 = df1.explode('text_splits')
df2.shape

In [None]:
# Sanity check: number of words in each chunk, summarised with describe().
df2['length'] = df2['text_splits'].apply(len)
df2['length'].describe()

In [None]:
# Turn every chunk back into a space-separated string, keep the modelling columns,
# and count the labels to see whether oversampling is needed after splitting.
df2['text'] = df2['text_splits'].apply(lambda words: ' '.join(words))
df3 = df2.loc[:, ['conversation_id', 'text', 'win_side_lst', 'win_side']]
df3['win_side'].value_counts()

## Defining and Building the model

In [None]:
# Configuration

# Key variables used later on by the datasets, data loaders and optimizer.
MAX_LEN = 512  # maximum length handed to the tokenizer for each example (previously 200)
TRAIN_BATCH_SIZE = 8  # batch size of the training DataLoader
VALID_BATCH_SIZE = 4  # batch size of the test DataLoader
EPOCHS = 3  # number of training epochs (previously 1)
LEARNING_RATE = 1e-05  # learning rate passed to the optimizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # idea not adopted yet: return_overflowing_tokens=True
# The Hugging Face tokenizer can return overflowing tokens instead of discarding them.

In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one text row per item.

    Args:
        dataframe: frame with a ``text`` column and a ``win_side_lst`` column that
            holds the label vector of each row.
        tokenizer: tokenizer exposing ``encode_plus`` (a ``BertTokenizer`` here).
        max_len: length every example is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.target = self.data.win_side_lst
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize row ``index`` (a position, not an index label).

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors of
            length ``max_len``) and ``target`` (float tensor of the label vector).
        """
        # Positional access: a DataLoader asks for positions 0..len-1, which only
        # coincide with index labels when the frame has a fresh RangeIndex.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `pad_to_max_length` is deprecated; pad explicitly and switch
            # truncation on so every example has exactly `max_len` tokens.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'target': torch.tensor(self.target.iloc[index], dtype=torch.float),
        }

In [None]:
# Split by conversation (not by chunk) so that all chunks of one conversation end
# up on the same side of the train/test split.
train_size = 0.8

conv_id = df3['conversation_id'].unique()
conv_df = pd.DataFrame(conv_id)  # single column, named 0
train_convos = conv_df.sample(frac=train_size, random_state=200)
test_convos = conv_df.drop(train_convos.index).reset_index(drop=True)


def _chunks_of(convos, columns):
    """Return the given df3 columns for every chunk of the listed conversations."""
    merged = convos.merge(df3, left_on=0, right_on='conversation_id')
    return merged[columns]


train_dataset = _chunks_of(train_convos, ['conversation_id', 'text', 'win_side_lst', 'win_side'])
test_dataset = _chunks_of(test_convos, ['conversation_id', 'text', 'win_side_lst'])

### Oversampling 

In [None]:
# Random oversampler used to rebalance the training labels; the seed is fixed so
# the resampling is repeatable.
from imblearn.over_sampling import RandomOverSampler, SMOTE
ros = RandomOverSampler(random_state=42)

In [None]:
# Oversample the training chunks with `ros`. fit_resample expects a 2-D feature
# matrix, so the texts are passed as a single-column array.
train_texts = np.array(train_dataset['text']).reshape(-1, 1)
X_train_ros, y_train_ros = ros.fit_resample(train_texts, train_dataset['win_side'])

# Re-assemble a frame with the resampled texts (column 0 -> "text") and labels.
resampled = pd.Series(X_train_ros.flatten()).to_frame().join(y_train_ros)
df_train = pd.DataFrame(resampled).rename(columns={0: "text"})

# Rebuild the label vector: every column after `text` is a win_side dummy.
df_train = pd.get_dummies(df_train, columns=['win_side'])
df_train['win_side_lst'] = df_train[df_train.columns[1:]].values.tolist()

In [None]:
# Get the train and test datasets ready: the oversampled frame becomes the training
# data, and both frames are wrapped in CustomDataset for tokenization.
train_dataset = df_train.loc[:, ['text', 'win_side_lst']]
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set, testing_set = (
    CustomDataset(frame, tokenizer, MAX_LEN)
    for frame in (train_dataset, test_dataset)
)

In [None]:
# Define the DataLoader parameters and build the training and test loaders.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,   # reshuffle the training chunks every epoch
                'num_workers': 0
                }

# Evaluation does not benefit from shuffling; keeping the original order makes the
# predictions deterministic and aligned row-for-row with `test_dataset`.
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

### Define our own BERT Class

In [None]:
# Customized model: a pretrained BERT encoder with a dropout and a dense layer on
# top that produce the final output of the model.

class BERTClass(torch.nn.Module):
    """'bert-base-uncased' encoder followed by dropout and a 768 -> 2 linear layer."""

    def __init__(self):
        super().__init__()
        dropout_rate = 0.5
        # return_dict=False makes the encoder return a plain tuple.
        self.l1 = transformers.BertModel.from_pretrained(
            'bert-base-uncased',
            return_dict=False,
            hidden_dropout_prob=dropout_rate,
            attention_probs_dropout_prob=dropout_rate,
            classifier_dropout=dropout_rate,
        )
        self.l2 = torch.nn.Dropout(dropout_rate)
        self.l3 = torch.nn.Linear(768, 2)

    def forward(self, ids, mask, token_type_ids):
        """Return the two raw (pre-sigmoid) scores for every example in the batch."""
        # Only the second element of the encoder's output pair is used.
        _, encoded = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        regularised = self.l2(encoded)
        return self.l3(regularised)

In [None]:
# BCEWithLogitsLoss applies the sigmoid and the binary cross-entropy in one step,
# which is numerically more stable than applying the two separately.
def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy between raw model scores and targets."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [None]:
# Instantiate the classifier and move its parameters to the selected device.
# `model.to(device)` is the last expression, so the cell also displays the model.
model = BERTClass()
model.to(device)

In [None]:
# Adam optimizer over all model parameters, using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

## Training

In [None]:
def train(epoch):
    """Run one pass over `training_loader`, updating the global `model`.

    Every 1000th batch the loss of that batch is printed; the model weights are
    written to ./state_dict.pt when that loss is lower than every loss logged
    earlier in the same call (the best-loss tracking restarts on each call).

    Args:
        epoch: epoch number, used only in the progress message.
    """
    curr_loss = float('inf')
    model.train()

    model_path = "./state_dict.pt"
    for i, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        if i % 1000 == 0:
            batch_loss = loss.item()
            print(f'Epoch: {epoch}, Loss:  {batch_loss}')
            if batch_loss < curr_loss:
                # Remember the new best loss; without this update the comparison
                # is always against infinity and every logged batch overwrites
                # the checkpoint, even when the loss got worse.
                curr_loss = batch_loss
                torch.save(model.state_dict(), model_path)

        # Clear the gradients once, right before the backward pass.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Train for the number of epochs set in the config cell instead of a hard-coded count.
for epoch in range(EPOCHS):
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.6506133079528809
Epoch: 0, Loss:  0.5929352641105652
Epoch: 1, Loss:  0.5845842957496643
Epoch: 1, Loss:  0.5522140264511108


## Validation

In [None]:
# Restore the weights saved during training. map_location remaps the stored tensors
# onto the current device, so a checkpoint written on a GPU runtime can still be
# loaded when only a CPU is available.
model_path = "./state_dict.pt"
model.load_state_dict(torch.load(model_path, map_location=device))

In [None]:
def validation():
    """Score every batch of `testing_loader` with the global `model`.

    Returns:
        (outputs, targets): two lists with one entry per example, holding the
        sigmoid of the model scores and the target vector, both as plain lists.
    """
    model.eval()
    all_probs, all_targets = [], []
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in testing_loader:
            batch_ids = batch['ids'].to(device, dtype=torch.long)
            batch_mask = batch['mask'].to(device, dtype=torch.long)
            batch_types = batch['token_type_ids'].to(device, dtype=torch.long)
            batch_targets = batch['target'].to(device, dtype=torch.float)

            logits = model(batch_ids, batch_mask, batch_types)

            all_targets += batch_targets.cpu().detach().numpy().tolist()
            all_probs += torch.sigmoid(logits).cpu().detach().numpy().tolist()
    return all_probs, all_targets

In [None]:
# Score the test set, then turn each sigmoid probability into a boolean prediction
# using a 0.5 threshold.
outputs, targets = validation()
outputs = np.greater_equal(np.array(outputs), 0.5)

In [None]:
# Count the test rows whose whole predicted vector equals the target vector.
counter = sum(list(pred) == target for pred, target in zip(outputs, targets))

In [None]:
# Accuracy: share of test rows whose thresholded prediction matches the target exactly.
counter / len(outputs)

0.6889460154241646