## Sentiment Classification with Huggingface+BERT fine-tuning

- We'll be using the IMDB dataset, which you need to [download](https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz) and extract, set the `data_base_folder` to the extracted folder

In [2]:
import os
import re
import numpy as np 
import shutil
from sklearn.metrics import accuracy_score

import transformers
from transformers import BertTokenizer, BertModel

import torch
from torch import cuda
from tqdm import tqdm_notebook as tqdm
device = 'cuda' if cuda.is_available() else 'cpu'

device

'cuda'

In [3]:
def clean_text(text):
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    text = re.sub(r"[^a-zA-z.!?'0-9]", ' ', text)
    text = re.sub('\t', ' ',  text)
    text = re.sub(r" +", ' ', text)
    return text

def load_data(path):
    onlyfiles = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
    print('found {} files'.format(len(onlyfiles)))
    all_text = []
    for f in onlyfiles:
        with open('{}/{}'.format(path, f)) as handle:
            lines = clean_text(handle.readlines()[0])
            all_text.append(lines)
        
    return all_text

In [4]:
data_base_folder = 'data'

In [5]:

import pandas as pd
df = pd.read_csv("data/Truth_Seeker_Model_Dataset.csv")
#neg = load_data('{}/train/neg'.format(data_base_folder))
#pos = load_data('{}/train/pos'.format(data_base_folder))
#train_labels = np.array([[1,0]]*len(neg) + [[0,1]]*len(pos))
df.shape

(134198, 9)

In [6]:
df = df[~df['5_label_majority_answer'].isin(['NO MAJORITY'])]
df.shape

(111593, 9)

In [7]:
sentences = pd.DataFrame()
sentences['statement-and-tweet'] = df['target'].astype(str) + ' Statement: '  +  df['statement'] + '| Tweet: ' +df['tweet']
#sentences = df['target'].astype(str) + ' Statement: '  +  df['statement'] + '| Tweet: ' +df['tweet']
sentences

Unnamed: 0,statement-and-tweet
0,True Statement: End of eviction moratorium mea...
2,True Statement: End of eviction moratorium mea...
3,True Statement: End of eviction moratorium mea...
4,True Statement: End of eviction moratorium mea...
5,True Statement: End of eviction moratorium mea...
...,...
134192,False Statement: Joe Bidens great-grandfather ...
134193,False Statement: Joe Bidens great-grandfather ...
134194,False Statement: Joe Bidens great-grandfather ...
134195,False Statement: Joe Bidens great-grandfather ...


In [8]:
def generate_truthfulness_4way(row):
    if row['target'] == True:
        if row['5_label_majority_answer'] == 'Agree':
            return "True"
        elif row['5_label_majority_answer'] == 'Disagree':
            return "False"
        elif row['5_label_majority_answer'] == 'Mostly Agree':
            return "Mostly True"
        elif row['5_label_majority_answer'] == 'Mostly Disagree':
            return "Mostly False"
    else:
        if row['5_label_majority_answer'] == 'Agree':
            return "False"
        elif row['5_label_majority_answer'] == 'Disagree':
            return "True"
        elif row['5_label_majority_answer'] == 'Mostly Agree':
            return "Mostly False"
        elif row['5_label_majority_answer'] == 'Mostly Disagree':
            return "Mostly True"

def generate_truthfulness_2way(row):
    if row['target'] == True:
        if row['3_label_majority_answer'] == 'Agree':
            return "True"
        elif row['3_label_majority_answer'] == 'Disagree':
            return "False"
    else:
        if row['3_label_majority_answer'] == 'Agree':
            return "False"
        elif row['3_label_majority_answer'] == 'Disagree':
            return "True"
        

In [64]:
df2 = pd.DataFrame()
df2['4-way-label'] = df.apply(lambda x: generate_truthfulness_4way(x), axis=1)
df2['2-way-label'] = df.apply(lambda x: generate_truthfulness_2way(x), axis=1)
df2['2-way-label-B'] = df.apply(lambda x: generate_truthfulness_2way(x), axis=1)
df2

Unnamed: 0,4-way-label,2-way-label,2-way-label-B
0,Mostly True,True,True
2,True,True,True
3,Mostly True,True,True
4,True,True,True
5,True,True,True
...,...,...,...
134192,Mostly False,False,False
134193,Mostly False,False,False
134194,False,False,False
134195,Mostly False,False,False


In [65]:
df2['2-way-label'] = df2['2-way-label'].replace({'True': 0, 'False': 1})
df2['2-way-label-B'] = df2['2-way-label-B'].replace({'True': 1, 'False': 0})

  df2['2-way-label'] = df2['2-way-label'].replace({'True': 0, 'False': 1})
  df2['2-way-label-B'] = df2['2-way-label-B'].replace({'True': 1, 'False': 0})


In [66]:
dataset = pd.DataFrame()
dataset['statemen-and-tweet'] = sentences['statement-and-tweet']
dataset['labels1'] = df2['2-way-label']
dataset['labels2'] = df2['2-way-label-B']
dataset

Unnamed: 0,statemen-and-tweet,labels1,labels2
0,True Statement: End of eviction moratorium mea...,0,1
2,True Statement: End of eviction moratorium mea...,0,1
3,True Statement: End of eviction moratorium mea...,0,1
4,True Statement: End of eviction moratorium mea...,0,1
5,True Statement: End of eviction moratorium mea...,0,1
...,...,...,...
134192,False Statement: Joe Bidens great-grandfather ...,1,0
134193,False Statement: Joe Bidens great-grandfather ...,1,0
134194,False Statement: Joe Bidens great-grandfather ...,1,0
134195,False Statement: Joe Bidens great-grandfather ...,1,0


In [67]:
from sklearn.model_selection import train_test_split

# Split the DataFrame into 80% train and 20% test
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=32)


In [68]:
test_dataset

Unnamed: 0,statemen-and-tweet,labels1,labels2
125030,False Statement: Says President Barack Obama s...,1,0
3120,True Statement: The federal American Rescue Pl...,0,1
84384,"False Statement: If I dont take the vaccine, I...",1,0
28919,True Statement: Both the Democratic and Republ...,0,1
58762,"True Statement: ""There are more African Americ...",0,1
...,...,...,...
18158,"True Statement: ""You know what Amazon paid in ...",0,1
70583,True Statement: Americans spend more than $160...,0,1
37418,"True Statement: ""Building a wall"" on the U.S.-...",0,1
25734,True Statement: Says Warren Buffett has public...,0,1


In [81]:
np.transpose(test_dataset[['labels1','labels2']])

Unnamed: 0,125030,3120,84384,28919,58762,78220,27229,106589,128001,63247,...,115814,120898,114441,80090,12826,18158,70583,37418,25734,110611
labels1,1,0,1,0,0,1,0,1,1,0,...,1,1,1,1,0,0,0,0,0,1
labels2,0,1,0,1,1,0,1,0,0,1,...,0,0,0,0,1,1,1,1,1,0


In [82]:
test_data = np.array(test_dataset['statemen-and-tweet'])
test_labels = np.array(test_dataset[['labels1','labels2']])
test_data.shape, test_labels.shape


((22319,), (22319, 2))

In [83]:
test_labels

array([[1, 0],
       [0, 1],
       [1, 0],
       ...,
       [0, 1],
       [0, 1],
       [1, 0]])

In [37]:
# this for for when it was a single vector
#test_labels = np.transpose(np.transpose(test_labels[:, np.newaxis]))
#test_labels

array([[1],
       [0],
       [1],
       ...,
       [0],
       [0],
       [1]])

In [84]:
test_data

array(['False Statement: Says President Barack Obama spied on my campaign, and got caught!| Tweet: Trump: "Well, look, the Obama campaign spied on our campaign, &amp; they\'ve been caught, all right?" He added, "It\'s probably treason. It\'s a horrible thing they did... They used the intelligence agencies of our country to spy on my campaign, &amp; they have been caught." \nALL LIES!',
       'True Statement: The federal American Rescue Plan will purchase more food from farmers for distribution."| Tweet: You want to see real systemic racism? Look no further than Joe Bidens American rescue plan. The plan pays up to 120% of Black, Hispanic, Asian or Native American farmers\' outstanding debt. But if youre a white farmer, you get no benefits.',
       'False Statement: If I dont take the vaccine, Im at risk for covid. If I do take the vaccine, Im still at risk for covid PLUS Im at risk for permanent vaccine side effects. Therefore Im reducing my risk by not taking the vaccine. Thats the r

In [39]:
train_data = np.array(train_dataset['statemen-and-tweet'])
train_labels = np.array(train_dataset[['labels1','labels2']])
train_data.shape, train_labels.shape


((89274,), (89274,))

In [40]:
train_labels = np.transpose(np.transpose(train_labels[:, np.newaxis]))
train_labels

array([[0],
       [0],
       [1],
       ...,
       [0],
       [1],
       [0]])

In [41]:
train_data

array(['True Statement: "The Walton family of Walmart ... This one family owns more wealth than the bottom 40 percent of the American people."| Tweet: @glennbeck People are sick of having $1k+ a month taken out of paychecks for health care. When they\'re told the Walton family has more wealth than 50% of Americans combined, the message resonates. You can call them naive, etc but that\'s what Dems said last time about Trump voters too',
       'True Statement: "Todays marijuana is 300 percent to 800 percent more potent than the pot of yesteryear."| Tweet: Always Stay away from using some Gateway drugs such as Nicotine(Socially acceptable but dangerous),Alcohol and Marijuana becz it opens the door to harder and more potent drugs.',
       'False Statement: The southern U.S. border is now open to anyone from anywhere in the world who wishes to enter our country.| Tweet: @Robert58742051 @brguest20 Selfish? You mean like pulling troops out of Afghanistan when trouble is starting and then go

In [5]:
#test_neg = load_data('{}/test/neg'.format(data_base_folder))
#test_pos = load_data('{}/test/pos'.format(data_base_folder))
#test_data = np.array(test_neg+test_pos)
#test_labels = np.array([[1,0]]*len(test_neg) + [[0,1]]*len(test_pos))
#test_data.shape, test_labels.shape

found 12500 files
found 12500 files


((25000,), (25000, 2))

### Torch Datasets

- takes in inputs and outputs/labels
- interfaces with tokenizer
- handles batching

In [42]:
class MultiLabelDataset(torch.utils.data.Dataset):

    def __init__(self, text, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

### Bert Class

- first "layer" is a pre-trained BERT model
- you can add whatever layers you want after that

In [43]:
class BERTClass(torch.nn.Module):
    def __init__(self, NUM_OUT):
        super(BERTClass, self).__init__()
                   
        self.l1 = BertModel.from_pretrained("bert-base-uncased")
#         self.pre_classifier = torch.nn.Linear(768, 256)
        self.classifier = torch.nn.Linear(768, NUM_OUT)
#         self.dropout = torch.nn.Dropout(0.5)
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
#         pooler = self.pre_classifier(pooler)
#         pooler = torch.nn.Tanh()(pooler)
#         pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        output = self.softmax(output)
        return output

### Helpful Functions

Loss

- This task is binary, so it uses binary crossentropy loss
- Tasks with more labels will use categorical crossentropy
- Tasks that don't have labels, but rather have distributions should use KL divergence
- Tasks that don't have distributions should use something like RMSE loss

Train

- Steps through the data batch by batch
- grabs ids, masks, and token_type_ids which are required inputs for BERT
- inputs are passed through the model, compared to targets, computes loss function, backprops

Validation

- Takes a model, passes inputs
- Need to use the targets from here because they are potentially shuffled!

In [44]:
def loss_fn(outputs, targets):
    return torch.nn.BCELoss()(outputs, targets)

def train(model, training_loader, optimizer):
    model.train()
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss
    
def validation(model, testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for data in tqdm(testing_loader):
            targets = data['targets']
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            outputs = torch.sigmoid(outputs).cpu().detach()
            fin_outputs.extend(outputs)
            fin_targets.extend(targets)
    return torch.stack(fin_outputs), torch.stack(fin_targets)

### The Tokenizer

- Converts a raw string to the ids, masks, and token_type_ids

In [45]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# what does the tokenizer do?
print(train_data[5])

tokenizer.encode_plus(
            train_data[5],
            None,
            add_special_tokens=True,
            max_length=128,
            pad_to_max_length=True,
            truncation=True,
            return_token_type_ids=True
        )

False Statement: Says John F. Kennedy said, "If a Supreme Court Justice died one day before the election, it would be more Constitutional to indefinitely postpone the election than postpone the confirmation a single day."| Tweet: Wow...Justice Kennedy, who could be the deciding vote in the gay marriage Supreme Court vote said to remember the children...




{'input_ids': [101, 6270, 4861, 1024, 2758, 2198, 1042, 1012, 5817, 2056, 1010, 1000, 2065, 1037, 4259, 2457, 3425, 2351, 2028, 2154, 2077, 1996, 2602, 1010, 2009, 2052, 2022, 2062, 6543, 2000, 20733, 2695, 29513, 1996, 2602, 2084, 2695, 29513, 1996, 13964, 1037, 2309, 2154, 1012, 1000, 1064, 1056, 28394, 2102, 1024, 10166, 1012, 1012, 1012, 3425, 5817, 1010, 2040, 2071, 2022, 1996, 10561, 3789, 1999, 1996, 5637, 3510, 4259, 2457, 3789, 2056, 2000, 3342, 1996, 2336, 1012, 1012, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

### Training setup

- hyperparameters
- setup dataset
- setup parameters
- setup dataloader

In [52]:
MAX_LEN = 410
BATCH_SIZE = 32
EPOCHS = 3
NUM_OUT = 2 # binary task
LEARNING_RATE = 2e-05

training_data = MultiLabelDataset(train_data, torch.from_numpy(train_labels), tokenizer, MAX_LEN)
test_data = MultiLabelDataset(test_data, torch.from_numpy(test_labels), tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }    

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(test_data, **test_params)

### Train,  Evaluate

- model.to -> send to GPU, if available (anything computed should be put onto the GPU)
- setup optimizer - could use Stochastic Gradient Descent, but ADAM tends to work better
- for each epoch, train, show the loss, evaluate on the test data

In [53]:
model = BERTClass(NUM_OUT)
model.to(device)    

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')  
    guess, targs = validation(model, testing_loader)
    guesses = torch.max(guess, dim=1)
    targets = torch.max(targs, dim=1)
    print('arracy on test set {}'.format(accuracy_score(guesses.indices, targets.indices)))
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')  
    guess, targs = validation(model, testing_loader)
    guesses = torch.max(guess, dim=1)
    targets = torch.max(targs, dim=1)
    print('arracy on test set {}'.format(accuracy_score(guesses.indices, targets.indices)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/2790 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


ValueError: Using a target size (torch.Size([32, 1])) that is different to the input size (torch.Size([32, 2])) is deprecated. Please ensure they have the same size.

In [51]:
guesses

torch.return_types.max(
values=tensor([0.7311, 0.7311, 0.7311,  ..., 0.7311, 0.7311, 0.7311]),
indices=tensor([0, 0, 0,  ..., 0, 0, 0]))

In [50]:
targets.indices

tensor([0, 0, 0,  ..., 0, 0, 0])