In [None]:
!pip install transformers
!pip3 -q install emoji

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 6.7 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 70.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 75.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 59.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

In [None]:
# Loads the bare BERTweet encoder (no classification head).
# NOTE(review): nothing else in the visible notebook reads `bertweet`; the classifier
# that is actually trained is built separately by prepare_model().
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

Downloading:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/517M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Non-fast BERTweet tokenizer (use_fast=False), created without the `normalization` flag.
# NOTE(review): prepare_dataloaders() builds its own tokenizer with normalization=True,
# so this instance is not the one used to encode the training data.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

Downloading:   0%|          | 0.00/824k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from transformers import (get_linear_schedule_with_warmup,AdamW,AutoModel, AutoTokenizer,
                            AutoModelForSequenceClassification)
from torch.utils.data import (TensorDataset,DataLoader,
                             RandomSampler, SequentialSampler, Dataset)

def prepare_model(model_class="vinai/bertweet-base",num_classes=2,model_to_load=None,total_steps=-1):
    """Build a sequence classifier together with its optimizer and LR schedule.

    Args:
        model_class: Hugging Face model name or path given to ``from_pretrained``.
        num_classes: Number of labels of the classification head.
        model_to_load: Optional path to a ``torch.save``-d state dict for the
            encoder (``model.roberta``). Loading is best-effort: if it fails the
            error is printed and the pretrained weights are kept.
        total_steps: Number of optimizer steps the linear schedule decays over
            (forwarded as ``num_training_steps``).

    Returns:
        Tuple ``(model, optimizer, scheduler)``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_class,
        num_labels=num_classes,
        output_attentions=False,
        output_hidden_states=False,
    )

    optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    if model_to_load is not None:
        try:
            # NOTE(security): torch.load unpickles the file; only load checkpoints you trust.
            # map_location keeps GPU-saved checkpoints loadable on CPU-only machines.
            encoder_state = torch.load(model_to_load, map_location="cpu")
            model.roberta.load_state_dict(encoder_state)
            print("LOADED MODEL")
        except Exception as err:
            # Still best-effort, but never silent: a swallowed failure here would leave
            # the caller training from the pretrained weights without knowing it.
            print('Could not load encoder weights from "{}": {!r}'.format(model_to_load, err))
    return model, optimizer, scheduler

In [None]:
# Load data

import ast
import pandas as pd

def load_data_lists(path):
    """Read a UTF-8 text file that holds one Python literal per line.

    Each line is parsed with ``ast.literal_eval``. Lines that are not valid
    literals (including blank lines) are skipped. The number of lines found and
    the number successfully parsed are printed.

    Args:
        path: Path of the text file to read.

    Returns:
        List of the parsed values, in file order.
    """
    data_points_lists = []
    num_lines = 0
    with open(path, encoding='utf-8') as f:
        # Stream the file instead of materializing every line up front.
        for line in f:
            num_lines += 1
            try:
                data_points_lists.append(ast.literal_eval(line))
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # Only the errors literal_eval raises for malformed input are ignored;
                # anything else (e.g. KeyboardInterrupt) now propagates.
                continue

    print('Found {} lines in "{}".'.format(num_lines, path))
    print('Successfully loaded {} data points from "{}".'.format(len(data_points_lists), path))

    return data_points_lists

COLUMN_NAMES = ['ID', 'Text', 'Sarcastic']

def construct_df(data_points_lists, column_names=COLUMN_NAMES):
    """Turn parsed rows into a DataFrame with a numeric ID and boolean label.

    Args:
        data_points_lists: Rows, each with one value per entry of ``column_names``.
        column_names: Column labels; must include ``'ID'`` and ``'Sarcastic'``.

    Returns:
        DataFrame whose ``ID`` column went through ``pd.to_numeric`` and whose
        ``Sarcastic`` column is cast to ``bool``.
    """
    raw = pd.DataFrame(data_points_lists, columns=column_names)
    typed = raw.assign(
        ID=pd.to_numeric(raw['ID']),
        Sarcastic=raw['Sarcastic'].astype('bool'),
    )
    return typed

In [None]:
# Parse train.txt into a typed DataFrame with the default columns (ID, Text, Sarcastic).
train_df = construct_df(load_data_lists('train.txt'))

# Print dtypes / non-null counts, then show the first rows as the cell output.
train_df.info()
train_df.head()

Found 29040 lines in "train.txt".
Successfully loaded 29040 data points from "train.txt".
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29040 entries, 0 to 29039
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         29040 non-null  int64 
 1   Text       29040 non-null  object
 2   Sarcastic  29040 non-null  bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 482.2+ KB


Unnamed: 0,ID,Text,Sarcastic
0,910308516510011393,most # funny quotes : 21 snarky and # funny qu...,True
1,725333760762363905,spurs # creativethinking ! <url>,True
2,840006160660983809,<user> thanks for showing up for our appointme...,True
3,854334602516733952,only a hardcore fan of sir jonny sins will get...,True
4,908913372199915520,haha . # lol,True


In [None]:
# The validation file has a fourth value per row, exposed here as the extra column 'Sarc_2'.
valid_df = construct_df(load_data_lists('valid.txt'), column_names=COLUMN_NAMES + ['Sarc_2'])

# Print dtypes / non-null counts, then show the first rows as the cell output.
valid_df.info()
valid_df.head()

Found 2410 lines in "valid.txt".
Successfully loaded 2410 data points from "valid.txt".
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2410 entries, 0 to 2409
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         2410 non-null   int64 
 1   Text       2410 non-null   object
 2   Sarcastic  2410 non-null   bool  
 3   Sarc_2     2410 non-null   int64 
dtypes: bool(1), int64(2), object(1)
memory usage: 59.0+ KB


Unnamed: 0,ID,Text,Sarcastic,Sarc_2
0,915657464401580032,whew ... that extra <num> miles today to the g...,True,1
1,854678856724340736,""" oh , good . now no one will know we 're here...",True,1
2,904892917277274112,how much of it you think is true ? has this be...,True,1
3,855466461296504832,<user> finally found proof that the earth is f...,True,1
4,927373534652805120,many ways to overcome tension & fear but nothi...,True,1


In [None]:
# The test file has the same four-column layout as the validation file (extra 'Sarc_2').
test_df = construct_df(load_data_lists('test.txt'), column_names=COLUMN_NAMES + ['Sarc_2'])

# Print dtypes / non-null counts, then show the first rows as the cell output.
test_df.info()
test_df.head()

Found 2409 lines in "test.txt".
Successfully loaded 2409 data points from "test.txt".
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2409 entries, 0 to 2408
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         2409 non-null   int64 
 1   Text       2409 non-null   object
 2   Sarcastic  2409 non-null   bool  
 3   Sarc_2     2409 non-null   int64 
dtypes: bool(1), int64(2), object(1)
memory usage: 58.9+ KB


Unnamed: 0,ID,Text,Sarcastic,Sarc_2
0,862902619928506372,i am guessing # netflix no longer lets you gra...,True,1
1,892551658487631873,it 's the insensitive strikeouts at suntrust p...,True,1
2,853143461360480256,"following the path of the river calder , so .....",True,1
3,918423568823840768,# westernsahara # authority has no lessons 2ge...,True,1
4,731617467718610944,hey <user> great sale !,True,1


In [None]:
# Shuffle
from sklearn.utils import shuffle

# Reorder every split with a fixed seed so reruns see the same row order.
# All later encodings and label lookups are taken from these shuffled frames.
train_df = shuffle(train_df, random_state=42)
valid_df = shuffle(valid_df, random_state=42)
test_df = shuffle(test_df, random_state=42)

In [None]:
# NOTE(review): the schedule is sized for 10 epochs, but the visible train() calls
# run 2 + 2 epochs — confirm the intended horizon.
epochs = 10
# Must match the batch_size used by prepare_dataloaders (its default is 64).
batch_size = 64
# The scheduler is stepped once per batch, not once per sample, so the decay horizon is
# batches-per-epoch * epochs. Ceil division counts the final partial batch.
steps_per_epoch = (len(train_df) + batch_size - 1) // batch_size
total_steps = steps_per_epoch * epochs

model, optimizer, scheduler = prepare_model("vinai/bertweet-base" ,num_classes=2, model_to_load=None, total_steps = total_steps)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [None]:
def bert_encode(df, tokenizer, max_length=128):
    """Tokenize the ``Text`` column of ``df`` into fixed-length tensors.

    Args:
        df: DataFrame with a ``'Text'`` column holding one string per row.
        tokenizer: Hugging Face tokenizer; called once per text and expected to
            return ``'input_ids'`` and ``'attention_mask'`` tensors of shape
            ``(1, max_length)``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Dict with keys ``'input_word_ids'`` and ``'input_mask'`` (in that
        order), each a tensor with one row per row of ``df``.
    """
    texts = df['Text'].tolist()
    if not texts:
        # torch.cat rejects an empty list; return well-formed empty tensors instead.
        return {
            'input_word_ids': torch.empty((0, max_length), dtype=torch.long),
            'input_mask': torch.empty((0, max_length), dtype=torch.long),
        }

    input_ids = []
    attention_masks = []
    for sent in texts:
        # padding='max_length' replaces the deprecated pad_to_max_length=True flag.
        encoded_dict = tokenizer(
            sent,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Callers unpack this dict positionally via .values(), so key order matters.
    inputs = {
        'input_word_ids': torch.cat(input_ids, dim=0),
        'input_mask': torch.cat(attention_masks, dim=0),
    }
    return inputs

In [None]:
from torch.utils.data import (TensorDataset,DataLoader,
                             RandomSampler, SequentialSampler, Dataset)

def prepare_dataloaders(train_df,test_df,batch_size=64):
    """Encode two frames with BERTweet's tokenizer and wrap them in DataLoaders.

    Args:
        train_df: Frame with ``'Text'`` and ``'Sarcastic'`` columns.
        test_df: Frame with a ``'Text'`` column; its labels are not used here.
        batch_size: Batch size of both loaders.

    Returns:
        ``(train_dataloader, test_dataloader)``. The train loader samples
        randomly and yields ``(input_ids, attention_mask, labels)``; the test
        loader iterates in order and yields ``(input_ids, attention_mask)``.
    """
    # normalization=True makes the tokenizer normalize raw Tweets before encoding.
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)

    train_encoding = bert_encode(train_df, tokenizer)
    train_labels = torch.tensor(train_df['Sarcastic'].astype(int).values)
    test_encoding = bert_encode(test_df, tokenizer)

    train_dataset = TensorDataset(
        train_encoding['input_word_ids'],
        train_encoding['input_mask'],
        train_labels,
    )
    test_dataset = TensorDataset(
        test_encoding['input_word_ids'],
        test_encoding['input_mask'],
    )

    # Shuffle only the training data; evaluation order has to stay aligned with the frame.
    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
    test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)
    return train_dataloader, test_dataloader

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [None]:
def predict(model,test_dataloader):
    """Run ``model`` over every batch of a loader and collect the logits.

    The model is put in eval mode and moved to CUDA when available, else CPU.

    Args:
        model: Callable as ``model(input_ids, token_type_ids=None,
            attention_mask=...)`` whose output exposes ``.logits``.
        test_dataloader: Iterable of batches whose first two items are the
            input ids and the attention mask.

    Returns:
        NumPy array with one row of logits per example, in loader order.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.eval()
    model.to(device)

    rows = []
    for batch in test_dataloader:
        ids, mask = batch[0].to(device), batch[1].to(device)
        with torch.no_grad():
            batch_logits = model(ids, token_type_ids=None, attention_mask=mask).logits
        # Iterating the 2-D array adds one logit row per example.
        rows.extend(batch_logits.detach().cpu().numpy())

    return np.array(rows)

In [None]:
from sklearn.metrics import accuracy_score

val_loss_fn = torch.nn.CrossEntropyLoss()

def train(model,optimizer,scheduler,train_dataloader, epochs, val_dataloader, val_labels=None):
    """Fine-tune ``model`` and report validation loss/accuracy after every epoch.

    Relies on ``time``, ``format_time``, ``predict`` and ``accuracy_score``
    being defined at notebook level before the first call.

    Args:
        model: Classifier called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)`` whose output exposes ``.loss``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per training batch.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        epochs: Number of passes over ``train_dataloader``.
        val_dataloader: Loader of ``(input_ids, attention_mask)`` batches scored
            with ``predict``; must iterate in the same order as ``val_labels``.
        val_labels: Labels aligned row-for-row with ``val_dataloader``. Defaults
            to the notebook-level ``valid_df['Sarcastic']``, which is what this
            function read unconditionally before the parameter existed.

    Returns:
        None. Progress and metrics are printed.
    """
    if val_labels is None:
        # Backward-compatible default: fall back to the notebook's global frame.
        val_labels = valid_df['Sarcastic']
    val_targets = torch.as_tensor([int(label) for label in val_labels], dtype=torch.long)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('Using {} as device.'.format(device))
    model.to(device)
    total_t0 = time.time()

    for epoch_i in range(epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_train_loss = 0
        # predict() leaves the model in eval mode, so training mode is re-enabled every epoch.
        model.train()
        for step, batch in enumerate(train_dataloader):
            if step % 40 == 0 and step != 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()
            # Clip the gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        # Validation: predict() runs under no_grad and returns logits on the CPU.
        val_logits = torch.Tensor(predict(model, val_dataloader))
        val_loss = val_loss_fn(val_logits, val_targets).item()
        val_acc = accuracy_score(val_targets.numpy(), val_logits.argmax(dim=1).numpy())
        print('Valid loss: {}, Valid accuracy: {}'.format(val_loss, val_acc))
        # Clear gradients left over from the last training step of the epoch.
        optimizer.zero_grad()

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# Shuffled training loader plus the in-order test loader.
train_dataloader,test_dataloader = prepare_dataloaders(train_df, test_df)
# Called again only to get the validation loader; the training loader this call
# builds (and the re-encoding of train_df behind it) is discarded.
_,val_dataloader = prepare_dataloaders(train_df, valid_df)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
import os
import random
import time
import datetime
import torch
import argparse
import numpy as np
import pandas as pd
from torch.nn import functional as F
from transformers import (get_linear_schedule_with_warmup,AdamW,AutoModel, AutoTokenizer,
                            AutoModelForSequenceClassification)
from torch.utils.data import (TensorDataset,DataLoader,
                             RandomSampler, SequentialSampler, Dataset)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of class indices; flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = predicted == expected
    return np.sum(hits) / len(expected)

def format_time(elapsed):
    """Render a duration in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second and formatted via
    ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

In [None]:
# Colab-only: mount Google Drive at /content/drive so checkpoints can be written there.
# force_remount=True remounts even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import time

# First training run: 2 epochs, with validation metrics printed after each epoch.
train(model,optimizer,scheduler,train_dataloader, 2, val_dataloader)

Using cuda as device.

Training...
  Batch    40  of    454.    Elapsed: 0:00:29.
  Batch    80  of    454.    Elapsed: 0:00:58.
  Batch   120  of    454.    Elapsed: 0:01:27.
  Batch   160  of    454.    Elapsed: 0:01:56.
  Batch   200  of    454.    Elapsed: 0:02:25.
  Batch   240  of    454.    Elapsed: 0:02:54.
  Batch   280  of    454.    Elapsed: 0:03:23.
  Batch   320  of    454.    Elapsed: 0:03:52.
  Batch   360  of    454.    Elapsed: 0:04:21.
  Batch   400  of    454.    Elapsed: 0:04:50.
  Batch   440  of    454.    Elapsed: 0:05:19.

  Average training loss: 0.27
  Training epoch took: 0:05:29
Valid loss: 0.2997133135795593, Valid accuracy: 0.8813278008298755

Training...
  Batch    40  of    454.    Elapsed: 0:00:29.
  Batch    80  of    454.    Elapsed: 0:00:58.
  Batch   120  of    454.    Elapsed: 0:01:27.
  Batch   160  of    454.    Elapsed: 0:01:56.
  Batch   200  of    454.    Elapsed: 0:02:25.
  Batch   240  of    454.    Elapsed: 0:02:54.
  Batch   280  of    454

In [None]:
# Saves the whole model object (not a state_dict) to Google Drive.
# NOTE(review): prepare_model(model_to_load=...) expects a state dict for model.roberta,
# so this file cannot be passed to it directly.
torch.save(model, '/content/drive/MyDrive/CSE8803_DLT/berttweet_2epoch.pt')

In [None]:
# Second training run: 2 more epochs on the same model, optimizer and scheduler objects,
# so it continues from where the previous run stopped.
train(model,optimizer,scheduler,train_dataloader, 2, val_dataloader)

Using cuda as device.

Training...
  Batch    40  of    454.    Elapsed: 0:00:29.
  Batch    80  of    454.    Elapsed: 0:00:58.
  Batch   120  of    454.    Elapsed: 0:01:27.
  Batch   160  of    454.    Elapsed: 0:01:56.
  Batch   200  of    454.    Elapsed: 0:02:25.
  Batch   240  of    454.    Elapsed: 0:02:54.
  Batch   280  of    454.    Elapsed: 0:03:23.
  Batch   320  of    454.    Elapsed: 0:03:52.
  Batch   360  of    454.    Elapsed: 0:04:21.
  Batch   400  of    454.    Elapsed: 0:04:50.
  Batch   440  of    454.    Elapsed: 0:05:19.

  Average training loss: 0.08
  Training epoch took: 0:05:29
Valid loss: 0.30426856875419617, Valid accuracy: 0.9

Training...
  Batch    40  of    454.    Elapsed: 0:00:29.
  Batch    80  of    454.    Elapsed: 0:00:58.
  Batch   120  of    454.    Elapsed: 0:01:27.
  Batch   160  of    454.    Elapsed: 0:01:56.
  Batch   200  of    454.    Elapsed: 0:02:25.
  Batch   240  of    454.    Elapsed: 0:02:54.
  Batch   280  of    454.    Elapsed: 

In [None]:
# Logits for the test set, one row per example, in test_dataloader order.
result = predict(model,test_dataloader)

In [None]:
from scipy.special import softmax

# Predicted class = index of the largest logit in each row.
pred_labels = np.argmax(result, axis = 1)
# Softmax probability of class 1 for each row.
pred_scores = softmax(result, axis=1)[:, 1]

In [None]:
# Number of test examples predicted per class. Series.value_counts replaces the
# deprecated top-level pd.value_counts.
pd.Series(pred_labels).value_counts()

0    1303
1    1106
dtype: int64

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, accuracy_score, recall_score

# Test-set metrics against the 'Sarcastic' column, printed in the order:
# accuracy, F1, precision, recall.
print(accuracy_score(test_df['Sarcastic'],pred_labels))
print(f1_score(test_df['Sarcastic'],pred_labels))
print(precision_score(test_df['Sarcastic'],pred_labels))
print(recall_score(test_df['Sarcastic'],pred_labels))

0.8933167289331673
0.8782567503552818
0.8381555153707052
0.9223880597014925


In [None]:
# Same predictions scored against the alternative label column 'Sarc_2':
# accuracy, F1, precision, recall.
print(accuracy_score(test_df['Sarc_2'],pred_labels))
print(f1_score(test_df['Sarc_2'],pred_labels))
print(precision_score(test_df['Sarc_2'],pred_labels))
print(recall_score(test_df['Sarc_2'],pred_labels))

0.8484848484848485
0.8232445520581114
0.7685352622061483
0.886339937434828


In [None]:
# Builds an in-order loader over valid_df; the training loader from this call is discarded.
# NOTE(review): val_dataloader, created earlier with the same arguments, looks equivalent.
_, valid_loader = prepare_dataloaders(train_df, valid_df)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Score the validation set with the trained `model`. The original cell referenced `m`,
# a name that no visible cell defines, so it failed on a fresh kernel.
# NOTE(review): if `m` was meant to be a separately loaded checkpoint, load it explicitly
# here — the metrics shown below were produced with whatever `m` was at the time.
valid_results = predict(model,valid_loader)

v_pred_labels = np.argmax(valid_results, axis = 1)
v_pred_scores = softmax(valid_results, axis=1)[:, 1]

In [None]:
# Validation-set metrics against the 'Sarcastic' column: accuracy, F1, precision, recall.
print(accuracy_score(valid_df['Sarcastic'],v_pred_labels))
print(f1_score(valid_df['Sarcastic'],v_pred_labels))
print(precision_score(valid_df['Sarcastic'],v_pred_labels))
print(recall_score(valid_df['Sarcastic'],v_pred_labels))

0.8742738589211618
0.8503703703703703
0.84
0.861


In [None]:
def predict(model,test_dataloader):
    """Run ``model`` over every batch of a loader and collect the logits.

    This cell redefines ``predict`` and therefore shadows the earlier definition
    once executed. It returns a NumPy array, matching that earlier definition,
    so callers behave the same whichever cell ran last.

    Args:
        model: Callable as ``model(input_ids, token_type_ids=None,
            attention_mask=...)`` whose output exposes ``.logits``. It is put in
            eval mode and moved to CUDA when available, else CPU.
        test_dataloader: Iterable of batches whose first two items are the
            input ids and the attention mask.

    Returns:
        NumPy array with one row of logits per example, in loader order.
    """
    model.eval()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    preds = []

    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
            logits = outputs.logits

        # One entry per example rather than one per batch.
        preds.extend(logits.detach().cpu().numpy())

    return np.array(preds)

In [None]:
# Rebinds the notebook-level `tokenizer` to one created with normalization=True
# and encodes the test frame with it for manual inspection below.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)
tweet_test = bert_encode(test_df, tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Show the text of the row currently in first position of the shuffled test frame.
test_df.iloc[0]['Text']

'i am guessing # netflix no longer lets you grab screens of movies . that & the new rating system is so awesome . '

In [None]:
# Display the encoded test set: padded token ids and the matching attention masks.
tweet_test

{'input_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'input_word_ids': tensor([[    0,    37,   155,  ...,     1,     1,     1],
         [    0,    18,    69,  ...,     1,     1,     1],
         [    0,   676,     6,  ...,     1,     1,     1],
         ...,
         [    0,   128,    38,  ...,     1,     1,     1],
         [    0, 20871,  9478,  ...,     1,     1,     1],
         [    0,    26,    18,  ...,     1,     1,     1]])}

In [None]:
 # Inspect the raw logits (and keep the attention maps) for the first test batch only.
 # Uses the trained `model`: the original cell called `m`, which no visible cell defines,
 # and hard-coded 'cuda', which fails on CPU-only runtimes.
 # NOTE(review): if `m` was a separately loaded checkpoint, load it explicitly instead.
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 model.to(device)
 model.eval()
 for batch in test_dataloader:
     b_input_ids = batch[0].to(device)
     b_input_mask = batch[1].to(device)
     with torch.no_grad():
         outputs = model(b_input_ids,
                         token_type_ids=None,
                         attention_mask=b_input_mask, output_attentions = True)
         logits = outputs.logits
         out_attentions = outputs.attentions

     logits = logits.detach().cpu().numpy()
     print(logits)
     break

[[-2.054214   2.0261955]
 [-2.0641382  2.0245357]
 [-1.0652117  1.0406414]
 [-2.0556746  2.024575 ]
 [-2.0615063  2.031022 ]
 [-2.0695105  2.0404446]
 [-2.0244899  1.9941506]
 [-2.063401   2.0343156]]
