In [1]:
import os
import pandas as pd

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")
from sklearn import preprocessing

Using TensorFlow backend.


In [2]:
train_file_path='../../dataset/UCF/train/'

In [3]:
df_UCF_train=pd.concat([pd.read_pickle(train_file_path+file, compression='gzip') for file in os.listdir(train_file_path)], ignore_index=True)

In [4]:
df_UCF_train['input']= df_UCF_train['TAXPAYER_NAME']+' '+df_UCF_train['mission_spellchk']+' '+df_UCF_train['prgrm_dsc_spellchk']

In [5]:
df_UCF_train.sample(3)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,...,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before,NTEE1,mission,prgrm_dsc,mission_spellchk,prgrm_dsc_spellchk,input
52644,93493120000000.0,952479348,EFILE,,,,,The Group Home Program was terminated effectiv...,"Counseling & Research Associates, Inc. dba Mas...",The Mental Health Program provides non-residen...,...,COUNSELING AND RESEARCH ASSOCIATES INC DBA MAS...,201406.0,2015.0,1.0,P,"COUNSELING & RESEARCH ASSOCIATES, INC. DBA MAS...",THE GROUP HOME PROGRAM IS PROVIDED TO BOYS RAN...,"COUNSELLING & RESEARCH ASSOCIATES , INC. DBA M...",THE GROUP HOME PROGRAM IS PROVIDED TO BOYS RAN...,COUNSELING AND RESEARCH ASSOCIATES INC DBA MAS...
110448,93492130000000.0,330526546,EFILE,TO PROVIDE READING PROGRAM AND MATERIAL FOR JU...,TO SUPPORT PUBLIC LIBRARY,,,,,,...,FRIENDS OF THE JAMES S THALMAN,201312.0,2014.0,1.0,B,TO SUPPORT PUBLIC LIBRARY,TO PROVIDE READING PROGRAM AND MATERIAL FOR JU...,TO SUPPORT PUBLIC LIBRARY,TO PROVIDE READING PROGRAM AND MATERIAL FOR JU...,FRIENDS OF THE JAMES S THALMAN TO SUPPORT PUBL...
62786,93493130000000.0,411551248,EFILE,,,,,,OPERATION OF AN ANIMAL SHELTER FOR UNWANTED AN...,OPERATION OF AN ANIMAL SHELTER FOR UNWANTED AN...,...,GREAT RIVER RESCUE,201512.0,2016.0,1.0,D,OPERATION OF AN ANIMAL SHELTER FOR UNWANTED AN...,OPERATION OF AN ANIMAL SHELTER FOR UNWANTED AN...,OPERATION OF AN ANIMAL SHELTER FOR UNWANTED AN...,OPERATION OF AN ANIMAL SHELTER FOR UNWANTED AN...,GREAT RIVER RESCUE OPERATION OF AN ANIMAL SHEL...


In [6]:
# Create sentence and label lists
sentences = df_UCF_train.input.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = preprocessing.LabelEncoder().fit_transform(df_UCF_train.NTEE1.values)

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

Tokenize the first sentence:
['[CLS]', 'singing', 'river', 'education', 'association', 'provide', 'child', 'care', 'services', 'to', 'disadvantaged', 'children', '.', 'the', 'organization', 'provides', 'complete', 'child', 'care', 'services', 'emphasizing', 'a', 'quality', 'education', ',', 'health', 'and', 'nutrition', 'program', 'for', 'disadvantaged', 'children', 'ages', 'three', 'through', 'five', 'in', 'george', 'co', '.', ',', 'mississippi', '.', '[SEP]']


In [8]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. 
# In the original paper, the authors used a length of 512.
MAX_LEN = 128

# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (775 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (1730 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (3482 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (618 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (1018 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence lengt

In [9]:
input_ids[0:2]

array([[  101,  4823,  2314,  2495,  2523,  3073,  2775,  2729,  2578,
         2000, 27322,  2336,  1012,  1996,  3029,  3640,  3143,  2775,
         2729,  2578, 22671,  1037,  3737,  2495,  1010,  2740,  1998,
        14266,  2565,  2005, 27322,  2336,  5535,  2093,  2083,  2274,
         1999,  2577,  2522,  1012,  1010,  5900,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [10]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [11]:
# Use train_test_split to split our data into train and validation sets for training

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [12]:
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [13]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [21]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(df_UCF_train.NTEE1.unique()))

In [22]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [23]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

t_total value of -1 results in schedule not being applied


In [24]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# n_gpu = torch.cuda.device_count()
# torch.cuda.get_device_name(0)

In [None]:
t = [] 

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
    
    # Training
    # Set our model to training mode (as opposed to evaluation mode)
    model.train()
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        train_loss_set.append(loss.item())    
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()


        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
    # Validation

    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
          # Forward pass, calculate logit predictions
          logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]