In [1]:
!pip install transformers
import torch
dev = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print(dev)

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/cd/38/c9527aa055241c66c4d785381eaf6f80a28c224cae97daa1f8b183b5fabb/transformers-2.9.0-py3-none-any.whl (635kB)
[K     |████████████████████████████████| 645kB 4.8MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 22.4MB/s 
[?25hCollecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 34.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K    

In [0]:
import os
import urllib.request
import tarfile

urllib.request.urlretrieve('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz', 'dataset.gz')
with tarfile.open('dataset.gz', 'r:gz') as tar:
    tar.extractall()

In [0]:
path = "aclImdb/{}/{}/"
trainfils, testfils = [], []
for g,collection in {'train': trainfils, 'test': testfils}.items() :
  for i,p in enumerate(['neg','pos']) :
    folder = path.format(g,p)
    for name in os.listdir(folder) :
      record = {
          'name' : name,
          'text' : open(folder+name).read(),
          'label' : i
      }
      collection.append(record)

import pandas as pd

train_df = pd.DataFrame(trainfils)
test_df = pd.DataFrame(testfils)

In [4]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
import re
nltk.download('punkt')
nltk.download('stopwords')
tokenizer = RegexpTokenizer(r'\w+')
tqdm.pandas()

def cleanhtml(raw_html):
  cleanr = re.compile('<.*?>')
  cleantext = re.sub(cleanr, '', raw_html)
  return cleantext

MAX_LEN = 128
def make_clean(s) :
  s = cleanhtml(s)
  for i in range(10) :
    s = s.replace(str(i), ' ')
  tokens = np.array(tokenizer.tokenize(s.lower()))
  tokens = tokens[~np.isin(tokens, stopwords.words())]
  return ' '.join(tokens)

train_df['clean'] = train_df['text'].progress_apply(make_clean)
test_df['clean'] = test_df['text'].progress_apply(make_clean)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




In [5]:
from transformers import BertTokenizer
btokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

MAX_LEN = 128
for df in [train_df,test_df] :
    input_ids = []
    attention_masks = []

    for sent in tqdm(df['clean']):
        encoded_dict = btokenizer.encode_plus(
                            sent,
                            add_special_tokens = True,
                            max_length = MAX_LEN,
                            pad_to_max_length = True,
                            return_attention_mask = True,
                            return_tensors = 'pt',
                      )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    df['input_ids'] = input_ids
    df['attention_masks'] = attention_masks

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




In [0]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
batch_size = 32

input_ids = torch.cat(train_df['input_ids'].tolist(), dim=0)
attention_masks = torch.cat(train_df['attention_masks'].tolist(), dim=0)
labels = torch.tensor(train_df['label'].tolist())
train_dataset = TensorDataset(input_ids, attention_masks, labels)
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

input_ids = torch.cat(test_df['input_ids'].tolist(), dim=0)
attention_masks = torch.cat(test_df['attention_masks'].tolist(), dim=0)
labels = torch.tensor(test_df['label'].tolist())
val_dataset = TensorDataset(input_ids, attention_masks, labels)
validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

In [7]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
import torch.nn as nn

# class Net(nn.Module):
#     def __init__(self):
#         super(Net, self).__init__()
#         self.bert = BertForSequenceClassification.from_pretrained(
#                       "bert-base-uncased",
#                       num_labels = 2,
#                       output_attentions = False,
#                       output_hidden_states = False,
#                     )

#     def forward(self, x):
#         x = torch.softmax(self.bert(x)[0], dim=1)
#         return x

# model = Net()
model = BertForSequenceClassification.from_pretrained(
                      "bert-base-uncased",
                      num_labels = 2,
                      output_attentions = False,
                      output_hidden_states = False,
                    )
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [0]:
from transformers import get_linear_schedule_with_warmup
epochs = 50

optimizer = AdamW(model.parameters(), lr = 2e-4)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [9]:
import random
import numpy as np

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def train_epoch(model) :
    total_train_loss = 0
    for batch in train_dataloader :
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()        
        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_train_loss / len(train_dataloader)
    return avg_train_loss

def cal_metrics(y_true, y_pred) :
  fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred, pos_label=1)
  return {
      "Accuracy": metrics.accuracy_score(y_true, y_pred),
      "AUC" : metrics.auc(fpr, tpr),
      "f1" : metrics.f1_score(y_true, y_pred, average='macro'),
      "Recall" : metrics.recall_score(y_true, y_pred, average='macro'),
      "Precision" : metrics.precision_score(y_true, y_pred, average='macro'),
  }



from sklearn import metrics
from scipy.special import softmax

def test_epoch(model) :
  true_labels, predictions = [], []
  total_eval_loss = 0
  for batch in validation_dataloader :
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)
      with torch.no_grad():        
          (loss, logits) = model(b_input_ids, 
                                  token_type_ids=None, 
                                  attention_mask=b_input_mask,
                                  labels=b_labels)
      total_eval_loss += loss.item()
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()
      true_labels += label_ids.tolist()
      predictions += softmax(logits, axis=1).argmax(axis=1).tolist()
  avg_val_loss = total_eval_loss / len(validation_dataloader)
  return avg_val_loss, cal_metrics(true_labels, predictions)

train_losses = list()
test_losses = list()
test_metrics = list()
for epoch_i in tqdm(range(epochs)):
  model.train()
  train_losses.append(train_epoch(model))
  model.eval()
  test_res = test_epoch(model)
  test_losses.append(test_res[0])
  test_metrics.append(test_res[1])

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: ignored

In [10]:
df = pd.DataFrame(test_metrics)
df['loss'] = test_losses
df['train_loss'] = train_losses
df.to_csv('records.csv')
df.head()

Unnamed: 0,Accuracy,AUC,f1,Recall,Precision,loss,train_loss
0,0.5,0.5,0.333333,0.5,0.25,0.693517,0.675016
1,0.5,0.5,0.333333,0.5,0.25,0.69725,0.70551
2,0.5,0.5,0.333333,0.5,0.25,0.69459,0.704158
3,0.5,0.5,0.333333,0.5,0.25,0.699821,0.701832
4,0.5,0.5,0.333333,0.5,0.25,0.694111,0.701306
