# Essentially a copy of [this Colab NB](https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX?usp=sharing) from Chris McCormick and Nick Ryan.

In [1]:
# Attach Google Drive to the Colab VM so the project folder (model weights and
# datasets) is reachable under /content/drive.
from google.colab import drive

drive.mount(
    "/content/drive",
    force_remount=True,  # presumably re-mounts even if Drive is already attached
)

Mounted at /content/drive


# <span style="color:#FF8800"> (Most) Installs </span>

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/f4/9f93f06dd2c57c7cd7aa515ffbf9fcfd8a084b92285732289f4a5696dd91/transformers-3.2.0-py3-none-any.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 5.5MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 29.4MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 45.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K 

In [3]:
#3rd party imports
import numpy as np
import pandas as pd
import torch

# specific names imported from third-party packages
from scipy.special import softmax
from torch.utils.data import DataLoader, TensorDataset

# <span style="color:#FF8800"> Setup & global vars </span>

In [4]:
# Audible "cell finished" signal: put `alert` as the last expression of a
# long-running cell and the browser plays the tone once the cell completes.
from IPython.display import Audio

# 20000 samples of a sine wave played back at 20000 samples/s, i.e. about
# one second of sound; autoplay starts it as soon as the widget is displayed.
alert = Audio(
    data=np.sin(np.linspace(0, 3000, 20000)),
    rate=20000,
    autoplay=True,
)

In [5]:
%cd drive/'My Drive'/propaganda_bert/

/content/drive/My Drive/propaganda_bert


In [6]:
# Pick the torch device used for the model and for every batch:
# CUDA when PyTorch can see a GPU, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of device 0.
    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [7]:
# Inference configuration. Use the same values as the best experimental run
# so that tokenization matches what the saved model was fine-tuned with.

# Architecture of the saved model; decides which tokenizer/model classes are
# imported in the model-loading cell.
MODEL = "bert-base-cased"

# Number of sentences per forward pass.
BATCH_SIZE = 16

# Fixed sequence length: every sentence is padded/truncated to this many tokens.
max_len = 204

# Folder (relative to the working directory) holding the fine-tuned model and
# its vocabulary.
output_dir = "selected_model/"

# Load saved model

In [8]:
# Resolve the tokenizer/model classes that match MODEL. Only the pair that is
# actually needed gets imported; an unrecognised value fails before anything
# is loaded.
if MODEL == "bert-base-cased":
  from transformers import BertTokenizer, BertForSequenceClassification
  model_cls, tokenizer_cls = BertForSequenceClassification, BertTokenizer
elif MODEL == "distilbert-base-cased":
  from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
  model_cls, tokenizer_cls = DistilBertForSequenceClassification, DistilBertTokenizer
else:
  raise ValueError('Unknown model specified. Check MODEL var.')

# Load the fine-tuned weights and the vocabulary saved alongside them.
model = model_cls.from_pretrained(output_dir)
tokenizer = tokenizer_cls.from_pretrained(output_dir)

# Move the model onto the selected device (GPU when available). As the last
# expression of the cell this also displays the model architecture.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [9]:
# Class-index -> label-name mapping stored in the fine-tuned model's config.
# It is used further down to turn predicted class ids into label strings.
# (The reverse mapping, `model.config.label2id`, is not needed for inference.)
id2label = model.config.id2label

# Load test data

In [10]:
# Read the test split: a tab-separated file whose first row is the header and
# whose first column becomes the DataFrame index.
test_df = pd.read_csv("datasets/test_data.tsv", sep="\t", header=0, index_col=0)

# Quick sanity check on how many rows were read.
print('Number of test sentences: {:,}\n'.format(len(test_df)))

# Raw text spans to classify, in the same order as the rows of test_df.
sentences = test_df["text"].values

Number of test sentences: 1,779



# Tokenize test data

In [11]:
# Tokenize all of the sentences and map the tokens to their word IDs.
# Every sentence becomes one fixed-length row of token ids plus a matching
# attention mask.
input_ids = []
attention_masks = []

for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`.
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
        sent,                          # Sentence to encode.
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
        max_length=max_len,            # Target length for padding/truncation.
        truncation=True,               # Cut sentences longer than max_length.
        # `padding="max_length"` replaces the deprecated
        # `pad_to_max_length=True`; both pad every sentence up to max_length.
        padding="max_length",
        return_attention_mask=True,    # Construct attn. masks.
        return_tensors="pt",           # Return pytorch tensors.
    )

    # Keep the encoded sentence and its attention mask (which simply
    # differentiates padding from non-padding).
    input_ids.append(encoded_dict["input_ids"])
    attention_masks.append(encoded_dict["attention_mask"])

# Each encoding is a single-row tensor, so concatenating along dim 0 stacks
# them into one tensor with a row per sentence.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Create the DataLoader. There are no labels at prediction time, so the
# dataset holds only (input_ids, attention_mask) pairs. `shuffle=False` is the
# default; it is spelled out because the predictions are later written back
# into test_df by position and must stay in row order.
prediction_data = TensorDataset(input_ids, attention_masks)
prediction_dataloader = DataLoader(
    prediction_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
)



# Generate predictions & evaluate results

In [12]:
# Run inference over the whole test set, one batch at a time.
print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Switch the model to evaluation mode before predicting.
model.eval()

# One array of raw logits per batch, collected in dataloader order.
predictions = []

# Gradients are not needed for prediction; not computing or storing them
# saves memory and speeds things up.
with torch.no_grad():
    for batch in prediction_dataloader:
        # The dataset carries no labels, so a batch is just (ids, mask).
        # Both tensors are moved to the same device as the model.
        b_input_ids, b_input_mask = (t.to(device) for t in batch)

        # Forward pass; the first element of the output holds the logits.
        outputs = model(b_input_ids, attention_mask=b_input_mask)

        # Bring the logits back to the CPU as a numpy array and store them.
        logits = outputs[0].detach().cpu().numpy()
        predictions.append(logits)

print('    DONE.')

# Last expression of the cell: displays the audio widget, which autoplays to
# signal that prediction has finished.
alert

Predicting labels for 1,779 test sentences...
    DONE.


# Multi-class classification

In [13]:
# Combine the per-batch logit arrays into one array with a row per sentence
# and a column per class.
flat_predictions = np.concatenate(predictions, axis=0)

# Convert logits to class probabilities *per sentence*. `axis=1` is required:
# scipy's softmax defaults to axis=None, which normalises over the entire
# array, so the rows would not sum to 1 and the values would not be usable as
# per-sentence probabilities.
predicted_probs = softmax(flat_predictions, axis=1)

# The predicted label is the index of the highest probability in each row.
test_df["predicted_label"] = np.argmax(predicted_probs, axis=1)

In [15]:
# Translate each predicted class index into its label name using the mapping
# taken from the model config.
test_df['label_text'] = test_df.predicted_label.map(id2label)

In [16]:
# Spot-check a few predictions. The fixed seed keeps the displayed rows the
# same on every run, so the output is reproducible after a kernel restart.
test_df.sample(3, random_state=42)

Unnamed: 0_level_0,filenumber,span_start,span_end,text,predicted_label,label_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
74,813552066,3933,3940,squalid,8,Loaded_Language
1642,833039623,51,69,Carload of crazies,9,"Name_Calling,Labeling"
597,814630609,3191,3215,EU’s unloved draft ‘deal,9,"Name_Calling,Labeling"


In [24]:
# Distribution of predicted labels across the test set.
# NOTE: No_Propaganda is not an acceptable label for semeval, so any rows
# predicted as No_Propaganda need attention before a submission is written.
test_df.label_text.value_counts()

Loaded_Language                       700
Name_Calling,Labeling                 277
Doubt                                 153
Exaggeration,Minimisation             130
Flag-Waving                           120
Appeal_to_fear-prejudice               98
No_Propaganda                          71
Repetition                             71
Causal_Oversimplification              67
Thought-terminating_Cliches            21
Black-and-White_Fallacy                21
Appeal_to_Authority                    18
Bandwagon,Reductio_ad_hitlerum         14
Slogans                                12
Whataboutism,Straw_Men,Red_Herring      6
Name: label_text, dtype: int64

In [23]:
# test_df.to_csv('semeval_test_preds.txt', columns=['filenumber','label_text','span_start','span_end'],sep='\t', header=False, index=False)

# END