In [1]:
%%capture 
!pip uninstall -y tensorflow
!pip install transformers
!pip install tokenizers

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import spacy
import os
import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from collections import defaultdict
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

In [3]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available! Get another runtime')
    raise Exception

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [4]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
ROOT_PATH = "./gdrive/My Drive/NLP_Final/Colab"
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED); # This semicolon mutes output

# Load the data

In [6]:
data = pd.read_csv(f"{ROOT_PATH}/Misinformation_Data_v1.csv")
data

Unnamed: 0,title,text,label
0,Google Just Disclosed A Major Windows Bug — An...,"in: Science & Technology (The Verge) Today, Go...",0
1,‘We Know Where You Live’: Trump-Loving Terror...,"Meet James Stachowiak. If he looks familiar, y...",0
2,CUBA’S FIDEL CASTRO DIES…Hundreds Dance In The...,"What will Obama do?Well, Obama has already cal...",0
3,Goldman's Cohn eyed for top Trump budget post:...,NEW YORK (Reuters) - President-elect Donald Tr...,1
4,"5-STAR MOOCH, HER TAXPAYER FUNDED MOM And Mery...",One of the countries Mooch and her taxpayer fu...,0
...,...,...,...
77736,New York protesters camp out at Goldman Sachs ...,NEW YORK (Reuters) - Dozens of protesters gath...,1
77737,PC TYRANNY: University of Oregon Rules That Pr...,21st Century Wire says By cow-towing to studen...,0
77738,,"Stock ""On Fire!""",0
77739,Republican tax plan would deal financial hit t...,WASHINGTON (Reuters) - The Republican tax plan...,1


In [8]:
data['label'].value_counts()

0    45937
1    31804
Name: label, dtype: int64

# Preprocess Data

In [7]:
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
BATCH_SIZE = 16
MAX_LEN = 512

In [None]:
class ArticleDataset(Dataset):
  def __init__(self, articles: np.array, targets: np.array, tokenizer: BertTokenizer, max_len: int):
    self.articles = articles
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.articles)
  
  def __getitem__(self, item):
    article = str(self.articles[item])
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
      article,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'article_text': article,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [None]:
def create_data_loader(articles, targets, tokenizer, max_len, batch_size):
  dataset = ArticleDataset(
    articles,
    targets,
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    dataset,
    batch_size=batch_size,
    num_workers=4
  )

In [None]:
class CategoricalClassifier(nn.Module):
  def __init__(self, n_classes: int):
    super(CategoricalClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
  
  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    output = self.drop(pooled_output)
    return self.out(output)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data.text.to_numpy(), data.label.to_numpy(), test_size=0.30, random_state=RANDOM_SEED)

In [None]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
train_data_loader = create_data_loader(X_train, y_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(X_test, y_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
model = CategoricalClassifier(len(data['label'].unique()))
model = model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
EPOCHS = 10

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(
  model, 
  data_loader, 
  loss_fn, 
  optimizer, 
  device, 
  scheduler, 
  n_examples
):
  model = model.train()

  losses = []
  correct_predictions = 0
  
  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)

    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()

  losses = []
  correct_predictions = 0

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, targets)

      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())

  return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
%%time
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(X_train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')
  val_acc, val_loss = eval_model(
    model,
    test_data_loader,
    loss_fn,
    device,
    len(X_test)
  )
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()
  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), f'{ROOT_PATH}/best_model_state.bin')
    best_accuracy = val_acc

Epoch 1/10
----------




TypeError: ignored