In [None]:
# Mount Google Drive at /content/drive so the dataset and model files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers==3

Collecting transformers==3
  Downloading transformers-3.0.0-py3-none-any.whl (754 kB)
[?25l[K     |▍                               | 10 kB 21.2 MB/s eta 0:00:01[K     |▉                               | 20 kB 25.6 MB/s eta 0:00:01[K     |█▎                              | 30 kB 29.0 MB/s eta 0:00:01[K     |█▊                              | 40 kB 20.8 MB/s eta 0:00:01[K     |██▏                             | 51 kB 9.3 MB/s eta 0:00:01[K     |██▋                             | 61 kB 10.0 MB/s eta 0:00:01[K     |███                             | 71 kB 8.1 MB/s eta 0:00:01[K     |███▌                            | 81 kB 9.0 MB/s eta 0:00:01[K     |████                            | 92 kB 7.6 MB/s eta 0:00:01[K     |████▍                           | 102 kB 8.2 MB/s eta 0:00:01[K     |████▊                           | 112 kB 8.2 MB/s eta 0:00:01[K     |█████▏                          | 122 kB 8.2 MB/s eta 0:00:01[K     |█████▋                          | 133 kB 8.2 MB/s 

In [None]:
import numpy as np 
import pandas as pd 
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import seaborn as sns
from matplotlib import pyplot as plt
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

# Fix the numpy and torch RNG seeds, then choose the compute device.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first GPU when one is visible, otherwise stay on the CPU.
_use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if _use_gpu else "cpu")
print(torch.cuda.get_device_name(0) if _use_gpu else "cpu")

cpu


In [None]:
def read_data2(filename):
    """Load the articles CSV and discard the leftover index column.

    Args:
        filename: Path (or file-like object) handed straight to ``pd.read_csv``.

    Returns:
        DataFrame of the CSV contents without the ``'Unnamed: 0'`` column.
    """
    df = pd.read_csv(filename)

    # 'Unnamed: 0' is the name pandas gives a header-less first column.
    # errors='ignore' keeps the loader usable on files saved without that
    # column instead of raising KeyError.
    return df.drop(columns='Unnamed: 0', errors='ignore')

# Cleaned article file (with sentiment data) stored on the mounted Drive.
BIAS_ARTICLES_CSV = '/content/drive/MyDrive/updated_bias_articles.csv'
new = read_data2(BIAS_ARTICLES_CSV)
new.head()

Unnamed: 0,Title,Text,Source,Bias,Sentences,Reading,Length
0,Michigan wants to lead U.S. in electric vehicl...,The Dearborn-based automaker chose Tennessee f...,Bridgemi,4,52,72.0,1248
1,Republicans’ hold on Lansing power could grow ...,"From a purely geographic perspective, most of ...",Bridgemi,4,17,84.3,331
2,Dueling gatherings highlight split as Michigan...,“The focus of the last election is always goin...,Bridgemi,4,21,29.2,371
3,Michigan counties dump mask rules for thousand...,"Marquette, Ottawa, Washtenaw and Kent county h...",Bridgemi,4,82,36.3,1500
4,Gov. Gretchen Whitmer signs $70B Michigan budg...,"The finalized legislation, brokered by Whitmer...",Bridgemi,4,66,41.7,1363


In [None]:
# Checkpoint name passed to every from_pretrained call that should load the same BERT variant.
PRE_TRAINED_MODEL_NAME = "bert-base-uncased"

In [None]:
# Load the tokenizer belonging to the checkpoint named by PRE_TRAINED_MODEL_NAME.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
# Bias class names; len(classes) sizes the classifier head.
# Order presumably follows the Bias labels 1..4 (shifted to 0..3 before the loss) — TODO confirm.
classes = [
  'Left',
  'Center',
  'Lean Right',
  'Right',
]


In [None]:
class MediaBiasDataset(Dataset):
  """Torch dataset that tokenizes article text and pairs it with its bias label.

  Args:
    reviews: Indexable collection of article texts.
    targets: Indexable collection of numeric bias labels, aligned with ``reviews``.
    tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
    max_len: Token length every encoded sample is truncated/padded to.
  """

  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of samples (one per article text)."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Encode sample ``item``.

    Returns:
      dict with the raw text (``review_text``), 1-D ``input_ids`` and
      ``attention_mask`` tensors, and the label as a ``torch.long`` tensor
      (``targets``). The label is returned as stored; no re-indexing is done here.
    """
    review = str(self.reviews[item])
    target = self.targets[item]  # bias label

    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,  # adds the [CLS] and [SEP] tokens
      max_length=self.max_len,
      return_token_type_ids=False,
      truncation=True,  # cut texts longer than max_length
      # padding='max_length' replaces the deprecated pad_to_max_length=True and
      # belongs to the same tokenizer API generation as truncation=True above.
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',  # PyTorch tensors
    )

    return {
      'review_text': review,
      # flatten() drops the leading batch dimension added by return_tensors='pt'.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [None]:
# Shuffle the rows once. An explicit random_state makes the permutation depend
# only on RANDOM_SEED, not on how much of numpy's global RNG stream earlier
# cells have already consumed.
new = new.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

In [None]:
# First 90% of the rows go to training; the remaining 10% are held out.
n_train_rows = int(0.9 * len(new))
train = new.iloc[:n_train_rows]
test = new.iloc[n_train_rows:]

In [None]:
# Preview the training portion of the split.
train

Unnamed: 0,Title,Text,Source,Bias,Sentences,Reading,Length
0,Dolly Parton responds to Lil Nas X's cover of ...,Country star and gay icon Dolly Parton respond...,NBC,2,23,47.8,486
1,French average of new COVID-19 cases drops bel...,PARIS: The French seven-day average of new COV...,Channel News,4,6,82.6,164
2,CNBC AND ACORNS ANNOUNCE STRATEGIC PARTNERSHIP,CNBC TO PROVIDE EDITORIAL AND PRODUCTION EXPER...,NBC,2,60,23.9,754
3,Commentary: Why do K-pop fandoms spend so much...,SINGAPORE: The United Nations General Assembly...,Channel News,4,13,34.4,264
4,Japan’s Comeback Game: Can Japan's Gaming Indu...,"About the show:\n\n25 years ago, Japan control...",Channel News,4,9,11.7,179
...,...,...,...,...,...,...,...
1553,"Best sunscreen for kids, according to dermatol...",Our editors independently selected these items...,NBC,2,81,20.6,1417
1554,China roundup: Tesla supplier CATL to buy Cana...,Hello and welcome back to TechCrunch’s China r...,Tech Crunch,2,35,23.5,715
1555,"How to organize your refrigerator, according t...",Our editors independently selected these items...,NBC,2,81,19.5,1690
1556,From a $1.649M corner-unit condo at Yonge & Bl...,See what’s on the market in the GTA with a sel...,The Star,1,47,86.8,497


In [None]:
# Preview the held-out portion of the split.
test

Unnamed: 0,Title,Text,Source,Bias,Sentences,Reading,Length
1558,Framework for Attorney-General to intervene in...,SINGAPORE: Amendments to the law to create a s...,Channel News,4,12,86.0,339
1559,OPP charge third person in alleged $11M COVID-...,Ontario Provincial Police have charged a third...,The Star,1,37,46.8,712
1560,How to make crispy chicken parmesan in your ai...,"Crispy and gooey when done right, but too ofte...",NBC,2,29,11.6,475
1561,Australia: Crocodile sinks his teeth into a fl...,A crocodile leapt out of the water at a wildli...,BBC,2,2,17.6,32
1562,How David Chase got over his fear of a Soprano...,“Jim and I had very little communication in th...,Washington Post,1,7,11.9,108
...,...,...,...,...,...,...,...
1727,Durham PC MPP stripped of legislative role for...,Send this page to someone via email\n\nTORONTO...,Global News,3,6,67.9,126
1728,Freedom of thought is delightful. It can also ...,I’m indebted to the People’s Party candidate i...,The Star,1,47,12.9,709
1729,'Irreplaceable' audio of deceased Innu elders ...,A cultural guardian for the Innu Nation in Lab...,CBC,2,46,47.3,830
1730,"Leafs, Raptors expect Scotiabank Arena to be a...",The Toronto Maple Leafs and Toronto Raptors ar...,The Star,1,24,41.2,479


In [None]:

# Use every row of `train` for fitting. A further train_test_split on `train`
# would only carve off rows that no later cell uses, because `d_test` is
# assigned from the held-out rows below.
d_train = train

# Divide the held-out rows evenly into validation and test sets.
d_val, d_test = train_test_split(
  test,
  test_size=0.5,
  random_state=RANDOM_SEED
)

In [None]:
# Token budget per article: longer texts are truncated to it, shorter ones padded to it.
MAX_LEN = 160

In [None]:

def dataloader(dataframe, tokenizer, max_len, batch_size, shuffle=False):
  """Wrap a dataframe in a ``MediaBiasDataset`` and return a ``DataLoader`` over it.

  Args:
    dataframe: Frame with a ``Text`` column (inputs) and a ``Bias`` column (labels).
    tokenizer: Tokenizer forwarded to ``MediaBiasDataset``.
    max_len: Maximum token length forwarded to ``MediaBiasDataset``.
    batch_size: Number of samples per batch.
    shuffle: Reshuffle the samples every epoch. Defaults to ``False`` (fixed
      order, the previous behaviour); pass ``True`` for a training loader.

  Returns:
    ``DataLoader`` yielding the dict batches produced by ``MediaBiasDataset``.
  """
  dataset = MediaBiasDataset(
    reviews=dataframe.Text.to_numpy(),  # numpy arrays allow plain positional indexing
    targets=dataframe.Bias.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len,
  )

  return DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=2,
  )

In [None]:
BATCH_SIZE = 16

# One loader per split, all sharing the tokenizer, sequence length and batch size.
train_dataloader, val_dataloader, test_dataloader = (
  dataloader(split_frame, tokenizer, MAX_LEN, BATCH_SIZE)
  for split_frame in (d_train, d_val, d_test)
)

In [None]:
# Load the pretrained encoder through the shared checkpoint constant so it
# cannot drift from the checkpoint the tokenizer was loaded from.
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
class Classifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: Number of output logits produced per sample.
  """

  def __init__(self, n_classes):
    super(Classifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return unnormalised class scores (logits) for a batch.

    Args:
      input_ids: Token id tensor for the batch.
      attention_mask: Mask tensor matching ``input_ids``.

    Returns:
      Output of the linear head applied to the dropped-out pooled BERT output.
    """
    bert_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
    )
    # Index rather than tuple-unpack: position 1 is the pooled output both when
    # BERT returns a plain tuple and when newer transformers releases return a
    # ModelOutput object (unpacking that would yield field names, not tensors).
    # The per-token sequence output at position 0 is not used.
    pooled_output = bert_outputs[1]
    return self.out(self.drop(pooled_output))

In [None]:
# Pull a single batch from the training loader and list the fields it carries.
data = next(iter(train_dataloader))
data.keys()

dict_keys(['review_text', 'input_ids', 'attention_mask', 'targets'])

In [None]:
# Build a classifier with one logit per bias class and place it on the compute device.
model = Classifier(len(classes)).to(device)

In [None]:
# NOTE(review): this cell is only a bare list literal and has no effect on later
# cells — possibly a leftover of `data['targets']`; confirm, then remove or restore it.
['targets']

['targets']

In [None]:

# Move the sample batch's token ids and attention mask onto the compute device.
input_ids, attention_mask = (
  data[field].to(device) for field in ('input_ids', 'attention_mask')
)

In [None]:
EPOCHS = 3

# Cross-entropy over the classifier's logits.
loss_fn = nn.CrossEntropyLoss().to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# Linear schedule with no warm-up, spanning every optimisation step of the run.
total_steps = EPOCHS * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_training_steps=total_steps,
  num_warmup_steps=0,
)


In [None]:

def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Train ``model`` for one pass over ``data_loader``.

  Args:
    model: Module called with ``input_ids`` and ``attention_mask``; returns logits.
    data_loader: Iterable of dict batches with ``input_ids``, ``attention_mask``
      and ``targets`` tensors.
    loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar loss tensor.
    optimizer: Optimizer over the model's parameters.
    device: Device the batch tensors are moved to.
    scheduler: Learning-rate scheduler stepped once per batch.
    n_examples: Denominator used for the accuracy.

  Returns:
    Tuple ``(accuracy, mean_loss)``. ``accuracy`` is a float64 tensor equal to
    the number of correct predictions divided by ``n_examples``; ``mean_loss``
    is the mean of the per-batch losses (NaN when the loader yields no batches).
  """
  model = model.train()
  losses = []
  # Tensor accumulator (not a Python int) so `.double()` below also works when
  # the loader yields no batches.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    # Shift labels down by one so 1-based labels become 0-based class indices
    # (assumes the stored labels start at 1 — confirm against the data).
    targets = d["targets"].to(device) - 1
    # Clear gradients before the forward/backward pass so nothing left over
    # from an earlier backward() call leaks into this step.
    optimizer.zero_grad()
    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)
    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())
    loss.backward()
    # Cap the global gradient norm at 1.0 before the parameter update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
  mean_loss = np.mean(losses) if losses else np.float64("nan")
  return correct_predictions.double() / n_examples, mean_loss


In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate ``model`` on ``data_loader`` without updating any weights.

  Args:
    model: Module called with ``input_ids`` and ``attention_mask``; returns logits.
    data_loader: Iterable of dict batches with ``input_ids``, ``attention_mask``
      and ``targets`` tensors.
    loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar loss tensor.
    device: Device the batch tensors are moved to.
    n_examples: Denominator used for the accuracy.

  Returns:
    Tuple ``(accuracy, mean_loss)``. ``accuracy`` is a float64 tensor equal to
    the number of correct predictions divided by ``n_examples``; ``mean_loss``
    is the mean of the per-batch losses (NaN when the loader yields no batches).
  """
  model = model.eval()
  losses = []
  # Tensor accumulator (not a Python int) so `.double()` below also works when
  # the loader yields no batches.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      # Shift labels down by one so 1-based labels become 0-based class indices
      # (assumes the stored labels start at 1 — confirm against the data).
      targets = d["targets"].to(device) - 1
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, targets)
      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())
  mean_loss = np.mean(losses) if losses else np.float64("nan")
  return correct_predictions.double() / n_examples, mean_loss

In [None]:

# Fine-tune for EPOCHS passes, recording per-epoch metrics and writing the
# weights to disk whenever validation accuracy improves.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    model, train_dataloader, loss_fn, optimizer, device, scheduler, len(d_train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model, val_dataloader, loss_fn, device, len(d_val)
  )
  print(f'Val loss {val_loss} accuracy {val_acc}')
  print()

  epoch_metrics = {
    'train_acc': train_acc,
    'train_loss': train_loss,
    'val_acc': val_acc,
    'val_loss': val_loss,
  }
  for metric_name, metric_value in epoch_metrics.items():
    history[metric_name].append(metric_value)

  # Checkpoint only when this epoch beats the best validation accuracy so far.
  improved = val_acc > best_accuracy
  if improved:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc


Epoch 1/3
----------
Train loss 0.9195554635741494 accuracy 0.5955777460770328
Val loss 0.786500871181488 accuracy 0.6436781609195402

Epoch 2/3
----------
Train loss 0.4236521441489458 accuracy 0.8359486447931527
Val loss 0.7769037286440531 accuracy 0.7126436781609196

Epoch 3/3
----------
Train loss 0.2061714158308777 accuracy 0.9322396576319544
Val loss 0.7069012721379598 accuracy 0.7586206896551724



In [None]:
import joblib


# Persist the whole model object (not just its weights) to Drive as a pickle file.
# NOTE(review): this is the model as it stands after the last epoch, which is not
# necessarily the best-validation state written to 'best_model_state.bin'.
# NOTE(review): pickle-based files can execute code when loaded — only load this
# file from a trusted location.
joblib.dump(model, '/content/drive/MyDrive/bert_model.pkl')

['/content/drive/MyDrive/bert_model.pkl']

In [None]:
def get_predictions(model, data_loader, device=None):
  """Run ``model`` over ``data_loader`` and collect texts, predictions and labels.

  Args:
    model: Module called with ``input_ids`` and ``attention_mask``; returns logits.
    data_loader: Iterable of dict batches with ``review_text``, ``input_ids``,
      ``attention_mask`` and ``targets`` entries.
    device: Device the batch tensors are moved to. Defaults to the device of
      the model's first parameter (CPU if the model has no parameters).

  Returns:
    Tuple ``(review_texts, predictions, prediction_probs, real_values)``:
    the list of texts, the predicted class indices, the per-class softmax
    probabilities, and the true labels shifted down by one. All tensors are on
    the CPU; they are empty when the loader yields no batches.
  """
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

  model = model.eval()
  review_texts = []
  pred_batches = []
  prob_batches = []
  target_batches = []
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      # Shift labels down by one so 1-based labels become 0-based class indices
      # (assumes the stored labels start at 1 — confirm against the data).
      targets = d["targets"].to(device) - 1
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      review_texts.extend(d["review_text"])
      pred_batches.append(preds)
      # The model emits raw logits; softmax turns each row into probabilities.
      prob_batches.append(torch.softmax(outputs, dim=1))
      target_batches.append(targets)

  if not pred_batches:
    # Nothing to concatenate: return empty tensors instead of raising.
    no_labels = torch.empty(0, dtype=torch.long)
    return review_texts, no_labels, torch.empty(0), no_labels.clone()

  predictions = torch.cat(pred_batches).cpu()
  prediction_probs = torch.cat(prob_batches).cpu()
  real_values = torch.cat(target_batches).cpu()
  return review_texts, predictions, prediction_probs, real_values

In [None]:
# Collect texts, predicted classes, model scores and true labels for the test split.
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(model, test_dataloader)

In [None]:
# Reuse the shared class-name list and pass the label ids explicitly: without
# `labels`, classification_report raises when a class is absent from both
# y_test and y_pred, because the number of target_names no longer matches.
print(classification_report(
  y_test,
  y_pred,
  labels=list(range(len(classes))),
  target_names=classes,
))

              precision    recall  f1-score   support

        Left       0.82      0.88      0.85        26
      Center       0.70      0.76      0.73        21
  Lean Right       0.00      0.00      0.00         5
       Right       0.94      0.94      0.94        35

    accuracy                           0.83        87
   macro avg       0.61      0.65      0.63        87
weighted avg       0.79      0.83      0.81        87

