In [1]:
import os
import tempfile
# For N-dimensional array manipulation
import numpy as np
# Plotting library
import matplotlib.pyplot as plt
# For data analysis and data structures in DataFrames
import pandas as pd
# For data visualization
import seaborn as sns

# For machine learning algorithms and evaluation metrics
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics

#import tensorflow
import tensorflow as tf
from tensorflow import keras
from keras import layers

In [2]:
# Load dataset
df = pd.read_csv('../toy_model/media_bias.csv')

# Clean dataset
df = df[df.Label_bias != 'No agreement']
df = df[df.article != 'NaN']
df = df[df.sentence != 'NaN']

# Replace label with 0, 1
df['Label_bias'] = df['Label_bias'].replace('Biased', 1)
df['Label_bias'] = df['Label_bias'].replace('Non-biased', 0)

# Only use sentence column and bias column
df = df[['sentence', 'Label_bias']]
df = df.rename(columns={'sentence': 'text', 'Label_bias': 'label'})

# Remove numbers from all strings in text
df['text'] = df['text'].str.replace('\d+', '', regex=True)

# # Remove punctuation from all strings in X
df['text'] = df['text'].str.replace('[^\w\s]','',regex=True)

# Split data into X and y
X = df[['text']]
print(X.head)
y = df['label']

<bound method NDFrame.head of                                                    text
0     YouTube is making clear there will be no birth...
1     The increasingly bitter dispute between Americ...
2     So while there may be a humanitarian crisis dr...
3     A professor who teaches climate change classes...
4     Looking around the United States there is neve...
...                                                 ...
1695  In every case legislators are being swarmed by...
1696  Polls show the transgender ideology is deeply ...
1697  Democrats and Republicans stood and applauded ...
1698  As a selfdescribed Democratic socialist Sen Be...
1699  During the segment Colbert also bemoaned the f...

[1551 rows x 1 columns]>


In [3]:
#Split df into training and testing sets
train, test = train_test_split(df, test_size=0.15, random_state=42)

In [8]:
%pip install -q transformers

from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Note: you may need to restart the kernel to use updated packages.


In [9]:
def convert_example_to_feature(entry, max_length=512):
  return tokenizer.encode_plus(entry,
                add_special_tokens = True, # add [CLS], [SEP]
                max_length = max_length, # max length of the text that can go to BERT
                pad_to_max_length = True, # add [PAD] tokens
                truncation=True,
                return_attention_mask = True, # add attention mask to not focus on pad tokens
              )

In [4]:
batch_size = 16

In [5]:
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  return {
      "input_ids": input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_masks,
  }, label

In [6]:
def encode_entries(ds, limit=-1):
  # prepare list, so that we can build up final TensorFlow dataset from slices.
  input_ids_list = []
  token_type_ids_list = []
  attention_mask_list = []
  label_list = []
  if (limit > 0):
      ds = ds.take(limit)
  print(ds)
  for text, label in ds.to_numpy():
    bert_input = convert_example_to_feature(text)
    input_ids_list.append(bert_input['input_ids'])
    token_type_ids_list.append(bert_input['token_type_ids'])
    attention_mask_list.append(bert_input['attention_mask'])
    label_list.append([label])
  return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

In [10]:
# train dataset
training_encoded = encode_entries(train).shuffle(100).batch(batch_size)
# test dataset
testing_encoded = encode_entries(test).batch(batch_size)

                                                   text  label
1457  Many conservatives came to Hawleys defense sla...      0
269   Joe Biden appeared to suffer trouble with his ...      0
1212  It can take hard work creativity perseverance ...      1
450   Samoa on Thursday closed all nonessential publ...      1
454   Schlapps apology comes as the US is convulsed ...      1
...                                                 ...    ...
1253  He even wants a son to get a background check ...      0
1428  South Bend Mayor Pete Buttigieg a Democratic c...      1
949   Hundreds of white nationalists lined the stree...      0
1601  However there is evidence suggesting that havi...      1
1249  It may have taken Trump to point out the pitfa...      1

[1318 rows x 2 columns]




                                                   text  label
1666  Spencer speculated that the media may be soull...      1
683   Officials at Dartmouth College looked the othe...      1
385   As the worlds scientists and pharmaceutical co...      1
386   Though it was Major League Baseball that he me...      1
561   The fact that the abortion rate among American...      1
...                                                 ...    ...
318   Here are three obvious but important ways the ...      1
617   In an era in which domestic terrorism is an in...      1
1303  George Washington University GWs Parliamentary...      0
377   It is hard to argue that the highest income co...      1
1156  A Gallup poll reports that women are still mor...      0

[233 rows x 2 columns]




In [11]:
from transformers import TFBertForSequenceClassification, AutoTokenizer
learning_rate = 2e-5
number_of_epochs = 2
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Change dropout rate
model.bert.dropout = tf.keras.layers.Dropout(0.2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
print(model.bert.dropout)

<keras.layers.core.dropout.Dropout object at 0x00000273211A1ED0>


In [15]:
# Choose Adam as an optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
# Use sparse categorical cross entropy as loss function and sparse categorical accuracy as metric
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [16]:
bert_history = model.fit(training_encoded, epochs=number_of_epochs, validation_data=testing_encoded)

Epoch 1/2


KeyboardInterrupt: 

In [17]:
# Test the model
test_loss, test_acc = model.evaluate(testing_encoded)
print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

 1/15 [=>............................] - ETA: 3:28 - loss: 0.7259 - accuracy: 0.3750

KeyboardInterrupt: 

In [1]:
inputs = tokenizer("This is a random sentence to test the prediction.", return_tensors="tf")

logits = model(**inputs).logits

print(logits)
predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
print(predicted_class_id)

NameError: name 'tokenizer' is not defined

In [59]:
# Run eagerly is needed for the predictions of the model to work
tf.config.run_functions_eagerly(True)

In [84]:
# Show heatmap of test data
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues):
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

y_pred = model.predict(testing_encoded)



In [2]:
print(y_pred)
y_pred_bool = np.argmax(y_pred[0], axis=1)
plot_confusion_matrix(test['label'], y_pred_bool, classes=['0', '1'], normalize=True, title='Normalized confusion matrix')
plt.show()

NameError: name 'y_pred' is not defined

In [121]:
# Create keras prediction function
tf.data.experimental.enable_debug_mode()
# predictionInput = pd.DataFrame(["The president is a big fat liar.", "This a is a factual bit of text."], columns=['sentence'])
predictionInput = ["YouTube is making clear there will be no “birtherism” on its platform during this year’s U.S. presidential election – a belated response to a type of conspiracy theory more prevalent in the 2012 race."]
print(predictionInput)
classes = ['non-biased', 'biased']

def keras_predict(model, tokenizer, sentence):
    encoded_sentence = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=512, padding=False, return_attention_mask=True, return_tensors='tf')
    input_ids = encoded_sentence['input_ids']
    attention_masks = encoded_sentence['attention_mask']
    token_type_ids = encoded_sentence['token_type_ids']
    prediction = model.predict([input_ids, attention_masks, token_type_ids])
    return prediction

prediction = keras_predict(model, tokenizer, predictionInput)
print((prediction[0][0]))
print(np.argmax(prediction[0][0]))


['YouTube is making clear there will be no “birtherism” on its platform during this year’s U.S. presidential election – a belated response to a type of conspiracy theory more prevalent in the 2012 race.']
[0.1783105 0.2375525]
1


In [4]:
#Test prediciton on multiple sentences

predictionInput = pd.DataFrame(["The president is a big fat liar.", "This a is a factual bit of text."], columns=['sentence'])
print(predictionInput)
classes = ['non-biased', 'biased']
pred = []
for entry in predictionInput['sentence']:
  encoded_sentence = tokenizer.encode_plus(entry, add_special_tokens=True, max_length=512, padding=False, return_attention_mask=True, return_tensors='tf')
  input_ids = encoded_sentence['input_ids']
  attention_masks = encoded_sentence['attention_mask']
  token_type_ids = encoded_sentence['token_type_ids']
  prediction = model.predict([input_ids, attention_masks, token_type_ids])
  print(prediction)
  pred.append(np.argmax(prediction[0][0]))
  # Get keras tensor attention mask and input ids

print(pred)

NameError: name 'pd' is not defined

In [None]:
model.save('./bias_model', save_format='tf')