# Natural Language Processing Project: Predicting Clickbait with BERT and LEAD3

Inspired by: https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

# Config

In [None]:
# Number of dataset instances (tweets) to read and process in this run.
BATCH_SIZE = 600

# Width every tokenized tweet is padded to (number of token ids per row).
MAX_TWEET_LENGTH = 140

# Longest input BERT is given: used to cut the lead-3 text (by characters)
# and as the padded width of the tokenized lead-3 text.
BERT_MAX_LENGTH = 512

# Which pretrained model to load. Supported values:
# - BERT        (bert-base-uncased)
# - DISTILBERT  (distilbert-base-uncased)
# Any other value raises an exception in the setup cell.
BERT_MODEL = 'DISTILBERT'

# When True, the first three sentences of the linked article are embedded as
# well and their features are concatenated to the tweet features.
LEAD3 = True

# When True, test samples are drawn with random.randint; when False they are
# picked deterministically by loop position.
RANDOM_SAMPLE_SPLIT = False


# Import modules and data

In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import sklearn.metrics as skm
import torch
import transformers as ppb
import random
import copy
import warnings
warnings.filterwarnings('ignore')


In [None]:
# `prefix` is prepended to every dataset path opened below.
#
# Colab: mount Google Drive and point `prefix` at the project folder by
# uncommenting the following lines.
# from google.colab import drive
# drive.mount('/content/drive')

# prefix = 'drive/MyDrive/clickbait/'

# Local: dataset paths are resolved relative to the working directory.
prefix = ''

# Import files

In [None]:
# Read at most BATCH_SIZE raw JSON lines. The file is streamed, so the rest of
# the dataset is never loaded into memory. `instances_list` stays a list of raw
# lines because the truth-loading cell iterates over it again.
instances_list = []
with open(prefix + 'clickbait17-validation-170630/instances.jsonl', 'r', encoding='utf-8') as instances_file:
    for raw_line in instances_file:
        if len(instances_list) >= BATCH_SIZE:
            break
        instances_list.append(raw_line)

# Sentence delimiter used to cut paragraphs into sentences for lead-3.
separator = ". "


def _lead3_text(paragraphs, max_sentences=3, max_chars=BERT_MAX_LENGTH):
    """Return the first `max_sentences` sentences of `paragraphs` as one string.

    Paragraphs are split on `separator`. Fragments that are empty or contain
    only whitespace (e.g. from an empty paragraph, or from a paragraph that
    ends with the separator) are skipped so they do not use up one of the
    sentence slots. The delimiter removed by the split is restored after each
    sentence. The result is cut to `max_chars` characters; note that this is
    a character limit, not a token limit.

    Returns an empty string when no sentence is found.
    """
    sentences = []
    for paragraph in paragraphs:
        for fragment in paragraph.split(separator):
            if not fragment.strip():
                continue
            # A fragment that still ends with '.' was the last sentence of its
            # paragraph and only needs the trailing space back.
            suffix = " " if fragment.endswith(".") else separator
            sentences.append(fragment + suffix)
            if len(sentences) >= max_sentences:
                return "".join(sentences)[:max_chars]
    return "".join(sentences)[:max_chars]


instances = []   # parsed JSON object of every instance
ids = []         # instance ids, same order as `instances`
tweets = []      # raw 'postText' value of every instance
lead3s = []      # lead-3 text per instance (only filled when LEAD3 is set)

for json_str in instances_list:
    my_json = json.loads(json_str)
    instances.append(my_json)
    ids.append(my_json['id'])
    tweets.append(my_json['postText'])
    if LEAD3:
        lead3s.append(_lead3_text(my_json['targetParagraphs']))


In [None]:
# Ids of the instances in this batch, in the order they were loaded.
batch_ids = [json.loads(json_str_instance)['id'] for json_str_instance in instances_list[:BATCH_SIZE]]
wanted_ids = set(batch_ids)

# Scan the truth file once and keep only the records that belong to the batch.
# This replaces re-parsing the whole file for every single instance.
truth_by_id = {}
with open(prefix + 'clickbait17-validation-170630/truth.jsonl', 'r', encoding='utf-8') as truths_file:
    for json_str_truth in truths_file:
        my_json_truth = json.loads(json_str_truth)
        if my_json_truth['id'] in wanted_ids:
            # setdefault keeps the first record seen for an id.
            truth_by_id.setdefault(my_json_truth['id'], my_json_truth)

# Truth records in the same order as the instances.
truths = [truth_by_id[instance_id] for instance_id in batch_ids if instance_id in truth_by_id]

# An instance without a truth record is skipped, which shifts every following
# label relative to its features. Make that visible instead of silent.
missing_ids = [instance_id for instance_id in batch_ids if instance_id not in truth_by_id]
if missing_ids:
    print("WARNING: no truth record for " + str(len(missing_ids)) + " instance(s); labels will not line up with the instances: " + str(missing_ids))

print("Truths loaded: " + str(len(truths)))

In [None]:
# Regression targets: each 'truthMean' score is shifted and stretched so that
# the original [0, 1] scale becomes the symmetric [-1, 1] range.
labels = [(truth_record['truthMean'] - 0.5) * 2 for truth_record in truths]

# BERT



## Setup

In [None]:
print("Chosen BERT model:", BERT_MODEL)

# Supported variants: (model class name, tokenizer class name, checkpoint).
# Class names are resolved on the `transformers` module only for the variant
# that was actually selected.
_BERT_VARIANTS = {
    "BERT": ("BertModel", "BertTokenizer", 'bert-base-uncased'),
    "DISTILBERT": ("DistilBertModel", "DistilBertTokenizer", 'distilbert-base-uncased'),
}

if BERT_MODEL not in _BERT_VARIANTS:
    raise Exception("Invalid BERT model!")

_model_attr, _tokenizer_attr, pretrained_weights = _BERT_VARIANTS[BERT_MODEL]
model_class = getattr(ppb, _model_attr)
tokenizer_class = getattr(ppb, _tokenizer_attr)

# Load the pretrained tokenizer first, then the pretrained model.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

## Tokenize


In [None]:
# Tokenize every tweet. Sequences are truncated to MAX_TWEET_LENGTH tokens,
# the fixed row width the padding step fills up to; a longer sequence could
# not be stacked into the padded matrix.
tokenized = [
    tokenizer(
        instance['postText'],
        add_special_tokens=True,
        truncation=True,
        max_length=MAX_TWEET_LENGTH,
    )
    for instance in instances
]

# Tokenize the lead-3 texts the same way, limited to BERT_MAX_LENGTH tokens.
# The character cut applied when the lead-3 text is built does not by itself
# guarantee a token count within that limit.
tokenized_lead3 = []
if LEAD3:
    tokenized_lead3 = [
        tokenizer(
            lead3_text,
            add_special_tokens=True,
            truncation=True,
            max_length=BERT_MAX_LENGTH,
        )
        for lead3_text in lead3s
    ]

## Add padding

In [None]:
def _pad_token_rows(token_rows, width, pad_id=0):
    """Right-pad token-id sequences to `width` and stack them into a matrix.

    Args:
        token_rows: sequence of token-id sequences.
        width: number of columns of the result.
        pad_id: id appended to short rows (0, which the attention-mask step
            treats as padding).

    Returns:
        Integer array of shape (len(token_rows), width); (0, width) when
        `token_rows` is empty.

    Raises:
        ValueError: if a row is longer than `width`.
    """
    rows = []
    for row_number, token_ids in enumerate(token_rows):
        missing = width - len(token_ids)
        if missing < 0:
            raise ValueError(
                "Token sequence " + str(row_number) + " has " + str(len(token_ids))
                + " tokens, more than the padded width " + str(width)
            )
        rows.append(list(token_ids) + [pad_id] * missing)
    # Build the array once; growing it row by row copies it on every step.
    # The reshape keeps the result 2-D even when there are no rows.
    return np.array(rows, dtype=np.int64).reshape(len(rows), width)


# Tweet encodings are indexed with [0] to take the first sequence --
# presumably because 'postText' is a list, so the tokenizer returns a batch
# (TODO confirm against the dataset).
padded = _pad_token_rows([item['input_ids'][0] for item in tokenized], MAX_TWEET_LENGTH)

padded_lead3 = np.array([])
if LEAD3:
    padded_lead3 = _pad_token_rows([item['input_ids'] for item in tokenized_lead3], BERT_MAX_LENGTH)

print(padded.shape)

if LEAD3:
    print(padded_lead3.shape)

## Mask the padding for BERT

In [None]:
# Build the attention masks: positions holding the padding id 0 get a 0,
# every other position gets a 1.
attention_mask = np.where(padded == 0, 0, 1)
print(attention_mask.shape)

if LEAD3:
    # Same rule for the padded lead-3 token matrix.
    attention_mask_lead3 = np.where(padded_lead3 == 0, 0, 1)
    print(attention_mask_lead3.shape)


## BERT learns
**This takes a long time (a few minutes for 200 tweets).**

In [None]:
# Number of sequences sent through the model per forward pass. Memory use
# grows with the number of sequences processed at once, so the data is
# encoded in chunks instead of in a single call.
ENCODE_CHUNK_SIZE = 16


def _encode_in_chunks(ids_tensor, mask_tensor, chunk_size=ENCODE_CHUNK_SIZE):
    """Run `model` over the inputs chunk by chunk, without gradients.

    Args:
        ids_tensor: token ids, one row per sequence.
        mask_tensor: attention mask with the same layout as `ids_tensor`.
        chunk_size: number of rows per forward pass.

    Returns:
        A 1-tuple holding the hidden states of all rows concatenated along
        the first dimension, so the result is read with `[0]` exactly like
        the output of a direct model call.
    """
    hidden_chunks = []
    with torch.no_grad():
        for start in range(0, len(ids_tensor), chunk_size):
            stop = start + chunk_size
            chunk_output = model(ids_tensor[start:stop], attention_mask=mask_tensor[start:stop])
            # Element 0 of the model output is the last hidden state.
            hidden_chunks.append(chunk_output[0])
        if not hidden_chunks:
            # No rows at all: fall back to one direct call so empty input
            # behaves like an unchunked forward pass.
            return (model(ids_tensor, attention_mask=mask_tensor)[0],)
    return (torch.cat(hidden_chunks, dim=0),)


if LEAD3:
    input_ids_lead3 = torch.tensor(padded_lead3)
    attention_mask_lead3 = torch.tensor(attention_mask_lead3)
    last_hidden_states_lead3 = _encode_in_chunks(input_ids_lead3, attention_mask_lead3)

input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)
last_hidden_states = _encode_in_chunks(input_ids, attention_mask)


In [None]:
def _first_token_vectors(model_output):
    """Return the hidden state at token position 0 of every sequence as a NumPy array.

    `model_output[0]` is the hidden-state tensor; position 0 is the first
    token of each sequence (the [CLS] token for BERT-style tokenizers).
    """
    hidden_states = model_output[0]
    return hidden_states[:, 0, :].numpy()


features = _first_token_vectors(last_hidden_states)
print(features.shape)

if LEAD3:
    features_lead3 = _first_token_vectors(last_hidden_states_lead3)
    print(features_lead3.shape)

# Regression


## Split 


In [None]:
####### CONFIGURATION #######
ratio = 0.25 # test set ratio
#############################

# Features and labels are matched by position; a length mismatch (e.g. an
# instance without a truth record) would pair samples with wrong labels.
if len(labels) != len(features):
    raise ValueError(
        "Number of labels (" + str(len(labels)) + ") does not match number of feature rows ("
        + str(len(features)) + ")"
    )

print("Splitting " + str(len(labels)) + " entities with split ratio " + str(ratio) + "...")


def _tweet_text(post_text):
    """Return a tweet's text as one string ('postText' may be a string or a list of strings)."""
    if isinstance(post_text, str):
        return post_text
    return " ".join(str(part) for part in post_text)


# Choose the test samples by ORIGINAL index. `remaining` holds the original
# indices still in the training pool; a position is popped from it on every
# step. Tracking original indices keeps features, labels, tweet texts and ids
# of the test set aligned (ids used to be looked up with a position in the
# already-shrunk pool, which pointed at the wrong instance).
#
# NOTE: in deterministic mode position i is popped from a pool that has
# already lost i entries, so the samples picked are the original indices
# 0, 2, 4, ... This selection order is kept unchanged. It also means the
# deterministic mode only works for ratio <= 0.5.
remaining = list(range(len(labels)))
test_indices = []
for i in range(int(ratio * len(labels))):
    if RANDOM_SAMPLE_SPLIT:
        position = random.randint(0, len(remaining) - 1)
    else:
        position = i
    test_indices.append(remaining.pop(position))

test_rows = np.array(test_indices, dtype=np.intp)
train_rows = np.array(remaining, dtype=np.intp)

# Test set, in the order the samples were drawn.
test_features = features[test_rows]
test_labels = [labels[j] for j in test_indices]
test_ids = [ids[j] for j in test_indices]
test_tweetText = np.array([_tweet_text(tweets[j]) for j in test_indices])

# Training set: everything that was not drawn, in original order.
train_features = features[train_rows]
train_labels = [labels[j] for j in remaining]

# Concat the features
if LEAD3:
    test_features_lead3 = features_lead3[test_rows]
    train_features_lead3 = features_lead3[train_rows]
    train_features = np.concatenate((train_features, train_features_lead3), axis=1)
    test_features = np.concatenate((test_features, test_features_lead3), axis=1)

print('train_features.shape', train_features.shape)
print('test_features.shape', test_features.shape)

print("Succeeded in splitting dataset of length " + str(len(labels)) + " into training and testing sets. Constructed list of " + str(len(train_labels)) + " and " + str(len(test_labels)) + " respectively.")

## Fit regressors

In [None]:
##################### CONFIG ##########################
# Format spec applied to the error percentages printed per model.
SIG = '.20f' # Output format and significant decimals
# "ALL" runs every entry of `models`; any other value is used as the key of
# the single model to run.
TO_RUN = "ALL"
# When True, `models` wraps each estimator in a RandomizedSearchCV; when
# False, the fixed configurations are used.
OPTIMIZE_HYPERPARAMS = False
#######################################################

###### DEFINITIONS ######
# The two constants below are combined into `divs` (PERCENTAGE_MULT / REMAP_SIZE_FACT).
REMAP_SIZE_FACT = 2
PERCENTAGE_MULT = 100
#########################

# Import models
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.model_selection import RepeatedKFold, RandomizedSearchCV
from scipy.stats import loguniform, uniform 
# Run a model
def run_model(modelName, train_labels, train_features, test_labels, test_features, avgTop, writeToArray=True):
    """Fit one regressor and report its absolute error on the test set.

    Args:
        modelName: key into the global `models` dict; the entry is called
            with (train_features, train_labels) and must return the fitted
            estimator.
        train_labels: training targets.
        train_features: training feature matrix.
        test_labels: test targets, one per row of `test_features`.
        test_features: test feature matrix.
        avgTop: best (lowest) average error percentage seen so far.
        writeToArray: when True, the truths and predictions are returned
            alongside the score; when False the two returned lists are empty.

    Returns:
        `[avg, truths, predictions]` if this model's average error percentage
        is below `avgTop`, otherwise an empty list.

    Raises:
        ValueError: if the test set is empty.
    """
    if OPTIMIZE_HYPERPARAMS:
        print("Started optimization")
    model = models[modelName](train_features, train_labels)
    if OPTIMIZE_HYPERPARAMS:
        print('Best Score: %s' % model.best_score_)
        print('Best Hyperparameters: %s' % model.best_params_)

    truth_values = np.array(test_labels, dtype=float)
    if truth_values.size == 0:
        raise ValueError("Cannot evaluate model '" + str(modelName) + "': the test set is empty")

    # Predict the whole test set in one call instead of one sample at a time.
    predicted = np.array(model.predict(test_features), dtype=float).ravel()
    abs_errors = abs(predicted - truth_values)

    # The error is divided by 2 and multiplied by 100 to express it as a
    # percentage; the labels were mapped to the [-1, 1] range, whose width is 2.
    avg = abs_errors.mean() / 2 * 100
    maxval = abs_errors.max()
    print('{:>14}'.format(modelName) + ":\t\t" + str(format(avg, SIG))+"% \tmaxval: " + str(format(maxval/2*100, SIG))+"%")

    if avg < avgTop:
        truthsLoc = list(test_labels) if writeToArray else []
        predictionsLoc = list(predicted) if writeToArray else []
        return [avg, truthsLoc, predictionsLoc]
    return []

# Results of the best model so far. `avgTop` starts above 100 so that the
# first model with an average error below 101% is accepted.
predictions = []
truths = []
avgTop = 101.0

# Cross-validation scheme shared by every randomized search.
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

# ---- Candidate values per hyperparameter ----
booleanOptions = [True, False]
solverOptions = ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
alphaOptions = loguniform(1e-5, 100)
maxIterOptions = [500*i for i in range(1,21)]
lossOptions = ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
penaltyOptions = ['l2', 'l1', 'elasticnet']
l1ROptions = [i * 0.06 for i in range(16)]
# Fixed: the `for i` was missing, so this used to be a one-element list
# holding a boolean (`... in range(16)` was evaluated as a membership test)
# instead of 16 epsilon candidates.
epsilonOptions = [0.001 + i * 0.02 for i in range(16)]
learningRateOptions = ['constant','optimal','invscaling']
eta0Options = [0.001, 0.01, 0.1]
powerTOptions = [0.05+0.1*i for i in range(6)]
nJobs = [-1]
lossOptions2 = ['ls','lad','huber','quantile']
learningRateOptions2 = [0.01, 0.05, 0.1, 0.2]
estimatorOptions = [50, 100, 150, 200, 250]
subsampleOptions = [0.8, 0.9, 1]
criterionOptions = ['friedman_mse', 'mse']
maxDepthOptions = [2,3,4,5]
maxFeaturesOptions = ['auto', 'sqrt', 'log2']
learningRateOptions3 = [0.5,0.75,1]
lossOptions3 = ['linear','square','exponential']
bootstrapSampleF = [0.1,0.2,0.3]

# ---- Search space per model (parameter name -> candidates) ----
linearSpace = {
    'fit_intercept': booleanOptions,
    'normalize': booleanOptions,
    'n_jobs': nJobs,
    # 'positive': booleanOptions,
}
ridgeSpace = {
    'fit_intercept': booleanOptions,
    'normalize': booleanOptions,
    'solver': solverOptions,
    'alpha': alphaOptions,
    'max_iter': maxIterOptions,
}
ridgeCVSpace = {
    'fit_intercept': booleanOptions,
    'normalize': booleanOptions,
}
SGDSpace = {
    'fit_intercept': booleanOptions,
    'alpha': alphaOptions,
    'max_iter': maxIterOptions,
    'loss': lossOptions,
    'penalty': penaltyOptions,
    'l1_ratio': l1ROptions,
    'shuffle': booleanOptions,
    'epsilon': epsilonOptions,
    'eta0': eta0Options,
    'power_t': powerTOptions,
    'warm_start': booleanOptions,
    'learning_rate': learningRateOptions,
}
# NOTE(review): GBSpace reuses the log-uniform alpha range (up to 100) of the
# linear models -- confirm this range is valid for GradientBoostingRegressor.
GBSpace = {
    'alpha': alphaOptions,
    'loss': lossOptions2,
    'learning_rate': learningRateOptions2,
    'n_estimators': estimatorOptions,
    'subsample': subsampleOptions,
    'criterion': criterionOptions,
    'max_depth': maxDepthOptions,
    'max_features': maxFeaturesOptions,
    'warm_start': booleanOptions,
}
RFSpace = {
    'n_estimators': estimatorOptions,
    'max_features': maxFeaturesOptions,
    'bootstrap': booleanOptions,
    'warm_start': booleanOptions,
}
ABSpace = {
    'n_estimators': estimatorOptions,
    'learning_rate': learningRateOptions3,
    'loss': lossOptions3,
}
ETSpace = {
    'n_estimators': estimatorOptions,
    'max_features': maxFeaturesOptions,
    'bootstrap': booleanOptions,
    'warm_start': booleanOptions,
    'max_samples': bootstrapSampleF,
}

# Compute divs.
divs = PERCENTAGE_MULT/REMAP_SIZE_FACT
def _randomized_search(estimator, space):
    """Wrap `estimator` in the randomized hyperparameter search shared by all models."""
    return RandomizedSearchCV(estimator, space, n_iter=500, scoring='neg_mean_squared_error', n_jobs=-1, cv=cv, random_state=1, verbose=3)


# `models` maps a model name to the bound `fit` method of a ready-configured
# estimator; calling an entry with (features, labels) trains it.
if OPTIMIZE_HYPERPARAMS:
    # Search mode: every estimator is tuned over its own search space.
    models = {
        'LINEAR': _randomized_search(LinearRegression(), linearSpace).fit,
        'RIDGE': _randomized_search(Ridge(), ridgeSpace).fit,
        # 'RIGDECV': _randomized_search(RidgeCV(), ridgeCVSpace).fit,
        'SGD': _randomized_search(SGDRegressor(), SGDSpace).fit,
        'GB': _randomized_search(GradientBoostingRegressor(), GBSpace).fit,
        'RF': _randomized_search(RandomForestRegressor(), RFSpace).fit,
        'AB': _randomized_search(AdaBoostRegressor(), ABSpace).fit,
        'ET': _randomized_search(ExtraTreesRegressor(), ETSpace).fit,
    }
else:
    # Fixed mode: each model appears twice, once with hand-set
    # hyperparameters and once ("...DEFAULT") with the library defaults.
    models = {
        'LINEAR': LinearRegression(normalize=True, n_jobs=-1, fit_intercept=False).fit,
        'LINEARDEFAULT': LinearRegression().fit,
        'RIDGE': Ridge(
            alpha=23.830583569850845,
            fit_intercept=False,
            max_iter=7000,
            normalize=True,
            solver='lsqr',
        ).fit,
        'RIDGEDEFAULT': Ridge().fit,
        # 'RIGDECV': RidgeCV(normalize=True, fit_intercept=True).fit,
        'SGD': SGDRegressor(
            alpha=0.0002894244244084805,
            epsilon=False,
            eta0=0.001,
            fit_intercept=False,
            l1_ratio=0.12,
            learning_rate='invscaling',
            loss='squared_epsilon_insensitive',
            max_iter=7000,
            penalty='l1',
            power_t=0.15000000000000002,
            shuffle=True,
            warm_start=True,
        ).fit,
        'SGDDEFAULT': SGDRegressor().fit,
        'GB': GradientBoostingRegressor(
            alpha=1.565766957731208e-05,
            criterion='mse',
            learning_rate=0.1,
            loss='ls',
            max_depth=2,
            max_features='log2',
            n_estimators=200,
            subsample=1,
            warm_start=False,
        ).fit,
        'GBDEFAULT': GradientBoostingRegressor().fit,
        'RF': RandomForestRegressor(warm_start=True, n_estimators=150, max_features='auto', bootstrap=True).fit,
        'RFDEFAULT': RandomForestRegressor().fit,
        'AB': AdaBoostRegressor(n_estimators=200, loss='linear', learning_rate=1).fit,
        'ABDEFAULT': AdaBoostRegressor().fit,
        'ET': ExtraTreesRegressor(
            warm_start=False,
            n_estimators=150,
            max_samples=0.3,
            max_features='auto',
            bootstrap=False,
        ).fit,
        'ETDEFAULT': ExtraTreesRegressor().fit,
    }
    

if TO_RUN == "ALL":
    # Run every model and keep the results of the one with the lowest
    # average error; run_model returns an empty list for a model that does
    # not beat the current best.
    for modelName in models:
        result = run_model(modelName, train_labels, train_features, test_labels, test_features, avgTop)
        if result:
            avgTop, truths, predictions = result
else:
    # A single model was requested: report it whatever its score is. Passing
    # an infinite threshold makes run_model return its results instead of an
    # empty list, which previously broke the three-way unpacking below.
    result = run_model(TO_RUN, train_labels, train_features, test_labels, test_features, float("inf"))
    if not result:
        raise RuntimeError("Model '" + str(TO_RUN) + "' did not produce a comparable average error")
    avgTop, truths, predictions = result
print("Used model with average: " + str(avgTop)+"%")

## Evaluate

In [None]:
########### CONFIG ##########
# Scores above this value count as clickbait (threshold on the original scale).
THRESHOLD_CLICKBAIT = 0.5
#############################

######### CONSTANTS #########
# Terminal escape sequences used to underline the section titles.
UNDERLINE = '\033[4m'
END = '\033[0m'
#############################


def _to_original_scale(scores):
    """Undo the label remapping: halve each score and shift it by 0.5."""
    return [score / 2 + 0.5 for score in scores]


truths = _to_original_scale(truths)
predictions = _to_original_scale(predictions)

def normalized_mean_squared_error(truth, predictions):
    """Return the MSE of `predictions` relative to a constant mean predictor.

    The normalizer is the mean squared error obtained by predicting the mean
    of `truth` for every sample; it is printed before the ratio is computed.
    """
    mean_baseline = np.full(len(truth), np.mean(truth))
    norm = skm.mean_squared_error(truth, mean_baseline)
    print('norm', norm)
    model_mse = skm.mean_squared_error(truth, predictions)
    return model_mse / norm


# Metrics computed on the continuous scores.
regression_measures = {
    'Explained variance': skm.explained_variance_score,
    'Mean absolute error': skm.mean_absolute_error,
    'Mean squared error': skm.mean_squared_error,
    'Median absolute error': skm.median_absolute_error,
    'R2 score': skm.r2_score,
    'Normalized mean squared error': normalized_mean_squared_error,
}

# Metrics computed after thresholding the scores into two classes.
classification_measures = {
    'Accuracy': skm.accuracy_score,
    'Precision': skm.precision_score,
    'Recall': skm.recall_score,
    'F1 score': skm.f1_score,
}


def _print_heading(title):
    """Print `title` underlined."""
    print(UNDERLINE + title + END)


def _binarize(scores):
    """Map each score to 0 (at or below the clickbait threshold) or 1 (otherwise)."""
    return [0 if score <= THRESHOLD_CLICKBAIT else 1 for score in scores]


_print_heading('\nDataset Stats')
print('Size: ' + str(len(truths)))
sum_clickbait = len([score for score in truths if score > THRESHOLD_CLICKBAIT])
print('#Clickbait', sum_clickbait)
print('#No-Clickbait', len(truths) - sum_clickbait)

_print_heading('\nRegression scores')
for name, measure in regression_measures.items():
    print(name, measure(truths, predictions))

_print_heading('\nBinary classification scores')
classes = _binarize(truths)
predictionsDiscrete = _binarize(predictions)
for name, measure in classification_measures.items():
    print(name, measure(classes, predictionsDiscrete))

_print_heading('\nClassification report')
print(skm.classification_report(classes, predictionsDiscrete))

In [None]:
# Scratch cell: show the names of the parameters a GradientBoostingRegressor
# accepts (handy when editing its search space).
GradientBoostingRegressor().get_params().keys()

In [None]:
# How many of the worst / best predicted tweets to collect.
TOP_K = 10

# Absolute error of every test sample. `predictions`, `truths` and
# `test_tweetText` are read by the same index.
abs_errors = [abs(prediction - truth) for prediction, truth in zip(predictions, truths)]

# Sample indices ordered from smallest to largest error.
ranked = sorted(range(len(abs_errors)), key=lambda sample: abs_errors[sample])


def _error_rows(sample_indices):
    """Build [index, tweet text, prediction, truth, absolute error] rows."""
    return [
        [sample, test_tweetText[sample], predictions[sample], truths[sample], abs_errors[sample]]
        for sample in sample_indices
    ]


# Worst predictions first / best predictions first. With fewer than TOP_K
# test samples the lists are simply shorter; no placeholder rows are emitted.
topIndices = _error_rows(ranked[::-1][:TOP_K])
bottomIndices = _error_rows(ranked[:TOP_K])

In [None]:
# Tweets with the largest prediction error. The stored predictions and truths
# were already mapped back to the original scale in the evaluation cell
# (`i / 2 + 0.5`), so they are printed as they are; applying the mapping a
# second time (and halving the difference) reported wrong numbers.
for topIndice in topIndices:
    index, tweet, prediction, truth, difference = topIndice
    print("Tweet " + str(index) + ": | " + str(tweet) + " | was prediced to be " + str(prediction) + " yet was in truth " + str(truth) + " which differs by " + str(difference))

In [None]:
# Tweets with the smallest prediction error. The stored predictions and truths
# were already mapped back to the original scale in the evaluation cell
# (`i / 2 + 0.5`), so they are printed as they are; applying the mapping a
# second time (and halving the difference) reported wrong numbers.
for bottomIndice in bottomIndices:
    index, tweet, prediction, truth, difference = bottomIndice
    print("Tweet " + str(index) + ": | " + str(tweet) + " | was prediced to be " + str(prediction) + " yet was in truth " + str(truth) + " which differs by " + str(difference))