In [2]:
#Simple transformers is a wrapper library around the original transformers library.
!pip install simpletransformers

Collecting simpletransformers
[?25l  Downloading https://files.pythonhosted.org/packages/53/3f/0891d5db3a9f94c34ae9a1763dfec02dabe71ca16d55d02e2c6c24d01ef7/simpletransformers-0.50.0-py3-none-any.whl (220kB)
[K     |████████████████████████████████| 225kB 13.2MB/s eta 0:00:01
[?25hCollecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |████████████████████████████████| 51kB 8.1MB/s  eta 0:00:01
Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 11.7MB/s 
[?25hCollecting tqdm>=4.47.0
[?25l  Downloading https://files.pythonhosted.org/packages/8a/54/115f0c28a61d56674c3a5e05c46d6c3523ad196e1dcd3e2d8b119026df36/tqdm-4.54.1-py2.py3-none-any.whl (69kB)
[K    

In [3]:
# General imports
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Modelling Imports
import sklearn
import sklearn.metrics
import sklearn.preprocessing
import torch
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Relative locations of the NELA data folder and of the models folder
model_directory = './models'
data_directory = './data/NELA'

# General note: the inclusion of this model is more a reference point than an experiment. We aim to verify whether SOTA models are applicable to the problem and what their "raw" performance is. There is certainly a lot to build on in this section.

# 1. – Modelling! This section draws from the [HuggingFace documentation](https://huggingface.co/transformers/model_doc/electra.html) and the [simpletransformers documentation and examples](https://simpletransformers.ai/docs/installation/).<br> It was trained on Colab and needs a GPU to run.

## 1.1 Let's make the data a bit more BERT/Electra friendly

In [None]:
# Detect whether a CUDA-capable GPU is visible to PyTorch. `isGPU` is a plain
# bool; it is consumed further down as the "use_cuda" training setting.
isGPU = torch.cuda.is_available()
if not isGPU:
    # Training still works on CPU, it is just very slow.
    print('!Using CPU! Aint nobody got time for that!')

In [6]:
# Load the complete pre-processed dataset from the data directory
X = pd.read_csv(f'{data_directory}/complete_processed.csv')

In [7]:
# Keep only the text and label columns, in that order; the label stays inside X
# because X is what gets written to the train/test CSV files. y is the label column.
X = X.loc[:, ['full_preprocessed', 'label']]
y = X.loc[:, 'label']

In [8]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Persist the train/test splits as CSV files (without the index column).
# `DataFrame.to_csv` does not create missing parent folders, so make sure the
# target directory exists first (a no-op when it is already there).
bert_data_directory = '{}/BERT_data'.format(data_directory)
os.makedirs(bert_data_directory, exist_ok=True)
X_train.to_csv('{}/train.csv'.format(bert_data_directory), index=False)
X_test.to_csv('{}/test.csv'.format(bert_data_directory), index=False)

In [10]:
# Pre-trained checkpoint to fine-tune: model family and its Hugging Face model id
model_type, model_name = 'electra', 'google/electra-small-discriminator'

In [12]:
# Settings passed to the simpletransformers model as its `args`.
# Outputs and the best model are written under outputs/<model_type>.
train_args = dict(
    # Input handling and output locations
    reprocess_input_data=True,
    overwrite_output_dir=True,
    use_cached_eval_features=True,
    output_dir=f"outputs/{model_type}",
    best_model_dir=f"outputs/{model_type}/best_model",
    # Training schedule and in-training evaluation
    evaluate_during_training=True,
    max_seq_length=128,
    num_train_epochs=3,
    evaluate_during_training_steps=1000,
    # Checkpointing
    save_model_every_epoch=True,
    save_eval_checkpoints=True,
    # Batch sizes
    train_batch_size=64,
    eval_batch_size=32,
    # NOTE(review): confirm this key is honoured when supplied through `args`;
    # `use_cuda` looks like a ClassificationModel constructor argument.
    use_cuda=isGPU,
)


In [13]:
# Build the ELECTRA sequence classifier from the pre-trained checkpoint.
# `use_cuda` is an argument of the ClassificationModel constructor (it defaults
# to True), so the GPU flag must be passed here for the CPU fallback to take
# effect; a "use_cuda" entry inside `args` does not select the device.
model = ClassificationModel(model_type, model_name, args=train_args, use_cuda=isGPU)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=466.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=54245363.0), HTML(value='')))




Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['pooler.den

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))




In [14]:
# Read the saved splits back from disk: train.csv for training, test.csv for evaluation
train_df, eval_df = (
    pd.read_csv(f'{data_directory}/BERT_data/{split_name}.csv')
    for split_name in ('train', 'test')
)

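 The warnings below are OK: simpletransformers falls back to using column 0 as text and column 1 as labels, and the columns are already in that order.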

In [15]:
# Fine-tune on the training frame and evaluate against the held-out frame.
# Left as the last expression so the cell displays the returned training summary.
model.train_model(
    train_df,
    eval_df=eval_df,
)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=35183.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 3'), FloatProgress(value=0.0, max=550.0), HTML(value='')))




  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(HTML(value='Running Epoch 1 of 3'), FloatProgress(value=0.0, max=550.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 2 of 3'), FloatProgress(value=0.0, max=550.0), HTML(value='')))





(1650,
 {'eval_loss': [0.3834316757592288,
   0.3477488280968233,
   0.3409164823185314,
   0.3354724592783234],
  'fn': [930, 845, 886, 646],
  'fp': [594, 515, 450, 623],
  'global_step': [550, 1000, 1100, 1650],
  'mcc': [0.6552407363386015,
   0.6925867870254788,
   0.6995018388958103,
   0.711452656561507],
  'tn': [3828, 3907, 3972, 3799],
  'tp': [3444, 3529, 3488, 3728],
  'train_loss': [0.33131229877471924,
   0.283263623714447,
   0.30301621556282043,
   0.1706138551235199]})

## About 85.57 % accuracy at the final evaluation step ((tp + tn) / total = (3728 + 3799) / 8796)