In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so later
# cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Install dependencies**

In [2]:
# pytorch-pretrained-bert provides the BertTokenizer / BertForSequenceClassification
# classes imported in the next cell.
!pip install pytorch-pretrained-bert pytorch-nlp

# NOTE(review): `transformers` is imported later but never installed explicitly;
# presumably it arrives as a dependency of sentence-transformers -- confirm.
!pip install sentence-transformers

# keras supplies `pad_sequences`, which pads/truncates the token-id sequences.
!pip install keras

# NOTE(review): awscli, spacy and ftfy are not referenced by any code visible in
# this notebook -- confirm they are still required before keeping these installs.
!pip install awscli --ignore-installed six

!pip install spacy ftfy==4.4.3
!python -m spacy download en

Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[?25l[K     |██▋                             | 10 kB 21.1 MB/s eta 0:00:01[K     |█████▎                          | 20 kB 9.6 MB/s eta 0:00:01[K     |████████                        | 30 kB 8.4 MB/s eta 0:00:01[K     |██████████▋                     | 40 kB 7.7 MB/s eta 0:00:01[K     |█████████████▎                  | 51 kB 5.0 MB/s eta 0:00:01[K     |███████████████▉                | 61 kB 5.5 MB/s eta 0:00:01[K     |██████████████████▌             | 71 kB 5.5 MB/s eta 0:00:01[K     |█████████████████████▏          | 81 kB 6.1 MB/s eta 0:00:01[K     |███████████████████████▉        | 92 kB 6.1 MB/s eta 0:00:01[K     |██████████████████████████▌     | 102 kB 5.2 MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112 kB 5.2 MB/s eta 0:00:01[K     |███████████████████████████████▊| 122 kB 5.2 MB/s eta 0:00:01[K     |████████████████████████████████| 1

Collecting ftfy==4.4.3
  Downloading ftfy-4.4.3.tar.gz (50 kB)
[K     |████████████████████████████████| 50 kB 3.1 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Using cached urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25l[?25hdone
  Created wheel for ftfy: filename=ftfy-4.4.3-py3-none-any.whl size=41082 sha256=76cd57c20bd699a3a626c4889503a8a95c3b7c202d813a926a30cc2a5367a987
  Stored in directory: /root/.cache/pip/wheels/b0/66/08/c65b9e8a3b674f10739790db0cbbc846afaa20a3f80f0b9e42
Successfully built ftfy
Installing collected packages: urllib3, ftfy
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.26.7
    Uninstalling urllib3-1.26.7:
      Successfully uninstalled urllib3-1.26.7
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
go

In [3]:
import torch
import numpy as np
from pytorch_pretrained_bert import BertTokenizer, BertConfig, OpenAIGPTModel, OpenAIGPTTokenizer
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from transformers import pipeline

from keras.preprocessing.sequence import pad_sequences

**Load the pre-trained model weights from the mounted Google Drive directory**

In [4]:
def load_model(file_str): # path_to_model -> pytorch model
    """Build a 3-label BERT sequence classifier and load saved weights into it.

    Args:
        file_str: Path to a serialized state dict (e.g. ``bert.bin``) readable
            by ``torch.load``.

    Returns:
        The ``BertForSequenceClassification`` model, switched to evaluation
        mode.
    """
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
    # map_location="cpu" lets a checkpoint that was saved from a GPU session
    # load on a CPU-only runtime; load_state_dict then copies the values into
    # the model's own parameters, so the resulting model is unchanged.
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    state_dict = torch.load(file_str, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()
    return model

**Map the predicted class to the corresponding label**

In [5]:
def map(prediction):
  """Translate a predicted class index (0, 1 or 2) into its label string.

  Raises ``KeyError`` for any other index. Note that this function shadows
  Python's builtin ``map`` within the notebook.
  """
  labels_by_class = {
      0: "legitimate information",
      1: "misinformation",
      2: "irrelevant",
  }
  return labels_by_class[prediction]

**Pass in sample text to classify**

In [6]:
def inference(tweet_txt, model, tokenizer=None): # tweet_string -> label_string
  """Classify a single tweet and return its label string.

  Args:
    tweet_txt: Text of the tweet to classify (converted with ``str``).
    model: Sequence-classification model that returns a logits tensor when
      called with ``(input_ids, token_type_ids=None, attention_mask=...)``.
    tokenizer: Optional tokenizer to reuse across calls. When omitted, the
      'bert-base-uncased' tokenizer is loaded on every call, as before.

  Returns:
    The label produced by the notebook-level ``map`` function for the class
    with the highest logit.
  """
  if tokenizer is None:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

  # Wrap the text in BERT's special tokens before tokenizing.
  tokens = tokenizer.tokenize("[CLS] " + str(tweet_txt) + " [SEP]")
  token_ids = tokenizer.convert_tokens_to_ids(tokens)

  # Pad / truncate to a fixed length of 90 ids. With truncating="post", inputs
  # longer than 90 tokens lose their tail (including the trailing [SEP]).
  # NOTE(review): kept unchanged, presumably to match the preprocessing used
  # during training -- confirm.
  input_ids = pad_sequences([token_ids], maxlen=90, dtype="long", truncating="post", padding="post")
  test_inputs = torch.tensor(input_ids, dtype = torch.long)

  # Attention mask: 1 for real tokens, 0 for padding (pad id is 0).
  test_masks = (test_inputs > 0).long()

  with torch.no_grad():
    # Forward pass, calculate logit predictions
    logits = model(test_inputs, token_type_ids=None, attention_mask=test_masks)

  logits = logits.detach().cpu().numpy()
  predicted_class = int(np.argmax(logits, axis=1)[0])

  # `map` here is the notebook's label-mapping function, not the builtin.
  return map(predicted_class)

Work in progress: wrapping our model in a custom Hugging Face `Pipeline` for the fake-news classification task

In [None]:
from transformers import Pipeline

class FakeNewsClassificationPipeline(Pipeline):
    """Custom ``transformers`` pipeline for the fake-news classifier.

    Mirrors the preprocessing done by ``inference``: wrap the text in
    ``[CLS]``/``[SEP]``, tokenize, pad/truncate to 90 ids, run the model and
    return softmax probabilities over the classes.

    NOTE(review): this cell is unexecuted in the saved notebook; end-to-end
    behavior with the legacy ``pytorch_pretrained_bert`` model is unverified.
    """

    def _sanitize_parameters(self, **kwargs):
        """Split keyword arguments into (preprocess, forward, postprocess) dicts.

        Only ``maybe_arg`` is forwarded, because it is the only keyword that
        ``preprocess`` accepts; forwarding any other key would make the
        ``preprocess`` call fail with an unexpected-keyword error.
        """
        preprocess_kwargs = {}
        if "maybe_arg" in kwargs:
            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, inputs, maybe_arg=1):
        """Turn raw text into padded token ids plus an attention mask.

        Args:
            inputs: A single string, or an iterable of strings.
            maybe_arg: Unused placeholder, kept for interface compatibility.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` long tensors, one
            row of 90 ids per input text.
        """
        # A bare string must be wrapped; iterating it directly would treat
        # every character as a separate text.
        texts = [inputs] if isinstance(inputs, str) else list(inputs)

        # Prefer the tokenizer handed to the pipeline; fall back to the
        # default BERT tokenizer when none was supplied.
        tokenizer = getattr(self, "tokenizer", None)
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        tokenized_texts = [tokenizer.tokenize("[CLS] " + str(text) + " [SEP]") for text in texts]
        input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
        input_ids = pad_sequences(input_ids, maxlen=90, dtype="long", truncating="post", padding="post")
        input_ids = torch.tensor(input_ids, dtype=torch.long)

        # 1 for real tokens, 0 for padding (pad id is 0).
        attention_mask = (input_ids > 0).long()
        return {"input_ids": input_ids, "attention_mask": attention_mask}

    def _forward(self, model_inputs):
        """Run the model and return its output as a mapping with ``"logits"``."""
        # model_inputs == {"input_ids": ..., "attention_mask": ...}
        outputs = self.model(**model_inputs)
        # The legacy BERT classifier used elsewhere in this notebook returns a
        # bare logits tensor; wrap it so postprocess can index by "logits".
        if torch.is_tensor(outputs):
            return {"logits": outputs}
        return outputs

    def postprocess(self, model_outputs):
        """Convert logits into class probabilities via softmax over the last axis."""
        probabilities = model_outputs["logits"].softmax(-1)
        return probabilities


In [None]:
def classify(tweet_txt, model, tokenizer):
  """Run ``tweet_txt`` through a ``FakeNewsClassificationPipeline``.

  Args:
    tweet_txt: Text (or list of texts) to classify.
    model: Model the pipeline should wrap.
    tokenizer: Tokenizer handed to the pipeline.

  Returns:
    Whatever the pipeline call returns for ``tweet_txt``.
  """
  # The task name must be passed by keyword: the first positional parameter
  # of Pipeline.__init__ is `model`, so a positional task string would clash
  # with the `model=` keyword and raise TypeError.
  classification = FakeNewsClassificationPipeline(model = model, tokenizer = tokenizer, task = "fake-news-classification")
  return classification(tweet_txt)

Load our pre-trained model from `bert.bin`

In [7]:
# edward_dir ='/content/drive/My Drive/CornellUniversity/CDS/ProjectX/Subprojects/bert.bin'
# melinda_dir = 'drive/MyDrive/ProjectX/Subprojects/bert.bin'
# Path to the saved weights on the mounted Drive. Named MODEL_PATH rather than
# `dir` so the builtin dir() stays usable in later cells.
MODEL_PATH = '/content/drive/My Drive/CDS/ProjectX/Subprojects/bert.bin'
model = load_model(MODEL_PATH)

100%|██████████| 407873900/407873900 [00:11<00:00, 34133495.99B/s]


**Sample tweets that were not in the training, validation, or testing data sets**

In [8]:
# Classify a sample tweet with the loaded model and print the predicted label.
tweet_example1 = "First stats for omicron in Israel: protection for vaccinated similar to Delta, twice as dangerous for unvaccinated"
print(inference(tweet_example1, model))

100%|██████████| 231508/231508 [00:00<00:00, 2097129.35B/s]


misinformation


In [9]:
# A tweet unrelated to the pandemic; print the label the model assigns to it.
tweet_example2 = "DP Dough is the best restaurant in collegetown"
print(inference(tweet_example2, model))

irrelevant


In [10]:
# A news-headline style tweet; print the label the model assigns to it.
tweet_example3 = "Omicron Unlikely to Cause Severe Illness in Vaccinated People, BioNTech Founder Says"
print(inference(tweet_example3, model))

legitimate information


In [11]:
# A short claim-style tweet; print the label the model assigns to it.
tweet_example4 = "Vaccines cause autism"
print(inference(tweet_example4, model))

misinformation


**Try your own input!**

In [13]:
# Colab form field (#@param): edit the text and re-run the cell to classify
# your own input with the loaded model.
tweet_example = "Vaccines are the best way to end a pandemic" #@param {type:"string"}
print(inference(tweet_example, model))

misinformation
