In [None]:
# Install Hugging Face `transformers`, which provides the RobertaModel and
# RobertaTokenizer classes imported in the next cell.
# NOTE(review): the version is not pinned, so a fresh run may pull a different
# release than the one this notebook was written against — consider pinning.
!pip install transformers



In [None]:
import re
import nltk
import argparse
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pickle

# Fetch the NLTK corpora this notebook relies on into NLTK's default download
# location: the English stop-word list and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')

nltk.download('wordnet')
import torch
from transformers import RobertaModel, RobertaTokenizer


class Model:
    """Bare-bones interface for a text classifier.

    The instance only stores the text it was given; the three hook methods
    are placeholders that do nothing and return ``None``. Subclasses are
    expected to override them.
    """

    def __init__(self, text):
        """Remember the text this model will operate on."""
        self.text = text

    def download_dependencies(self):
        """Placeholder hook for fetching external resources; does nothing."""

    def process_text(self):
        """Placeholder hook for text preprocessing; does nothing."""

    def predict(self):
        """Placeholder hook for inference; does nothing and returns ``None``."""

class ROBERTA(torch.nn.Module, Model):
    """``roberta-base`` encoder with a small 4-way classification head.

    The head applied to the encoder's pooled output is
    Dropout -> Linear(768, 64) -> LayerNorm(64) -> Tanh -> Dropout -> Linear(64, 4).

    Args:
        text: The text to classify: either a single string or a list/tuple
            of strings (one entry per document).
        dropout_rate: Dropout probability used by both dropout layers.
    """

    # Class names, indexed by the argmax over the 4 output logits.
    LABELS = ['false', 'true', 'partially false', 'other']
    # Every input is padded / truncated to this many tokens.
    MAX_LENGTH = 128
    # Directory in which download_dependencies looks for / stores NLTK corpora.
    NLTK_DATA_DIR = './nltk_data'

    def __init__(self, text, dropout_rate=0.4):
        super(ROBERTA, self).__init__()
        self.text = text
        self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        # return_dict=False makes the encoder return a plain tuple, which
        # forward() unpacks.
        # NOTE(review): num_labels presumably only ends up on the config here;
        # the actual 4-way projection is self.l2 — confirm it is not needed.
        self.roberta = RobertaModel.from_pretrained('roberta-base', return_dict=False, num_labels=4)
        self.d1 = torch.nn.Dropout(dropout_rate)
        self.l1 = torch.nn.Linear(768, 64)
        self.bn1 = torch.nn.LayerNorm(64)
        self.d2 = torch.nn.Dropout(dropout_rate)
        self.l2 = torch.nn.Linear(64, 4)

    def _device(self):
        """Return the device that currently holds this module's parameters."""
        return next(self.parameters()).device

    def download_dependencies(self):
        """Make sure the NLTK ``stopwords`` and ``wordnet`` corpora are available.

        Each corpus is looked up in ``NLTK_DATA_DIR`` and downloaded there only
        when missing. The directory is also registered on ``nltk.data.path`` so
        that corpora stored in it can actually be found by NLTK's loaders.
        """
        data_dir = self.NLTK_DATA_DIR
        if data_dir not in nltk.data.path:
            nltk.data.path.append(data_dir)

        for package in ('stopwords', 'wordnet'):
            try:
                # Resources are addressed as '<category>/<name>', and `paths`
                # must be a list of directories (a bare string would be
                # iterated character by character).
                nltk.data.find('corpora/' + package, paths=[data_dir])
            except LookupError:
                nltk.download(package, download_dir=data_dir)

    def process_text(self):
        """Normalise ``self.text`` in place.

        Every document has non-letters replaced by spaces, is lower-cased,
        split on whitespace, stripped of English stop words and lemmatized,
        then re-joined with single spaces.

        A single string is treated as ONE document and stays a string; a
        list/tuple of strings is processed element-wise and becomes a list.
        """
        lemmatizer = WordNetLemmatizer()
        # Load the stop-word list once; a set gives O(1) membership tests.
        stop_words = set(stopwords.words('english'))

        def clean(document):
            letters_only = re.sub('[^a-zA-Z]', ' ', document)
            words = letters_only.lower().split()
            return ' '.join(
                lemmatizer.lemmatize(word) for word in words if word not in stop_words
            )

        if isinstance(self.text, str):
            # Indexing a str yields characters, so it must not go through
            # the per-document loop below.
            self.text = clean(self.text)
        else:
            self.text = [clean(document) for document in self.text]

    def load_checkpoint(self, path, model=None):
        """Load weights from a checkpoint file into ``model``.

        Args:
            path: Location of a file readable by ``torch.load`` that holds a
                mapping with the keys ``'model_state_dict'`` and ``'valid_loss'``.
            model: Module to receive the weights; defaults to ``self``.

        Returns:
            The ``'valid_loss'`` value stored in the checkpoint.
        """
        if model is None:
            model = self
        # Map tensors onto the device the target module already lives on
        # instead of relying on a notebook-level `device` global.
        target_device = next(model.parameters()).device
        # SECURITY: torch.load unpickles the file; only load trusted checkpoints.
        state_dict = torch.load(path, map_location=target_device)
        model.load_state_dict(state_dict['model_state_dict'])

        return state_dict['valid_loss']

    def predict(self):
        """Classify ``self.text`` and return the predicted label(s).

        Returns:
            A list with one label string (see ``LABELS``) per input document;
            a single-string ``self.text`` therefore yields a one-element list.
        """
        texts = [self.text] if isinstance(self.text, str) else list(self.text)
        if not texts:
            return []

        encoded = self.tokenizer(
            texts,
            add_special_tokens=True,        # Add the '<s>' / '</s>' markers.
            max_length=self.MAX_LENGTH,
            padding='max_length',           # Pad every sequence to max_length ...
            truncation=True,                # ... and cut longer ones down to it.
            return_attention_mask=True,     # Construct attention masks.
            return_tensors='pt',            # Return PyTorch tensors.
        )
        encoded = encoded.to(self._device())

        # Dropout must be disabled for inference, otherwise predictions are
        # random; the previous train/eval mode is restored afterwards.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                logits = self(
                    input_ids=encoded['input_ids'],
                    attention_mask=encoded['attention_mask'],
                )
        finally:
            self.train(was_training)

        return [self.LABELS[index] for index in logits.argmax(dim=1).tolist()]

    def forward(self, input_ids, attention_mask):
        """Run the encoder and the classification head.

        Args:
            input_ids: Token-id tensor produced by the tokenizer.
            attention_mask: Matching attention-mask tensor.

        Returns:
            The output of the final ``Linear(64, 4)`` layer (unnormalised scores).
        """
        # With return_dict=False the encoder returns a tuple; only the second
        # element (the pooled output) feeds the head.
        _, x = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = self.d1(x)
        x = self.l1(x)
        x = self.bn1(x)
        x = torch.tanh(x)
        x = self.d2(x)
        x = self.l2(x)

        return x
    

def parse_args(argv=None):
    """Parse the text to classify from command-line style arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, as before. Passing a list makes
            the function usable from a notebook, where ``sys.argv`` holds the
            kernel's own arguments rather than the user's text.

    Returns:
        All positional ``text`` arguments joined with single spaces.

    Raises:
        SystemExit: Raised by argparse when no text argument is supplied.
    """
    parser = argparse.ArgumentParser(description='Fake News Classification')
    parser.add_argument('text', metavar='text', type=str, nargs='+', help='Text to be classified')
    args = parser.parse_args(argv)
    return ' '.join(args.text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there
# (the model checkpoint loaded further down) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Fix PyTorch's RNG seed so repeated runs draw the same random numbers.
torch.manual_seed(17)

# Use the first CUDA GPU when one is usable, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
    # Request deterministic cuDNN kernels and switch off its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

print(device)

cuda:0


In [None]:
# Build the classifier around one example sentence and move it to `device`
# (nn.Module.to returns the module itself, so chaining is equivalent).
model = ROBERTA("Mark Zuckerberg is the owner of Facebook").to(device)

# Load the saved weights from the checkpoint on the mounted Drive.
model.load_checkpoint(path="/content/drive/MyDrive/model.pkl", model=model)

# Make sure the NLTK corpora are present, then run the text normalisation step.
model.download_dependencies()
model.process_text()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ./nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# predict() returns a list of label strings; display the first (only) one.
model.predict()[0]



'false'