In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from DataHandler.data import  *
from DataHandler.mapping import  *
from ModelCode.model import *
from apiconfig import project_name,api_token
import neptune.new as neptune
import GPUtil
import argparse
import json
import random
import os
import time
import torch


from ModelCode.model import *

from lime.lime_text import LimeTextExplainer



In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertForTokenClassification, BertForSequenceClassification,BertPreTrainedModel, BertModel
from transformers import AutoTokenizer, BertTokenizer, RobertaTokenizer
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel,RobertaModel




In [None]:
# Notebook-wide run configuration (dataset, checkpoint names, optimisation
# hyper-parameters, paths and device).
params = dict(
    dataset='toxic_unintended',
    model='cardiffnlp/twitter-roberta-base-hate',
    features='tfidf',
    cache_path='../../Saved_models/',
    model_path='cardiffnlp/twitter-roberta-base-hate',
    train_batch_size=16,
    val_batch_size=32,
    max_length=256,
    learning_rate=5e-5,  # learning rate 2e-5 for bert, 0.001 for gru
    weight_decay=1e-5,
    epsilon=1e-8,
    epochs=3,
    dropout=0.2,
    random_seed=2021,
    device='cuda',
    save_path='Saved_Models/',
    logging='local',
)

In [1]:
class modelPred_lime():
    """Sentence-in, score-out wrapper around a pretrained transformer model.

    The constructor loads the model and tokenizer selected by the
    ``'model_path'`` entry of the configuration and keeps them on the
    instance; :meth:`prediction` turns a list of sentences into a 1-D NumPy
    array holding one model output per sentence.
    """

    def __init__(self, config=None):
        """Load the model and tokenizer named by the configuration.

        Args:
            config: Dict providing ``'model_path'`` and ``'cache_path'``;
                ``'device'`` (default ``'cuda'``) and ``'max_length'``
                (default 256) are read when present. When omitted, the
                notebook-level ``params`` dict is used, so both
                ``modelPred_lime()`` and ``modelPred_lime(params)`` work.
        """
        cfg = params if config is None else config
        self.params = cfg

        # Honour the configured device, but fall back to the CPU when CUDA is
        # requested on a machine that has none instead of failing on `.to()`.
        requested = str(cfg.get('device', 'cuda'))
        if requested.startswith('cuda') and not torch.cuda.is_available():
            requested = 'cpu'
        self.device = torch.device(requested)

        model_path = cfg['model_path']

        if 'roberta' in model_path:
            model_cls = RobertaForRegression
        elif 'Hate-speech-CNERG/dehatebert-mono-english' in model_path:
            model_cls = HateAlert
        else:
            model_cls = BertForRegression

        # Keep the model on the instance: predict()/prediction() read it back.
        self.model = model_cls.from_pretrained(
            model_path,
            cache_dir=cfg['cache_path'],
            params=cfg).to(self.device)
        self.model.eval()

        # The tokenizer test is deliberately looser than the model test above
        # ('dehatebert' vs. the full hub id), mirroring the original checks.
        if 'roberta' in model_path:
            tokenizer_cls = RobertaTokenizer
        elif 'dehatebert' in model_path:
            tokenizer_cls = AutoTokenizer
        else:
            tokenizer_cls = BertTokenizer
        self.tokenizer = tokenizer_cls.from_pretrained(model_path)

    # Disabled preprocessing hook, kept for reference.
    # def preprocess_func(self, text):
    #     remove_words=['<allcaps>','</allcaps>','<hashtag>','</hashtag>','<elongated>','<emphasis>','<repeated>','\'','s']
    #     word_list=text_processor.pre_process_doc(text)
    #     word_list=list(filter(lambda a: a not in remove_words, word_list)) 
    #     sent=" ".join(word_list)
    #     sent = re.sub(r"[<\*>]", " ",sent)
    #     return sent

    def predict(self, model, dataloader, device):
        """Run ``model`` over every batch of ``dataloader`` without gradients.

        Args:
            model: Callable module invoked as ``model(input_ids, attention_mask)``;
                it is switched to eval mode before the loop.
            dataloader: Sized iterable of dicts with ``'ids'`` and ``'mask'``
                tensors. A ``'targets'`` entry, if present, is ignored because
                only predictions are returned.
            device: Device the batch tensors are moved to.

        Returns:
            list: The elements obtained by iterating ``output[0]`` of each
            batch, concatenated in batch order.
        """
        model.eval()
        predicted_label = []
        with torch.no_grad():
            for data in tqdm(dataloader, total=len(dataloader)):
                input_ids = data['ids'].to(device, dtype=torch.long)
                attention_mask = data['mask'].to(device, dtype=torch.long)

                output = model(input_ids, attention_mask)

                # Extending with a tensor appends one entry per index of its
                # first dimension.
                # NOTE(review): assumes output[0] holds the per-sample
                # predictions for the batch — confirm against the model's
                # forward().
                predicted_label.extend(output[0])

        return predicted_label

    def prediction(self, sentences):
        """Score raw sentences with the loaded model.

        Args:
            sentences: Sequence of strings to score.

        Returns:
            numpy.ndarray: 1-D array with one value per element returned by
            :meth:`predict` (one per sentence at batch size 1).
        """
        frame = pd.DataFrame({'text': list(sentences)})
        frame['labels'] = 0  # placeholder label column; the value is never used here

        testing_set = Triage(frame, self.tokenizer,
                             MAX_LEN=self.params.get('max_length', 256))

        # Batch size stays at 1 as in the original code.
        # NOTE(review): larger batches are only safe if output[0] in predict()
        # is the full batch of predictions — confirm before raising it.
        test_loader = DataLoader(testing_set, batch_size=1, shuffle=False,
                                 num_workers=0)

        output = self.predict(self.model, test_loader, self.device)

        scores = [item.detach().cpu().numpy() for item in output]
        return np.array(scores).reshape(len(scores))


    # def tokenize(self, sentences, padding = True, max_len = 128):
    #     input_ids, attention_masks, token_type_ids, rationales = [], [], [], []
    #     # self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, use_fast = False)
    #     self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    #     for sent in sentences:
    #         encoded_dict = self.tokenizer.encode_plus(sent,
    #                                                 add_special_tokens=True,
    #                                                 max_length=max_len, 
    #                                                 padding='max_length', 
    #                                                 return_attention_mask = True,
    #                                                 return_tensors = 'pt', 
    #                                                 truncation = True)
    #         input_ids.append(encoded_dict['input_ids'])
    #         attention_masks.append(encoded_dict['attention_mask'])
        
    #     input_ids = torch.cat(input_ids, dim=0)
    #     attention_masks = torch.cat(attention_masks, dim=0)
    #     return {'input_ids': input_ids, 'attention_masks': attention_masks}
    

In [None]:
# Build the prediction wrapper from the notebook-level configuration.
modelClass = modelPred_lime(params)




In [None]:
# Smoke-test the wrapper on one sentence; the cell displays the returned array.
modelClass.prediction(sentences=['I am good'])


In [None]:
# explainer = LimeTextExplainer(class_names=['normal','fearspeech','hatespeech'],split_expression='\s+',random_state=333,bow=False)
