In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from DataHandler.data import  *
from DataHandler.mapping import  *
from ModelCode.model import *
from apiconfig import project_name,api_token
import neptune.new as neptune
import GPUtil
import argparse
import json
import random
import os
import time
import torch

from ModelCode.model import *

from torch.utils.data import Dataset, DataLoader
from transformers import BertForTokenClassification, BertForSequenceClassification,BertPreTrainedModel, BertModel
from transformers import AutoTokenizer, BertTokenizer, RobertaTokenizer
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel,RobertaModel


In [7]:
params={
  'dataset':'Dataset/jigsaw-toxic-severity-rating/validation_data.csv',
  'model':'twitter-roberta-base-hate_reddit',
  'cache_path':'Saved_Models/twitter-roberta-base-hate_reddit',
#   'model_path':'bert-base-uncased_toxic_comment',
  'max_length':256,
  'dropout':0.2,
  'random_seed':2021,
  'device':'cuda',
  'save_path':'Saved_Models/',
  'logging':'local'
}

In [8]:
model_memory=1
total_memory=16
def get_gpu(gpu_id):
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    while(1):
        tempID = [] 
        tempID = GPUtil.getAvailable(order = 'memory', limit = 2, maxLoad = 1.0, maxMemory = (1-(model_memory/total_memory)), includeNan=False, excludeID=[], excludeUUID=[])
        for i in range(len(tempID)):
            if len(tempID) > 0 and (tempID[i]==gpu_id):
                print("Found a gpu")
                print('We will use the GPU:',tempID[i],torch.cuda.get_device_name(tempID[i]))
                deviceID=[tempID[i]]
                return deviceID
            else:
                time.sleep(5)
                
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

if torch.cuda.is_available() and params['device']=='cuda':    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    ##### You can set the device manually if you have only one gpu
    ##### comment this line if you don't want to manually set the gpu
    #deviceID = get_gpu(args.gpuid)
    deviceID = get_gpu(1)
    torch.cuda.set_device(deviceID[0])
    #### comment this line if you want to manually set the gpu
    #### required parameter is the gpu id
    #torch.cuda.set_device(args.gpuid)

else:
    print('Since you dont want to use GPU, using the CPU instead.')
    device = torch.device("cpu")


There are 2 GPU(s) available.
Found a gpu
We will use the GPU: 1 Tesla P100-PCIE-16GB


In [9]:
class modelPred_lime():
    def __init__(self):
#         self.device = torch.device("cuda")
       
        if('roberta' in params['cache_path']):
            self.model = RobertaForRegression.from_pretrained(
                    params['cache_path'],
                    params={'dropout':0.2},local_files_only=True).to(device)
            
        elif('Hate-speech-CNERG/dehatebert-mono-english' in params['cache_path']):
            self.model = HateAlert.from_pretrained(
                    params['cache_path'],
                    params={'dropout':0.2}).to(device)

        else:
            self.model = BertForRegression.from_pretrained(
                    params['cache_path'],
                    params={'dropout':0.2},  local_files_only=True).to(device)



        if('roberta' in params['cache_path']):
            self.tokenizer = RobertaTokenizer.from_pretrained(params['cache_path'])
        
        elif('dehatebert' in params['cache_path']):
            self.tokenizer = AutoTokenizer.from_pretrained(params['cache_path'])
        
        else:
            self.tokenizer = BertTokenizer.from_pretrained(params['cache_path'])

        # print("Model Loaded!")
        # self.model.cuda()  
        # self.model.eval()

    # def preprocess_func(self, text):
    #     remove_words=['<allcaps>','</allcaps>','<hashtag>','</hashtag>','<elongated>','<emphasis>','<repeated>','\'','s']
    #     word_list=text_processor.pre_process_doc(text)
    #     word_list=list(filter(lambda a: a not in remove_words, word_list)) 
    #     sent=" ".join(word_list)
    #     sent = re.sub(r"[<\*>]", " ",sent)
    #     return sent

    def predict(self, model, dataloader):
        predicted_label = []
        actual_label = []
        self.model.eval()
        with torch.no_grad():
            for step,data in tqdm(enumerate(dataloader, 0), total=len(dataloader)):
                input_ids = data['ids'].to(device, dtype = torch.long)
                attention_mask = data['mask'].to(device, dtype = torch.long)
                targets = data['targets'].to(device, dtype = torch.float32)
                targets = targets.unsqueeze(1)

    #             input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
                output = model(input_ids, attention_mask)
                            
                predicted_label += output[0]
                actual_label += targets
                
        return predicted_label

    def prediction(self, test_df):


        test = pd.DataFrame()
        test['text'] = test_df.copy()
        test['label'] = 0
#         print(test)

        testing_set = Triage(test, self.tokenizer, params)

        test_params = {'batch_size': 128,
                       'shuffle': False,
                        'num_workers': 0
                        }

        test_loader = DataLoader(testing_set, **test_params)

        output = self.predict(self.model, test_loader)

        out2 =[]
        for out in output:
            out2.append(out.cpu().detach().numpy())

        out = np.array(out2).reshape((len(out2),-1))
#         out = np.hstack((out, 1-out))
#         out = out.tolist()
        
        
        return out 

In [10]:
modelClass=modelPred_lime()

In [11]:
test_df = pd.read_csv(params['dataset'])

less = test_df['less_toxic'].copy()
more = test_df['more_toxic'].copy()

In [12]:
test_df['less_score'] = modelClass.prediction(less)
test_df['more_score'] = modelClass.prediction(more)
test_df['diff'] = test_df['less_score'] - test_df['more_score']

100%|██████████| 236/236 [05:36<00:00,  1.43s/it]
100%|██████████| 236/236 [05:46<00:00,  1.47s/it]


In [13]:
# test_df.drop(columns = ['worker'], inplace=True)

In [14]:
test_df.to_csv('Predicitions/' + params['model'] + '.csv', index=False)

In [15]:
misclass = test_df[test_df['diff'] >= 0]

In [16]:
misclass.sort_values('diff', ascending=False, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  misclass.sort_values('diff', ascending=False, inplace=True)


In [17]:
misclass.drop_duplicates(subset=['less_toxic', 'more_toxic'], keep='first', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  misclass.drop_duplicates(subset=['less_toxic', 'more_toxic'], keep='first', inplace=True)


In [18]:
misclass = misclass.head(200)

In [19]:
misclass.to_csv('Top200Misclassified/' + params['model'] + '.csv', index=False)