In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

gpt_model = GPT2LMHeadModel.from_pretrained('../models/gpt2_output').to('cuda')
gpt_tokenizer = GPT2Tokenizer.from_pretrained('../models/gpt2_output')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

t5_model = T5ForConditionalGeneration.from_pretrained("../models/t5_output").to('cuda')
t5_tokenizer = T5Tokenizer.from_pretrained("../models/t5_output")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
import pandas as pd

toxic_comments = pd.read_csv('../data/external/Toxic_Comment_Classification_Challenge_test.csv')

toxic_comments = toxic_comments[toxic_comments['comment_text'].str.len() <= 64][1000:2000]
# toxic_comments.drop(toxic_comments.loc[len(toxic_comments['comment_text']) <64].index, inplace=True)

# toxic_comments = toxic_comments[toxic_comments["comment_text"] < 64]
toxic_comments

Unnamed: 0,id,comment_text
4930,085453a6846c51f7,I HATE YOUR STOOPID WHINEY CRAPPY APLLE CRUMPL...
4932,085555ace59a551f,chris is a good a ctor on zoey
4941,0857bdcf937d8922,the earth is clearly flat and not round
4942,08593a0a9f81c618,ruman is a mad cunt
4945,085a1dad57000a05,"]] 23:16, 12 November 2009"
...,...,...
9872,107cd416ead2d8de,"OK, I'll get rid of it then."
9874,107d921647a7f744,"hehehehe, ganun ba?! wahaha!"
9877,107e970822ab3e37,"Why do you say that, he still sounded damn goo..."
9878,107ea0dc7f1cc3fe,he is a ffuckin asshole who fucked me in the ass


In [4]:
toxic_comments['for_gpt'] = "User:" + toxic_comments['comment_text'].astype(str) + "\nAssistant:"
toxic_comments['for_gpt']

4930    User:I HATE YOUR STOOPID WHINEY CRAPPY APLLE C...
4932      User:chris is a good a ctor on zoey\nAssistant:
4941    User:the earth is clearly flat and not round\n...
4942                 User:ruman is a mad cunt\nAssistant:
4945          User:]] 23:16, 12 November 2009\nAssistant:
                              ...                        
9872        User:OK, I'll get rid of it then.\nAssistant:
9874        User:hehehehe, ganun ba?! wahaha!\nAssistant:
9877    User:Why do you say that, he still sounded dam...
9878    User:he is a ffuckin asshole who fucked me in ...
9884    User:hello  \n\n hey jesusfreek2 im jesusfreek...
Name: for_gpt, Length: 1000, dtype: object

In [5]:
from tqdm import tqdm


def evaluate_gpt(s: str, model, tokenizer, bar = None) -> str:
    if bar is not None:
        bar.update(1)
    if len(s) > 64:
        return None

    # Convert string to array of embedding indices
    encoded_input = tokenizer(s, return_tensors='pt').to('cuda')
    # Pass embedding indices to the model
    output = model.generate(encoded_input.input_ids, pad_token_id=tokenizer.pad_token_id, max_length=142)
    # Decode resulting embedding indices back to string
    output_str = tokenizer.batch_decode(output, skip_special_tokens=True)
    
    output_str = output_str[0]

    # Truncate string to the next "User:" occurrence, if any.
    first_idx: int = output_str.find('User:')
    idx = output_str.find('User', len('User'))
    if idx != -1 and idx != first_idx:
        output_str = output_str[len(s):idx]
    return output_str

In [6]:
pbar = tqdm(total=len(toxic_comments['for_gpt']))
if gpt_tokenizer.pad_token is None:
    gpt_tokenizer.pad_token = gpt_tokenizer.eos_token
toxic_comments['gpt'] = toxic_comments.apply(lambda x: evaluate_gpt(x['for_gpt'], gpt_model, gpt_tokenizer, pbar), axis=1)
pbar.close()

100%|██████████| 1000/1000 [21:00<00:00,  1.26s/it]


In [7]:
challenge_res = pd.read_csv('../data/interim/toxic_comment_classific_challenge_model_result.csv')
toxic_comments.to_csv('../data/interim/toxic_comment_classific_challenge_model_result.csv')

In [8]:
toxic_comments['for_t5'] = "Detoxify: " + toxic_comments['comment_text'].astype(str)
toxic_comments['for_t5']

4930    Detoxify: I HATE YOUR STOOPID WHINEY CRAPPY AP...
4932             Detoxify: chris is a good a ctor on zoey
4941    Detoxify: the earth is clearly flat and not round
4942                        Detoxify: ruman is a mad cunt
4945                 Detoxify: ]] 23:16, 12 November 2009
                              ...                        
9872               Detoxify: OK, I'll get rid of it then.
9874               Detoxify: hehehehe, ganun ba?! wahaha!
9877    Detoxify: Why do you say that, he still sounde...
9878    Detoxify: he is a ffuckin asshole who fucked m...
9884    Detoxify: hello  \n\n hey jesusfreek2 im jesus...
Name: for_t5, Length: 1000, dtype: object

In [9]:

def evaluate_T5(s: str, model, tokenizer, bar = None) -> str:
    if bar is not None:
        bar.update(1)
    if len(s) > 64:
        return None
    encoded_input = tokenizer(s, padding="max_length", truncation=True, return_tensors='pt').to('cuda')
    output = model.generate(encoded_input.input_ids)
    output_str = tokenizer.batch_decode(output, skip_special_tokens=True)
    # print(output_str)
    return output_str

In [10]:
pbar = tqdm(total=len(toxic_comments['for_t5']))
# if gpt_tokenizer.pad_token is None:
#             gpt_tokenizer.pad_token = gpt_tokenizer.eos_token
toxic_comments['t5'] = toxic_comments.apply(lambda x: evaluate_T5(x['for_t5'], t5_model, t5_tokenizer, pbar), axis=1)
pbar.close()

100%|██████████| 1000/1000 [02:52<00:00,  5.79it/s]


In [11]:
toxic_comments.dropna(inplace=True)
toxic_comments

Unnamed: 0,id,comment_text,for_gpt,gpt,for_t5,t5
4932,085555ace59a551f,chris is a good a ctor on zoey,User:chris is a good a ctor on zoey\nAssistant:,Kris is a good ctor on Zoey,Detoxify: chris is a good a ctor on zoey,[Chris is a good sex doc on zoey.]
4941,0857bdcf937d8922,the earth is clearly flat and not round,User:the earth is clearly flat and not round\n...,the earth is flat and not round.,Detoxify: the earth is clearly flat and not round,[the earth is clearly flat and not round.]
4942,08593a0a9f81c618,ruman is a mad cunt,User:ruman is a mad cunt\nAssistant:,Ruman is crazy,Detoxify: ruman is a mad cunt,[Ruman is a crazy cunt.]
4945,085a1dad57000a05,"]] 23:16, 12 November 2009","User:]] 23:16, 12 November 2009\nAssistant:","23:16, 12 November 2009","Detoxify: ]] 23:16, 12 November 2009","[23:16, 12 November 2009]"
4946,085a2f633bdb1432,== WOT the fuck == \n\n Check this out please ),User:== WOT the fuck == \n\n Check this out pl...,"check this out, please!",Detoxify: == WOT the fuck == \n\n Check this o...,[check this out please )]
...,...,...,...,...,...,...
9871,107c3ccb2abeac27,"aside from the de-wp one,","User:aside from the de-wp one,\nAssistant:","besides the de-pupil,","Detoxify: aside from the de-wp one,","[aside from the de-wp,]"
9872,107cd416ead2d8de,"OK, I'll get rid of it then.","User:OK, I'll get rid of it then.\nAssistant:","OK, I'll get rid of him.","Detoxify: OK, I'll get rid of it then.","[OK, I'll get rid of it.]"
9874,107d921647a7f744,"hehehehe, ganun ba?! wahaha!","User:hehehehe, ganun ba?! wahaha!\nAssistant:","oh, shit, Ganun Ba?","Detoxify: hehehehe, ganun ba?! wahaha!","[hehehe, ganun ba?]"
9878,107ea0dc7f1cc3fe,he is a ffuckin asshole who fucked me in the ass,User:he is a ffuckin asshole who fucked me in ...,he's a guy who's got me in the bag.,Detoxify: he is a ffuckin asshole who fucked m...,[he's a fucking asshole who fucked me in]


In [12]:
toxic_comments.to_csv('../data/interim/toxic_comment_classific_challenge_model_result.csv')
# toxic_comments = pd.read_csv('../data/interim/toxic_comment_classific_challenge_model_result.csv')

In [13]:
challenge_res.to_csv("old_res.csv")
challenge_res

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,comment_text,for_gpt,gpt,for_t5,t5
0,4920,,,,08509fe52c645782,Congrats. Now don't tear the place to pieces!!,User:Congrats. Now don't tear the place to pie...,"congratulations, now don't tear this place ap...",Detoxify: Congrats. Now don't tear the place t...,"[""congratulations, don't tear the place to pie..."
1,4919,,,,084fc76456f0cd29,REDIRECT Talk:The English High School,User:REDIRECT Talk:The English High School\nAs...,English High School,Detoxify: REDIRECT Talk:The English High School,['a direct talk about English High School']
2,4899,,,,0845333dfa416d04,:LOL. Give me a fucking break.,User::LOL. Give me a fucking break.\nAssistant:,give me a break.,Detoxify: :LOL. Give me a fucking break.,['give me a break.']
3,4897,,,,084520c88014b552,killed by Israel and,User:killed by Israel and\nAssistant:,he killed Israel.,Detoxify: killed by Israel and,['the death of Israel and the death of the Jew...
4,4894,,,,0844465cff6d83f9,I... err... what?,User:I... err... what?\nAssistant:,what the hell...?,Detoxify: I... err... what?,['I... what?']
...,...,...,...,...,...,...,...,...,...,...
510,368,4.0,6.0,31.0,000c9b92318552d1,Professors to the Manhatten Project.,User:Professors to the Manhatten Project.\nAss...,professor of the Manhatten Project.,Detoxify: Professors to the Manhatten Project.,['professors to the Manhatten Project.']
511,369,3.0,5.0,19.0,000844b52dee5f3f,|blocked]] from editing Wikipedia. |,User:|blocked]] from editing Wikipedia. |\nA...,he blocked the editors from editing Wikipedia.,Detoxify: |blocked]] from editing Wikipedia. |,['a blocked from editing Wikipedia.']
512,370,2.0,4.0,16.0,000663aff0fffc80,this other one from 1897,User:this other one from 1897\nAssistant:,this is the other one from 1897,Detoxify: this other one from 1897,['the other one from 1897.']
513,371,1.0,3.0,7.0,000247e83dcc1211,:Dear god this site is horrible.,User::Dear god this site is horrible.\nAssistant:,"God, this is terrible.",Detoxify: :Dear god this site is horrible.,[':Dear God this site is terrible.']


In [14]:
result = pd.concat([challenge_res, toxic_comments]).sort_values(by=['id'], ascending=False)
result.to_csv('../data/interim/toxic_comment_classific_challenge_model_result.csv')

In [15]:
result

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,comment_text,for_gpt,gpt,for_t5,t5
9884,,,,,1081e7bb8feb8512,hello \n\n hey jesusfreek2 im jesusfreek2 2ÐÐ...,User:hello \n\n hey jesusfreek2 im jesusfreek...,User:hello \n\n hey jesusfreek2 im jesusfreek...,Detoxify: hello \n\n hey jesusfreek2 im jesus...,"[hello, Jesus Freek2 im Jesus Freek2 - 2]"
9878,,,,,107ea0dc7f1cc3fe,he is a ffuckin asshole who fucked me in the ass,User:he is a ffuckin asshole who fucked me in ...,he's a guy who's got me in the bag.,Detoxify: he is a ffuckin asshole who fucked m...,[he's a fucking asshole who fucked me in]
9874,,,,,107d921647a7f744,"hehehehe, ganun ba?! wahaha!","User:hehehehe, ganun ba?! wahaha!\nAssistant:","oh, shit, Ganun Ba?","Detoxify: hehehehe, ganun ba?! wahaha!","[hehehe, ganun ba?]"
9872,,,,,107cd416ead2d8de,"OK, I'll get rid of it then.","User:OK, I'll get rid of it then.\nAssistant:","OK, I'll get rid of him.","Detoxify: OK, I'll get rid of it then.","[OK, I'll get rid of it.]"
9871,,,,,107c3ccb2abeac27,"aside from the de-wp one,","User:aside from the de-wp one,\nAssistant:","besides the de-pupil,","Detoxify: aside from the de-wp one,","[aside from the de-wp,]"
...,...,...,...,...,...,...,...,...,...,...
510,368.0,4.0,6.0,31.0,000c9b92318552d1,Professors to the Manhatten Project.,User:Professors to the Manhatten Project.\nAss...,professor of the Manhatten Project.,Detoxify: Professors to the Manhatten Project.,['professors to the Manhatten Project.']
511,369.0,3.0,5.0,19.0,000844b52dee5f3f,|blocked]] from editing Wikipedia. |,User:|blocked]] from editing Wikipedia. |\nA...,he blocked the editors from editing Wikipedia.,Detoxify: |blocked]] from editing Wikipedia. |,['a blocked from editing Wikipedia.']
512,370.0,2.0,4.0,16.0,000663aff0fffc80,this other one from 1897,User:this other one from 1897\nAssistant:,this is the other one from 1897,Detoxify: this other one from 1897,['the other one from 1897.']
513,371.0,1.0,3.0,7.0,000247e83dcc1211,:Dear god this site is horrible.,User::Dear god this site is horrible.\nAssistant:,"God, this is terrible.",Detoxify: :Dear god this site is horrible.,[':Dear God this site is terrible.']
