In [3]:
# Mount Google Drive at /content/drive; later cells read the Kaggle API token
# from it and change into a project directory stored on it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%%time
! pip install --upgrade --force-reinstall --no-deps  kaggle > /dev/null
! mkdir ~/.kaggle

! cp "/content/drive/My Drive/Kaggle/kaggle.json" ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

import os

if not os.path.exists("/content/input/"):
    !mkdir input
    !kaggle datasets download -d abhishek/roberta-base
    !unzip roberta-base.zip -d input/roberta-base

    !kaggle competitions download -c jigsaw-toxic-severity-rating
    !unzip jigsaw-toxic-severity-rating.zip -d input/jigsaw-toxic-severity-rating

    !kaggle datasets download -d ishandutta/jigsaw-folds
    !unzip jigsaw-folds.zip -d input/jigsaw-folds

Downloading roberta-base.zip to /content
100% 1.31G/1.31G [00:10<00:00, 144MB/s]
100% 1.31G/1.31G [00:10<00:00, 135MB/s]
Archive:  roberta-base.zip
  inflating: input/roberta-base/README.md  
  inflating: input/roberta-base/config.json  
  inflating: input/roberta-base/dict.txt  
  inflating: input/roberta-base/flax_model.msgpack  
  inflating: input/roberta-base/merges.txt  
  inflating: input/roberta-base/pytorch_model.bin  
  inflating: input/roberta-base/rust_model.ot  
  inflating: input/roberta-base/tf_model.h5  
  inflating: input/roberta-base/tokenizer.json  
  inflating: input/roberta-base/vocab.json  
Downloading jigsaw-toxic-severity-rating.zip to /content
 74% 5.00M/6.72M [00:00<00:00, 38.3MB/s]
100% 6.72M/6.72M [00:00<00:00, 49.5MB/s]
Archive:  jigsaw-toxic-severity-rating.zip
  inflating: input/jigsaw-toxic-severity-rating/comments_to_score.csv  
  inflating: input/jigsaw-toxic-severity-rating/sample_submission.csv  
  inflating: input/jigsaw-toxic-severity-rating/validat

In [3]:
!pip install wandb
!pip install transformers
!pip install pytorch_lightning
!pip install colorama

Collecting wandb
  Downloading wandb-0.12.9-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 7.5 MB/s 
[?25hCollecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.1-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 53.4 MB/s 
[?25hCollecting yaspin>=1.0.0
  Downloading yaspin-2.1.0-py3-none-any.whl (18 kB)
Collecting subprocess32>=3.5.3
  Downloading subprocess32-3.5.4.tar.gz (97 kB)
[K     |████████████████████████████████| 97 kB 6.5 MB/s 
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting configparser>=3.8.1
  Downloading configparser-5.2.0-py3-none-any.whl (19 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.24-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 57.6 MB/s 
Colle

---

---

In [4]:
cd /content/drive/MyDrive/python/kaggle/jigsaw_study_lightning/code/

/content/drive/MyDrive/python/kaggle/jigsaw_study_lightning/code


In [5]:
# Necessities
import wandb
import pandas as pd

import datetime
import os

from tqdm import tqdm
import numpy as np
import gc

# PyTorch
import torch
import torch.nn as nn
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

# Transformers
from transformers import AutoTokenizer, AutoModel, AdamW

# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

# Colored Terminal Text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

import pathlib
from glob import glob
import yaml
p_temp = pathlib.Path('.')
### The config YAML is selected here: the first *.yaml found (recursively)
### under the current directory. Candidates are sorted because glob order
### depends on the filesystem, which would make the choice irreproducible.
yaml_candidates = sorted(p_temp.glob('**/*.yaml'))
if not yaml_candidates:
    raise FileNotFoundError(f"no *.yaml config file found under {p_temp.resolve()}")
yaml_path = yaml_candidates[0]
print("config file ----> ", yaml_path)

with open(yaml_path, 'r', encoding='utf-8') as yml:
    CONFIG = yaml.safe_load(yml)
# Only the 'train_args' section is used by this notebook.
CONFIG = CONFIG['train_args']
# Replace the tokenizer name/path from the config with the loaded tokenizer object.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG["tokenizer"])

config file ---->  config/config.yaml


In [6]:
class JigsawModel(pl.LightningModule):
    """Pretrained transformer backbone followed by dropout and a linear head.

    Training and validation batches hold tokenized (more toxic, less toxic)
    comment pairs. Both sides are scored by the same network and compared
    with a margin ranking loss. Hyper-parameters are read from the
    module-level ``CONFIG`` dict.
    """

    def __init__(self, model_name):
        """Build the backbone and the scoring head.

        Args:
            model_name: Name or path handed to ``AutoModel.from_pretrained``.
        """
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        # Size the head from the backbone's config instead of assuming 768,
        # so other backbones work; 768 stays as the fallback.
        hidden_size = getattr(self.model.config, 'hidden_size', 768)
        self.fc = nn.Linear(hidden_size, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Score a batch of tokenized comments.

        Args:
            ids: Token ids, passed to the backbone as ``input_ids``.
            mask: Attention mask, passed as ``attention_mask``.

        Returns:
            Output of the linear head applied (after dropout) to element 1
            of the backbone output.
        """
        out = self.model(input_ids=ids, attention_mask=mask,
                         output_hidden_states=False)
        # NOTE(review): element 1 is presumably the pooled sentence
        # representation for BERT/RoBERTa-style backbones - confirm before
        # switching to a different architecture.
        out = self.drop(out[1])
        outputs = self.fc(out)

        return outputs

    def _pairwise_loss(self, batch):
        """Score both comments of every pair in ``batch`` and return the ranking loss."""
        more_toxic_outputs = self(batch['more_toxic_ids'], batch['more_toxic_mask'])
        less_toxic_outputs = self(batch['less_toxic_ids'], batch['less_toxic_mask'])
        return self.criterion(more_toxic_outputs, less_toxic_outputs, batch['target'])

    def training_step(self, batch, batch_idx):
        """Compute and log the pairwise loss for one training batch."""
        loss = self._pairwise_loss(batch)

        self.log("train_loss", loss, prog_bar=True, logger=True)

        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the pairwise loss for one validation batch."""
        loss = self._pairwise_loss(batch)

        self.log("val_loss", loss, prog_bar=True, logger=True)

        return {'val_loss': loss}

    def configure_optimizers(self):
        """Return the AdamW optimizer and its learning-rate scheduler."""
        optimizer = AdamW(self.parameters(), lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'])
        # NOTE(review): fetch_scheduler is not defined in the visible part of
        # this notebook; it has to be in scope before training is started.
        scheduler = fetch_scheduler(optimizer)

        return dict(
            optimizer = optimizer,
            lr_scheduler = scheduler
        )

    def criterion(self, outputs1, outputs2, targets):
        """Margin ranking loss between two sets of scores.

        Args:
            outputs1: Scores that should rank higher where the target is 1.
            outputs2: Scores to compare against, same shape as ``outputs1``.
            targets: Ranking labels (1 or -1). May have fewer trailing
                dimensions than the scores, e.g. ``(batch,)`` labels for
                ``(batch, 1)`` scores.

        Returns:
            The loss computed with margin ``CONFIG['margin']``.
        """
        # The loss expects targets shaped like the scores. A (batch,) target
        # against (batch, 1) scores would otherwise pair every label with
        # every sample through broadcasting (or be rejected outright,
        # depending on the PyTorch version).
        while targets.dim() < outputs1.dim():
            targets = targets.unsqueeze(-1)
        targets = targets.to(outputs1.dtype)
        return nn.MarginRankingLoss(margin=CONFIG['margin'])(outputs1, outputs2, targets)

## inference

In [26]:
# Competition inputs: the comments to be scored and the submission template.
test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/input/jigsaw-toxic-severity-rating/comments_to_score.csv",
        "/content/input/jigsaw-toxic-severity-rating/sample_submission.csv",
    )
)

In [27]:
class JigsawDataset(Dataset):
    """Dataset over the ``text`` column of a frame, tokenized on access.

    Args:
        df: Frame with a ``text`` column.
        tokenizer: Object exposing ``encode_plus``.
        max_length: Length every sequence is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.text = df['text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'ids', 'mask'}`` long tensors for the comment at ``index``."""
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        field_map = (('ids', 'input_ids'), ('mask', 'attention_mask'))
        return {
            out_key: torch.tensor(encoded[src_key], dtype=torch.long)
            for out_key, src_key in field_map
        }

In [28]:
# Tokenized test comments, served in file order (no shuffling) so that the
# predictions line up with the rows of the submission template.
test_dataset = JigsawDataset(
    df=test,
    tokenizer=CONFIG['tokenizer'],
    max_length=CONFIG['max_length'],
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=CONFIG['train_batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

In [7]:
@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and collect its flattened outputs.

    Args:
        model: Callable module taking ``(ids, mask)``; switched to eval mode.
        dataloader: Sized iterable of batches with ``'ids'`` and ``'mask'``
            tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        1-D NumPy array with the outputs of all batches concatenated in
        iteration order; an empty array when the dataloader yields no batch.
    """
    model.eval()

    batch_preds = []

    for data in tqdm(dataloader, total=len(dataloader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)

        outputs = model(ids, mask)
        batch_preds.append(outputs.view(-1).cpu().detach().numpy())

    gc.collect()

    # np.concatenate raises on an empty list, so handle "no batches" explicitly.
    if not batch_preds:
        return np.empty(0, dtype=np.float32)

    return np.concatenate(batch_preds)

In [42]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Checkpoints of the current experiment. Sorted so that the fold numbers
# printed below are the same on every run (glob order is filesystem-dependent).
# NOTE(review): without recursive=True, '**' behaves like '*', i.e. only
# entries exactly two levels below roberta-base/ are matched.
checkpoint = sorted(glob("../models/roberta-base/**/*"))
model_paths = [x for x in checkpoint if CONFIG["exp_name"] in x]
if not model_paths:
    raise FileNotFoundError(
        f"no checkpoint containing {CONFIG['exp_name']!r} found under ../models/roberta-base/"
    )

fold_preds = []
for i, path in enumerate(model_paths):
    print(f" @@@@@@@@@@@@@@@@@@ fold{i} @@@@@@@@@@@@@@@@@@")
    # load_from_checkpoint is a classmethod: call it on the class rather than
    # building a throwaway model instance first.
    model = JigsawModel.load_from_checkpoint(checkpoint_path=path, model_name=CONFIG['model_name'])
    model.to(device)

    print(f"Getting predictions for model {i+1}")
    fold_preds.append(valid_fn(model, test_loader, device))

    # Release this fold's weights before the next checkpoint is loaded.
    del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Ensemble: average the per-fold predictions for every test comment.
final_preds = np.mean(np.array(fold_preds), axis=0)

Some weights of the model checkpoint at /content/input/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


 @@@@@@@@@@@@@@@@@@ fold0 @@@@@@@@@@@@@@@@@@


Some weights of the model checkpoint at /content/input/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 1


100%|██████████| 236/236 [00:30<00:00,  7.74it/s]


 @@@@@@@@@@@@@@@@@@ fold1 @@@@@@@@@@@@@@@@@@


Some weights of the model checkpoint at /content/input/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 2


100%|██████████| 236/236 [00:30<00:00,  7.74it/s]


 @@@@@@@@@@@@@@@@@@ fold2 @@@@@@@@@@@@@@@@@@


Some weights of the model checkpoint at /content/input/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 3


100%|██████████| 236/236 [00:30<00:00,  7.73it/s]


 @@@@@@@@@@@@@@@@@@ fold3 @@@@@@@@@@@@@@@@@@


Some weights of the model checkpoint at /content/input/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 4


100%|██████████| 236/236 [00:30<00:00,  7.73it/s]


 @@@@@@@@@@@@@@@@@@ fold4 @@@@@@@@@@@@@@@@@@


Some weights of the model checkpoint at /content/input/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 5


100%|██████████| 236/236 [00:30<00:00,  7.73it/s]


In [43]:
# Replace the raw ensemble outputs by their ranks; ties are broken by order of
# appearance (method='first').
score_ranks = pd.Series(final_preds, index=sample_submission.index).rank(method='first')
sample_submission['score'] = score_ranks

In [52]:
exp_name = CONFIG["exp_name"]
# index=False keeps the pandas row index out of the file, so it contains only
# the submission template's columns (the validation predictions are saved the
# same way).
sample_submission.to_csv(f"../outputs/{exp_name}_submission.csv", index=False)

In [47]:
# Sanity check: list the files extracted from the competition archive.
!ls "/content/input/jigsaw-toxic-severity-rating/"

comments_to_score.csv  sample_submission.csv  validation_data.csv


## validation

In [8]:
# Validation comment pairs from the competition data; the 'more_toxic' and
# 'less_toxic' columns are read by the dataset class below.
val_df = pd.read_csv("/content/input/jigsaw-toxic-severity-rating/validation_data.csv")

In [9]:
class ValJigsawDataset(Dataset):
    """Dataset of (more toxic, less toxic) comment pairs, tokenized on access.

    Args:
        df: Frame with ``more_toxic`` and ``less_toxic`` columns.
        tokenizer: Object exposing ``encode_plus``.
        max_length: Length every sequence is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.more_toxic = df['more_toxic'].values
        self.less_toxic = df['less_toxic'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ids/mask long tensors for both comments plus a constant target of 1."""
        sides = (
            ('more_toxic_ids', 'more_toxic_mask', self.more_toxic),
            ('less_toxic_ids', 'less_toxic_mask', self.less_toxic),
        )
        sample = {}
        for ids_key, mask_key, comments in sides:
            encoded = self.tokenizer.encode_plus(
                comments[index],
                truncation=True,
                add_special_tokens=True,
                max_length=self.max_len,
                padding='max_length',
            )
            sample[ids_key] = torch.tensor(encoded['input_ids'], dtype=torch.long)
            sample[mask_key] = torch.tensor(encoded['attention_mask'], dtype=torch.long)

        # The first comment of each pair is the more toxic one, so the ranking
        # target is always 1.
        sample['target'] = torch.tensor(1, dtype=torch.long)
        return sample


In [10]:
# Validation pairs, served in file order (no shuffling) so that predictions can
# be written back next to the rows of val_df.
test_dataset = ValJigsawDataset(
    df=val_df,
    tokenizer=CONFIG['tokenizer'],
    max_length=CONFIG['max_length'],
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=CONFIG['train_batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

In [16]:
@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Score both comments of every pair served by ``dataloader``.

    Args:
        model: Callable module taking ``(ids, mask)``; switched to eval mode.
        dataloader: Sized iterable of batches with ``'less_toxic_ids'``,
            ``'less_toxic_mask'``, ``'more_toxic_ids'`` and
            ``'more_toxic_mask'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(preds_less, preds_more)`` of 1-D NumPy arrays holding the
        flattened outputs for the less-toxic and more-toxic comments in
        iteration order; both are empty when the dataloader yields no batch.
    """
    model.eval()

    preds_less = []
    preds_more = []

    for data in tqdm(dataloader, total=len(dataloader)):
        less_ids = data['less_toxic_ids'].to(device, dtype=torch.long)
        less_mask = data['less_toxic_mask'].to(device, dtype=torch.long)

        more_ids = data['more_toxic_ids'].to(device, dtype=torch.long)
        more_mask = data['more_toxic_mask'].to(device, dtype=torch.long)

        less_outputs = model(less_ids, less_mask)
        preds_less.append(less_outputs.view(-1).cpu().detach().numpy())

        more_outputs = model(more_ids, more_mask)
        preds_more.append(more_outputs.view(-1).cpu().detach().numpy())

    gc.collect()

    # np.concatenate raises on an empty list, so handle "no batches" explicitly.
    if not preds_less:
        return np.empty(0, dtype=np.float32), np.empty(0, dtype=np.float32)

    return np.concatenate(preds_less), np.concatenate(preds_more)

In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Checkpoints of the current experiment. Sorted so that fold i refers to the
# same checkpoint on every run (glob order is filesystem-dependent); the fold
# index is later used to name the prediction columns.
# NOTE(review): without recursive=True, '**' behaves like '*', i.e. only
# entries exactly two levels below roberta-base/ are matched.
checkpoint = sorted(glob("../models/roberta-base/**/*"))
model_paths = [x for x in checkpoint if CONFIG["exp_name"] in x]
if not model_paths:
    raise FileNotFoundError(
        f"no checkpoint containing {CONFIG['exp_name']!r} found under ../models/roberta-base/"
    )

fold_preds_less = []
fold_preds_more = []
for i, path in enumerate(model_paths):
    print(f" @@@@@@@@@@@@@@@@@@ fold{i} @@@@@@@@@@@@@@@@@@")
    # load_from_checkpoint is a classmethod: call it on the class rather than
    # building a throwaway model instance first.
    model = JigsawModel.load_from_checkpoint(checkpoint_path=path, model_name=CONFIG['model_name'])
    model.to(device)

    print(f"Getting predictions for model {i+1}")
    preds_less, preds_more = valid_fn(model, test_loader, device)
    fold_preds_less.append(preds_less)
    fold_preds_more.append(preds_more)

    # Release this fold's weights before the next checkpoint is loaded.
    del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Row f holds the predictions of fold f; the *_mean arrays are the ensemble.
final_preds_less = np.array(fold_preds_less)
final_preds_less_mean = np.mean(final_preds_less, axis=0)

final_preds_more = np.array(fold_preds_more)
final_preds_more_mean = np.mean(final_preds_more, axis=0)

Some weights of the model checkpoint at /content/input/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


 @@@@@@@@@@@@@@@@@@ fold0 @@@@@@@@@@@@@@@@@@


Some weights of the model checkpoint at /content/input/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 1


100%|██████████| 941/941 [03:58<00:00,  3.94it/s]


 @@@@@@@@@@@@@@@@@@ fold1 @@@@@@@@@@@@@@@@@@


Some weights of the model checkpoint at /content/input/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 2


100%|██████████| 941/941 [03:59<00:00,  3.93it/s]


 @@@@@@@@@@@@@@@@@@ fold2 @@@@@@@@@@@@@@@@@@


Some weights of the model checkpoint at /content/input/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 3


100%|██████████| 941/941 [03:59<00:00,  3.93it/s]


 @@@@@@@@@@@@@@@@@@ fold3 @@@@@@@@@@@@@@@@@@


Some weights of the model checkpoint at /content/input/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 4


100%|██████████| 941/941 [03:58<00:00,  3.94it/s]


 @@@@@@@@@@@@@@@@@@ fold4 @@@@@@@@@@@@@@@@@@


Some weights of the model checkpoint at /content/input/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 5


100%|██████████| 941/941 [03:59<00:00,  3.94it/s]


In [20]:
exp_name = CONFIG["exp_name"]

# Attach each fold's scores for the less-toxic and more-toxic comment as a
# pair of columns, then store the frame next to the other experiment outputs.
for fold, (less_scores, more_scores) in enumerate(zip(final_preds_less, final_preds_more)):
    val_df[f"pred_less_toxicfold{fold}"] = less_scores
    val_df[f"pred_more_toxicfold{fold}"] = more_scores

val_df.to_csv(f"../outputs/{exp_name}.csv", index=False)

Unnamed: 0,worker,less_toxic,more_toxic,pred_less_toxicfold0,pred_more_toxicfold0,pred_less_toxicfold1,pred_more_toxicfold1,pred_less_toxicfold2,pred_more_toxicfold2,pred_less_toxicfold3,pred_more_toxicfold3,pred_less_toxicfold4,pred_more_toxicfold4
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...,-0.332444,-0.224163,-0.077043,0.162932,0.005236,0.126551,0.289431,0.436869,-0.390523,-0.002944
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...,-0.467329,-0.054671,-0.301800,0.001471,-0.304686,0.261400,-0.224889,0.337132,-0.473689,0.078186
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu...",-0.553231,-0.062983,-0.327746,-0.100070,-0.125925,0.137143,0.211031,0.329634,-0.309552,-0.051786
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...,0.019429,0.585690,0.154760,0.462644,0.217152,0.808583,0.456578,0.760138,0.027323,0.326627
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist",-0.026947,0.093572,-0.096769,-0.038192,0.265181,0.794434,0.393926,0.602004,0.093472,0.228936
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30103,461,I'm sorry. I'm not an admin. I will give you t...,get out my large penis,-0.584017,0.381515,-0.310198,0.415605,-0.180809,0.808160,0.168984,0.916185,-0.506556,0.293530
30104,527,I'm sorry. I'm not an admin. I will give you t...,get out my large penis,-0.584017,0.381515,-0.310198,0.415605,-0.180809,0.808160,0.168984,0.916185,-0.506556,0.293530
30105,352,"wow...\nare you out of your mind, how was my e...",Piss off you slant eyed-gook,-0.644852,0.110157,-0.489393,0.061433,-0.363664,0.515793,-0.321488,0.466005,-0.619558,0.054510
30106,311,"wow...\nare you out of your mind, how was my e...",Piss off you slant eyed-gook,-0.644852,0.110157,-0.489393,0.061433,-0.363664,0.515793,-0.321488,0.466005,-0.619558,0.054510
