# Condemnation Inference
This notebook will hold the code for inferring condemnation on all data

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import pickle
import pandas as pd
from datasets import Value
from torch.utils.data import DataLoader
import argparse
import os
from pymongo import MongoClient
import IPython
from tqdm import tqdm

## Loading 

In [3]:
client = MongoClient()
db_metoo_tweets = client["jason_twitter"]
metoo_tweets = db_metoo_tweets.tweets

In [4]:
tweet_cursor = metoo_tweets.find()

In [5]:

checkpoint = "bert-base-uncased"
model_path = "./models/hf"

# model = torch.load(model_path)
# model.eval()

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

In [6]:
def split_list(cursor, n):
        result = []
        for tweet in tqdm(cursor):
            result.append(tweet)
            if len(result)==n:
                result_to_return = result
                result = []
                yield result_to_return
        yield result
        
def update_tweet_in_db( document):
    try:
        metoo_tweets.update_one(
            {'_id': document['_id']},
            {'$set': document}
        )
    except Exception:
        print("couldn't update ", document)

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="exp/bart/results",
    do_train=False,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=1,
)
# training_args = TrainingArguments("test-trainer")
# training_args.eval_accumulation_steps = 1  # pushes predictions out of GPU to mitigate GPU out of memory

trainer = Trainer(
    model = AutoModelForSequenceClassification.from_pretrained("models/hf/hf_fold_{}_model./"),
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
n=100000
target_col = "clean_tweet"
for idx, chunk in tqdm(enumerate(split_list(tweet_cursor, n))):
        df = pd.DataFrame(chunk)
        if not len(df):
            break
        pred_df = df[[target_col]].dropna()
        pred_data = df.dropna(subset=[target_col])
        pred_dataset = Dataset.from_pandas(pred_df)
        pred_dataset = pred_dataset.rename_column(target_col, "text")
        tokenized_datasets = pred_dataset.map(tokenize_function, batched=True)
        predictions_logits = trainer.predict(tokenized_datasets)
        preds = np.argmax(predictions_logits.predictions, axis = 1)
        save_df = pred_data[["_id", "clean_tweet"]]
        save_df["condemnation_prediction"] = preds
        save_df["condemnation_logit_0"] = predictions_logits.predictions[:, 0]
        save_df["condemnation_logit_1"] = predictions_logits.predictions[:, 1]
        del tokenized_datasets
        del pred_dataset
        for idx, row in tqdm(save_df.iterrows()):
            update_tweet_in_db(row.to_dict())
        

0it [00:00, ?it/s]
0it [00:00, ?it/s][A
102it [00:00, 230.14it/s][A
3360it [00:00, 4799.95it/s][A
6887it [00:01, 6321.49it/s][A
10003it [00:01, 6390.82it/s][A
13238it [00:02, 6159.03it/s][A
16408it [00:02, 5601.37it/s][A
19612it [00:03, 7049.39it/s][A
22795it [00:03, 5652.12it/s][A
25961it [00:04, 7040.67it/s][A
29123it [00:04, 8485.89it/s][A
32250it [00:05, 5688.13it/s][A
35412it [00:05, 7063.42it/s][A
38532it [00:05, 8397.69it/s][A
41667it [00:06, 5147.42it/s][A
44812it [00:07, 6459.06it/s][A
47910it [00:07, 7808.83it/s][A
51067it [00:07, 9190.99it/s][A
54221it [00:08, 4762.81it/s][A
57435it [00:09, 6100.48it/s][A
60677it [00:09, 7607.04it/s][A
63940it [00:09, 9181.44it/s][A
67241it [00:11, 4309.81it/s][A
70601it [00:11, 5635.92it/s][A
73878it [00:11, 7095.24it/s][A
77122it [00:11, 8587.69it/s][A
80440it [00:11, 10229.90it/s][A
83764it [00:12, 11769.96it/s][A
87085it [00:14, 4040.31it/s] [A
90446it [00:14, 5268.29it/s][A
93664it [00:14, 6574.40it/s][A
9

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
1it [05:18, 318.81s/it]
100000it [05:18, 35.20it/s] [A
100148it [05:19, 35.86it/s][A
103375it [05:19, 57.35it/s][A
106558it [05:19, 87.55i

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]


3897411it [2:54:02, 15076.78it/s][AThe following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A v

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy o

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128

5699686it [4:17:34, 14556.87it/s][A

In [71]:
del pred_dataset

In [59]:
len(df)

0

In [54]:
update_tweet_in_db(row.to_dict())

In [27]:
pred_df

Unnamed: 0,clean_tweet
0,<TWEET>: Olivia Munn is promoting 'Predator' e...
1,<TWEET>: Continua la polémica con la película ...
2,<TWEET>: Olivia Munn says 'Predator' cast shun...
3,<TWEET>: It would appear that society allows m...
4,<TWEET>: 'The Predator': Shane Black Apologize...
...,...
95,"<TWEET>: La compañía llegó a esta conclusión, ..."
96,<TWEET>: .@20thcenturyfox quitó una escena con...
97,"<TWEET>: Just hours before the premiere of ""Th..."
98,<TWEET>: .@20thcenturyfox quitó una escena con...


In [31]:
pred_dataset

Dataset({
    features: ['text', '__index_level_0__'],
    num_rows: 100
})

  0%|          | 0/1 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file models/hf/hf_fold_{}_model./config.json
Model config BertConfig {
  "_name_or_path": "models/hf/hf_fold_{}_model./",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100
  Batch size = 128


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [45]:
save_df

Unnamed: 0,_id,clean_tweet,condemnation_prediction,condemnation_logit_0,condemnation_logit_1
0,6383e2adfa2b796ff3841c9d,<TWEET>: Olivia Munn is promoting 'Predator' e...,1,-1.078831,1.067021
1,6383e2adfa2b796ff3841c9e,<TWEET>: Continua la polémica con la película ...,0,0.857920,-0.953911
2,6383e2adfa2b796ff3841c9f,<TWEET>: Olivia Munn says 'Predator' cast shun...,1,-0.124301,-0.101442
3,6383e2adfa2b796ff3841ca0,<TWEET>: It would appear that society allows m...,1,-0.949958,0.924150
4,6383e2adfa2b796ff3841ca1,<TWEET>: 'The Predator': Shane Black Apologize...,0,0.188017,-0.208660
...,...,...,...,...,...
95,6383e2adfa2b796ff3841cfc,"<TWEET>: La compañía llegó a esta conclusión, ...",0,1.618347,-1.706561
96,6383e2adfa2b796ff3841cfd,<TWEET>: .@20thcenturyfox quitó una escena con...,0,1.414179,-1.495095
97,6383e2adfa2b796ff3841cfe,"<TWEET>: Just hours before the premiere of ""Th...",0,0.418434,-0.595623
98,6383e2adfa2b796ff3841cff,<TWEET>: .@20thcenturyfox quitó una escena con...,0,1.414179,-1.495095


100it [00:00, 1374.38it/s]


"<TWEET>: Teen in Steven Wilder Striegel's 2010 sex abuse case comes forward to 'reclaim my identity'"

In [11]:
model_path = "./models/fold_1_model.p"
model = torch.load(model_path)
model.eval()
# model.to("cuda:0")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

In [None]:
for tweet in tweet_cursor:
    break

In [9]:
tweet["clean_tweet"]

"<TWEET>: Teen in Steven Wilder Striegel's 2010 sex abuse case comes forward to 'reclaim my identity'"

In [12]:
pipe(tweet["clean_tweet"])

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)

In [51]:
counter = 0
for tweet in tqdm(tweet_cursor):
#     print(tweet)
    if "clean_tweet_masked" not in tweet:
        continue
    else:
        pipe(tweet["clean_tweet_masked"])

0it [00:00, ?it/s]


TypeError: new(): invalid data type 'str'

In [15]:
tweet["clean_tweet_masked"]

'<TWEET>: RT @rorrobracho: Osea, según este diario "se nos cayó el ídolo" porque <TARGET 1> salió del closet y no porque le acusaron de acoso a un…'

In [16]:
tweet["favoritesCount"]

55

In [17]:
tweet["retweetCount"]

71

In [19]:
tweet["time_check"]

{'spacey': {'valid': True,
  'public_date': datetime.datetime(2017, 10, 29, 0, 0)}}

In [20]:
tweet["postedTime"]

datetime.datetime(2017, 10, 31, 17, 2, 35)

In [18]:
tweet.keys()

dict_keys(['_id', 'body', 'postedTime', 'retweetCount', 'favoritesCount', 'quoted_status_id', 'quoted_status_user_id', 'quoted_status_body', 'quoted_status_user_postedTime', 'gnip_url_title', 'gnip_url_description', 'is_RT', 'RT_body', 'RT_user_id', 'RT_id', 'tweet_id', 'user_id', 'RT_target_mentions', 'body_target_mentions', 'body_target_mentions_validated', 'body_target_mentions_validated_true', 'gnip_url_title_mentions', 'lang_pred', 'lang_pred_prob', 'quoted_status_target_mentions', 'clean_targets_n', 'clean_tweet', 'clean_tweet_masked', 'mask_map', 'modified_quote_tweet', 'time_check'])

In [7]:
file = open('./condemnation_predictions.p', 'rb')
preds = pickle.load(file)
file.close()
preds.predictions

array([[-1.005583  ,  0.9191936 ],
       [-0.9339803 ,  0.862211  ],
       [-1.4016373 ,  1.344614  ],
       ...,
       [-0.13647401,  0.00870189],
       [-1.9797113 ,  1.8827175 ],
       [-0.26315764,  0.21105824]], dtype=float32)

In [2]:
file = open('../../data/5_mil_7days_metoo.p', 'rb')
pred_data = pickle.load(file)
file.close()

In [3]:
pred_df = pred_data[["clean_tweet_masked"]].dropna()
pred_df

Unnamed: 0,clean_tweet_masked
0,"<TWEET>: » “I do love you. . I love you, <TARG..."
4,<TWEET>: <TARGET 1> apologizes for 'aggressive...
5,<TWEET>: No one should have to endure this kin...
6,"<TWEET>: ""New <TARGET 1> sexual assault accusa..."
8,"<TWEET>: Yes this. <TARGET 1>, this clown, the..."
...,...
4683897,<TWEET>: On the one year anniversary of the Ac...
4683898,<TWEET>: <TARGET 1>'s photobombing. The ladies...
4683900,<TWEET>: But but but dude you're a rapist HOW ...
4683901,"<TWEET>: Lisa Bloom, Lawyer Advising <TARGET 1..."


# Trying Trainer

In [25]:
model_path = "./models/fold_1_model.p"
model = torch.load(model_path)
model.eval()
# model.to("cpu")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [34]:
model.to("cpu")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [35]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [36]:
tokenizer(tweet["clean_tweet_masked"])

{'input_ids': [101, 1026, 1056, 28394, 2102, 1028, 1024, 19387, 1030, 20996, 18933, 10024, 9905, 1024, 9808, 5243, 1010, 7367, 12734, 28517, 22939, 9488, 1000, 7367, 16839, 6187, 7677, 3449, 10282, 2080, 1000, 18499, 4226, 1026, 4539, 1015, 1028, 16183, 3695, 3972, 9346, 1061, 2053, 18499, 4226, 3393, 9353, 10383, 4948, 2139, 9353, 19137, 1037, 4895, 1529, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [37]:
from transformers import TextClassificationPipeline

In [38]:
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

In [39]:
pipe(tweet["clean_tweet_masked"])

[[{'label': 'LABEL_0', 'score': 0.9484457969665527},
  {'label': 'LABEL_1', 'score': 0.051554203033447266}]]

In [5]:
from datasets import Value
pred_dataset = Dataset.from_pandas(pred_df)
pred_dataset = pred_dataset.rename_column("clean_tweet_masked", "text")
new_features = pred_dataset.features.copy()
# new_features["text"] = Value('string')
pred_dataset = pred_dataset.cast(new_features)

  0%|          | 0/280 [00:00<?, ?ba/s]

In [6]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_datasets = pred_dataset.map(tokenize_function, batched=True)

  0%|          | 0/2798 [00:00<?, ?ba/s]

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
from transformers import Trainer
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="exp/bart/results",
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=1,
)
training_args = TrainingArguments("test-trainer")
training_args.eval_accumulation_steps=1 #pushes predictions out of GPU to mitigate GPU out of memory

trainer = Trainer(
    model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    
)

In [10]:
predictions = trainer.predict(tokenized_datasets[:100])

***** Running Prediction *****
  Num examples = 5
  Batch size = 8


KeyError: 0

In [None]:
with open('./condemnation_predictions', 'wb') as f:
    pickle.dump(predictions, f)

# Trying Pipeline

In [5]:
import datasets
from transformers import pipeline
from transformers.pipelines.base import *
from tqdm.auto import tqdm

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [9]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [18]:
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer,  device=0)

In [11]:
pred_df = pred_df.dropna(subset=["clean_tweet_masked"])

In [12]:
pred_df

Unnamed: 0,clean_tweet_masked
0,"<TWEET>: » “I do love you. . I love you, <TARG..."
4,<TWEET>: <TARGET 1> apologizes for 'aggressive...
5,<TWEET>: No one should have to endure this kin...
6,"<TWEET>: ""New <TARGET 1> sexual assault accusa..."
8,"<TWEET>: Yes this. <TARGET 1>, this clown, the..."
...,...
4683897,<TWEET>: On the one year anniversary of the Ac...
4683898,<TWEET>: <TARGET 1>'s photobombing. The ladies...
4683900,<TWEET>: But but but dude you're a rapist HOW ...
4683901,"<TWEET>: Lisa Bloom, Lawyer Advising <TARGET 1..."


In [23]:
from datasets import Value
pred_dataset = Dataset.from_pandas(pred_df)
pred_dataset = pred_dataset.rename_column("clean_tweet_masked", "text")
new_features = pred_dataset.features.copy()
# new_features["text"] = Value('string')
pred_dataset = pred_dataset.cast(new_features)

  0%|          | 0/280 [00:00<?, ?ba/s]

In [29]:
type(pred_dataset["text"][:10][0])

str

In [None]:
batch_size=8
for out in tqdm(pipe(pred_dataset["text"]), total=len(pred_dataset)):
    print(out)
    break

In [15]:
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

In [16]:
for chunk in chunker(pred_dataset["text"],):
    print(type(chunk))
    break

<class 'list'>


In [26]:
labels = []
i = 1
for chunk in tqdm(chunker(pred_dataset["text"],1)):
    labels.extend(pipe(chunk))
#     torch.cuda.empty_cache()
    i+=1
    if i%100:
        torch.cuda.empty_cache()

0it [00:00, ?it/s]

KeyboardInterrupt: 

In [20]:
pipe(pred_dataset["text"][:32])

[{'label': 'LABEL_1', 'score': 0.8726699948310852},
 {'label': 'LABEL_1', 'score': 0.8576847314834595},
 {'label': 'LABEL_1', 'score': 0.9397012591362},
 {'label': 'LABEL_0', 'score': 0.9323407411575317},
 {'label': 'LABEL_1', 'score': 0.966101348400116},
 {'label': 'LABEL_1', 'score': 0.5745621919631958},
 {'label': 'LABEL_1', 'score': 0.5133087038993835},
 {'label': 'LABEL_1', 'score': 0.5648785829544067},
 {'label': 'LABEL_1', 'score': 0.9608886241912842},
 {'label': 'LABEL_1', 'score': 0.9589126110076904},
 {'label': 'LABEL_1', 'score': 0.9688040018081665},
 {'label': 'LABEL_1', 'score': 0.96010822057724},
 {'label': 'LABEL_0', 'score': 0.789070725440979},
 {'label': 'LABEL_1', 'score': 0.8477528691291809},
 {'label': 'LABEL_0', 'score': 0.5989922285079956},
 {'label': 'LABEL_1', 'score': 0.5745621919631958},
 {'label': 'LABEL_1', 'score': 0.971645712852478},
 {'label': 'LABEL_1', 'score': 0.9548541903495789},
 {'label': 'LABEL_1', 'score': 0.9701395034790039},
 {'label': 'LABEL_0'

In [21]:
pred_dataset["text"][:32]

['<TWEET>: » “I do love you. . I love you, <TARGET 1>. .”',
 "<TWEET>: <TARGET 1> apologizes for 'aggressive and crude' behavior toward women… URL bench #Lifestyle",
 '<TWEET>: No one should have to endure this kind of torture... Incredibly brave of @AnnabellSciorra to share her story of <TARGET 1>’s abuse URL\n\n<QUOTED TWEET>: Sciorra was still living in fear of <TARGET 1>, she said, and slept with a baseball bat by her bed. URL',
 '<TWEET>: "New <TARGET 1> sexual assault accusations emerge" via FOX NEWS URL URL',
 '<TWEET>: Yes this. <TARGET 1>, this clown, the entire frat house at Fox. How could we expect first female presidential candidate to get a fair shake?',
 '<TWEET>: Over 300 Claims Against <TARGET 1>! URL',
 '<TWEET>: Over 300 Claims Against <TARGET 1>! URL URL',
 '<TWEET>: Rose McGowan accuses <TARGET 1> of Rape - News Today - Breaking News URL via @YouTube',
 '<TWEET>: <TARGET 1>, Hefner and the Poor Excuse that Explains a Lot via @NYTimes URL',
 '<TWEET>: Ellen sexual ha

0it [00:00, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 7.80 GiB total capacity; 597.34 MiB already allocated; 9.56 MiB free; 660.00 MiB reserved in total by PyTorch)

In [27]:
dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /home/geev/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /home/geev/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a. Subsequent calls will reuse this data.


['<TWEET>: » “I do love you. . I love you, <TARGET 1>. .”',
 "<TWEET>: <TARGET 1> apologizes for 'aggressive and crude' behavior toward women… URL bench #Lifestyle",
 '<TWEET>: No one should have to endure this kind of torture... Incredibly brave of @AnnabellSciorra to share her story of <TARGET 1>’s abuse URL\n\n<QUOTED TWEET>: Sciorra was still living in fear of <TARGET 1>, she said, and slept with a baseball bat by her bed. URL',
 '<TWEET>: "New <TARGET 1> sexual assault accusations emerge" via FOX NEWS URL URL',
 '<TWEET>: Yes this. <TARGET 1>, this clown, the entire frat house at Fox. How could we expect first female presidential candidate to get a fair shake?',
 '<TWEET>: Over 300 Claims Against <TARGET 1>! URL',
 '<TWEET>: Over 300 Claims Against <TARGET 1>! URL URL',
 '<TWEET>: Rose McGowan accuses <TARGET 1> of Rape - News Today - Breaking News URL via @YouTube',
 '<TWEET>: <TARGET 1>, Hefner and the Poor Excuse that Explains a Lot via @NYTimes URL',
 '<TWEET>: Ellen sexual ha

In [29]:
dataset[0]

{'text': 'I admit, the great majority of films released before say 1933 are just not for me. Of the dozen or so "major" silents I have viewed, one I loved (The Crowd), and two were very good (The Last Command and City Lights, that latter Chaplin circa 1931).<br /><br />So I was apprehensive about this one, and humor is often difficult to appreciate (uh, enjoy) decades later. I did like the lead actors, but thought little of the film.<br /><br />One intriguing sequence. Early on, the guys are supposed to get "de-loused" and for about three minutes, fully dressed, do some schtick. In the background, perhaps three dozen men pass by, all naked, white and black (WWI ?), and for most, their butts, part or full backside, are shown. Was this an early variation of beefcake courtesy of Howard Hughes?',
 'label': -1}

In [26]:
pred_dataset

Dataset({
    features: ['tweet_id', 'user_id', 'text', 'body_target_mentions_validated', '__index_level_0__'],
    num_rows: 2061369
})

In [15]:
from torch.utils.data import Dataset
class MyDataSet(Dataset):
    def __init__ (self, df, X_col):
        '''
        So that we can move the entire dataset to the GPU.
        :param X: float32 data scaled numpy array
        :param y: float32 data scaled numpy vector
        :param device: 'cpu' or 'cuda:0'
        '''
        self.X = list(df[X_col])
        # y vector needs to be in a column vector (or at least it
        # did in the normal dataset.)
#         self.y = torch.from_numpy(df[y_col])

    def __len__(self):
        return list(self.X.size())[0]

    def __getitem__(self, item):
        return self.X[item]

In [16]:
X_col = "clean_tweet_masked"

In [17]:
pred_df[X_col]

0          <TWEET>: » “I do love you. . I love you, <TARG...
4          <TWEET>: <TARGET 1> apologizes for 'aggressive...
5          <TWEET>: No one should have to endure this kin...
6          <TWEET>: "New <TARGET 1> sexual assault accusa...
8          <TWEET>: Yes this. <TARGET 1>, this clown, the...
                                 ...                        
4683894    <TWEET>: Democratic National Committee to Give...
4683895    <TWEET>: We are also tired of Hollywood; they ...
4683898    <TWEET>: <TARGET 1>'s photobombing. The ladies...
4683901    <TWEET>: Lisa Bloom, Lawyer Advising <TARGET 1...
4683902    <TWEET>: Don Jr. trolls silent Hillary over <T...
Name: clean_tweet_masked, Length: 2061369, dtype: object

In [18]:
pred_dataset = MyDataSet(pred_df, "clean_tweet_masked")

In [19]:
loader = DataLoader(pred_dataset,batch_size=64,shuffle=False, num_workers=1)

In [20]:
for X in loader:
    print(X)
    print("-------")

AttributeError: 'list' object has no attribute 'size'

In [54]:
pred_data.columns

Index(['_id', 'body', 'postedTime', 'retweetCount', 'favoritesCount',
       'quoted_status_id', 'quoted_status_user_id', 'quoted_status_body',
       'quoted_status_user_postedTime', 'gnip_url_title',
       'gnip_url_description', 'is_RT', 'RT_body', 'RT_user_id', 'RT_id',
       'tweet_id', 'user_id', 'RT_target_mentions', 'body_target_mentions',
       'gnip_url_title_mentions', 'lang_pred', 'lang_pred_prob',
       'quoted_status_target_mentions', 'RT_target_mentions_metoo',
       'all_fields_n_targets', 'body_target_mentions_metoo',
       'body_target_mentions_n_targets', 'body_target_mentions_target',
       'gnip_url_title_mentions_metoo', 'quoted_status_target_mentions_metoo',
       'masked_body', 'propn_masked_body', 'body_target_mentions_validated',
       'body_target_mentions_validated_true', 'clean_targets_n', 'clean_tweet',
       'clean_tweet_masked', 'mask_map', 'modified_quote_tweet', 'time_check',
       'all_target_mentions_metoo', 'gnip_url_title_mentions_n_targ

In [None]:
pipe()

In [55]:
cleaned_masked_texts = pred_data.clean_tweet_masked
texts

0          <TWEET>: » “I do love you. . I love you, <TARG...
1                                                        NaN
2                                                        NaN
3                                                        NaN
4          <TWEET>: <TARGET 1> apologizes for 'aggressive...
                                 ...                        
4683900    <TWEET>: But but but dude you're a rapist HOW ...
4683901    <TWEET>: Lisa Bloom, Lawyer Advising <TARGET 1...
4683902    <TWEET>: Don Jr. trolls silent Hillary over <T...
4683903                                                  NaN
4683904                                                  NaN
Name: clean_tweet_masked, Length: 4683905, dtype: object

In [56]:
pred_dataset = Dataset.from_pandas(pred_df)

In [57]:
pred_dataset

Dataset({
    features: ['tweet_id', 'user_id', 'clean_tweet_masked', 'body_target_mentions_validated', '__index_level_0__'],
    num_rows: 2061369
})

In [66]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    return tokenizer(example["clean_tweet_masked"], truncation=True)

tokenized_datasets = pred_dataset.map(tokenize_function, batched=True)


  0%|          | 0/2062 [00:00<?, ?ba/s]

AttributeError: 'DataCollatorWithPadding' object has no attribute 'cpu'

In [67]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Loading the model

In [68]:
tokenized_datasets

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'body_target_mentions_validated', 'clean_tweet_masked', 'input_ids', 'token_type_ids', 'tweet_id', 'user_id'],
    num_rows: 2061369
})

In [21]:
model_path = "./sample_model.p"
model = torch.load(model_path)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [47]:
tokenized_datasets

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'body_target_mentions_validated', 'input_ids', 'token_type_ids', 'tweet_id', 'user_id'],
    num_rows: 2061369
})

In [48]:
tokenized_datasets = tokenized_datasets.remove_columns(["body_target_mentions_validated"])
# tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

In [49]:
test_dataloader = DataLoader(tokenized_datasets, shuffle=False, batch_size=8)

In [50]:
for batch in test_dataloader:
    print(batch)
    break

TypeError: new(): invalid data type 'numpy.str_'