In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import pickle
import pandas as pd
from datasets import Value
from torch.utils.data import DataLoader
import argparse
import os
from pymongo import MongoClient
import IPython
from tqdm import tqdm

In [2]:
client = MongoClient()
db_metoo_tweets = client["jason_twitter"]
metoo_tweets = db_metoo_tweets.tweets

In [6]:
tweet_cursor = metoo_tweets.find({"condemnation_prediction":1})

In [7]:
checkpoint = "bert-base-uncased"
model_path = "./models/hf"

# model = torch.load(model_path)
# model.eval()

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

In [8]:
def split_list(cursor, n):
        result = []
        for tweet in tqdm(cursor):
            result.append(tweet)
            if len(result)==n:
                result_to_return = result
                result = []
                yield result_to_return
        yield result
        
def update_tweet_in_db( document):
    try:
        metoo_tweets.update_one(
            {'_id': document['_id']},
            {'$set': document}
        )
    except Exception:
        print("couldn't update ", document)

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="exp/bart/results",
    do_train=False,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=1,
)
# training_args = TrainingArguments("test-trainer")
# training_args.eval_accumulation_steps = 1  # pushes predictions out of GPU to mitigate GPU out of memory

trainer = Trainer(
    model = AutoModelForSequenceClassification.from_pretrained("models/hf/hf_fold_1_model"),
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [13]:
n=100000
target_col = "clean_tweet"
for idx, chunk in tqdm(enumerate(split_list(tweet_cursor, n))):
        df = pd.DataFrame(chunk)
        if not len(df):
            break
        pred_df = df[[target_col]].dropna()
        pred_data = df.dropna(subset=[target_col])
        pred_dataset = Dataset.from_pandas(pred_df)
        pred_dataset = pred_dataset.rename_column(target_col, "text")
        tokenized_datasets = pred_dataset.map(tokenize_function, batched=True)
        predictions_logits = trainer.predict(tokenized_datasets)
        preds = np.argmax(predictions_logits.predictions, axis = 1)
        save_df = pred_data[["_id", "clean_tweet"]]
        save_df["severity_prediction"] = preds
        save_df["severity_logit_0"] = predictions_logits.predictions[:, 0]
        save_df["severity_logit_1"] = predictions_logits.predictions[:, 1]
        save_df["severity_logit_2"] = predictions_logits.predictions[:, 1]
    
        del tokenized_datasets
        del pred_dataset
        for idx, row in tqdm(save_df.iterrows()):
            update_tweet_in_db(row.to_dict())
        

0it [00:00, ?it/s]
0it [00:00, ?it/s][A
102it [00:00, 184.07it/s][A
3547it [00:01, 4008.09it/s][A
6638it [00:01, 4840.61it/s][A
9732it [00:01, 6598.38it/s][A
12829it [00:02, 5849.06it/s][A
15939it [00:03, 5065.47it/s][A
19167it [00:03, 6477.35it/s][A
22427it [00:04, 5255.63it/s][A
25701it [00:04, 6602.52it/s][A
28961it [00:04, 7641.65it/s][A
32151it [00:05, 5234.58it/s][A
35338it [00:06, 6238.40it/s][A
38617it [00:06, 7290.98it/s][A
41889it [00:07, 4520.23it/s][A
45149it [00:08, 5576.97it/s][A
48319it [00:08, 6683.60it/s][A
51401it [00:08, 7761.89it/s][A
54609it [00:10, 4351.20it/s][A
57887it [00:10, 5577.87it/s][A
61166it [00:10, 6895.77it/s][A
64517it [00:10, 8327.66it/s][A
67957it [00:10, 9675.02it/s][A
71353it [00:12, 4244.48it/s][A
74757it [00:12, 5432.61it/s][A
78113it [00:13, 6769.50it/s][A
81474it [00:13, 8139.28it/s][A
84890it [00:13, 9593.68it/s][A
88266it [00:13, 10766.77it/s][A
91656it [00:16, 3707.29it/s] [A
95027it [00:16, 4800.72it/s][A
983

  0%|          | 0/100 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128



0it [03:11, ?it/s]35.22it/s][A
99999it [03:11, 523.14it/s] 


In [25]:
save_df = pred_data[["_id", "clean_tweet"]]
save_df["severity_prediction"] = preds
save_df["severity_logit_0"] = predictions_logits.predictions[:, 0]
save_df["severity_logit_1"] = predictions_logits.predictions[:, 1]
save_df["severity_logit_2"] = predictions_logits.predictions[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [26]:
save_df

Unnamed: 0,_id,clean_tweet,severity_prediction,severity_logit_0,severity_logit_1,severity_logit_2
0,6383e2adfa2b796ff3841c3f,<TWEET>: The latest #Predator movie raised a l...,1,-1.418743,0.922649,0.922649
1,6383e2adfa2b796ff3841c46,<TWEET>: Don't cast your child molesting pedop...,2,-1.805397,0.236971,0.236971
2,6383e2adfa2b796ff3841c4b,<TWEET>: I just saw #ThePredator’s promoted tw...,1,-1.350736,0.753730,0.753730
3,6383e2adfa2b796ff3841c4c,<TWEET>: Perhaps if Steven Wilder Striegel had...,1,-0.378258,0.814329,0.814329
4,6383e2adfa2b796ff3841c4e,"<TWEET>: The former teen known as Jane Doe, no...",1,-0.725376,1.061773,1.061773
...,...,...,...,...,...,...
99995,6383e319fa2b796ff38a1f27,"<TWEET>: Wait, so Kevin Spacey's response to b...",1,-0.849389,0.973997,0.973997
99996,6383e319fa2b796ff38a1f29,<TWEET>: I see Kevin Spacey has come out as a ...,1,-1.355826,0.965941,0.965941
99997,6383e319fa2b796ff38a1f2e,<TWEET>: Why does Kevin Spacey have to be ruin...,1,-0.044263,0.708245,0.708245
99998,6383e319fa2b796ff38a1f31,<TWEET>: Bingo!(this is the 1 that came forwar...,1,-1.326478,1.006863,1.006863


In [24]:
preds[:400]

array([1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1,
       1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1,
       1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2,
       2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2,
       1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2,
       2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2,
       2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 2,
       1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 1, 0, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2,
       2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1,
       2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1,

In [14]:
predictions_logits

PredictionOutput(predictions=array([[-1.4187431 ,  0.9226494 , -0.09052195],
       [-1.8053972 ,  0.23697093,  1.5364958 ],
       [-1.3507361 ,  0.7537301 ,  0.0857865 ],
       ...,
       [-0.04426257,  0.7082449 , -0.8602782 ],
       [-1.3264781 ,  1.0068629 ,  0.06774779],
       [-0.56836873,  0.97940767, -0.70287985]], dtype=float32), label_ids=None, metrics={'test_runtime': 169.983, 'test_samples_per_second': 588.294, 'test_steps_per_second': 2.3})

In [4]:
file = open('../../data/5_mil_7days_metoo.p', 'rb')
pred_data = pickle.load(file)
file.close()
pred_df = pred_data[["clean_tweet_masked"]].dropna()
pred_df

Unnamed: 0,clean_tweet_masked
0,"<TWEET>: » “I do love you. . I love you, <TARG..."
4,<TWEET>: <TARGET 1> apologizes for 'aggressive...
5,<TWEET>: No one should have to endure this kin...
6,"<TWEET>: ""New <TARGET 1> sexual assault accusa..."
8,"<TWEET>: Yes this. <TARGET 1>, this clown, the..."
...,...
4683897,<TWEET>: On the one year anniversary of the Ac...
4683898,<TWEET>: <TARGET 1>'s photobombing. The ladies...
4683900,<TWEET>: But but but dude you're a rapist HOW ...
4683901,"<TWEET>: Lisa Bloom, Lawyer Advising <TARGET 1..."


In [6]:
pred_df = pred_df.head(1000)

In [7]:
from datasets import Value
pred_dataset = Dataset.from_pandas(pred_df)
pred_dataset = pred_dataset.rename_column("clean_tweet_masked", "text")

In [8]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)
tokenized_datasets = pred_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
model_path = "./sample_model.p"
model = torch.load(model_path)
model.eval()
# model.to("cpu")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [13]:
from transformers import Trainer
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="exp/bart/results",
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=1,
)
training_args = TrainingArguments("test-trainer")
training_args.eval_accumulation_steps=1 #pushes predictions out of GPU to mitigate GPU out of memory

trainer = Trainer(
    model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [15]:
predictions = trainer.predict(tokenized_datasets)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 8


In [None]:
with open('./severity_predictions', 'wb') as f:
    pickle.dump(predictions, f)