In [1]:
from main import CollocateTokenSelector, BERTReplacer, BERTTokenSelector
from main import Humourizer

import pandas as pd

import numpy as np

In [3]:
token_selector = CollocateTokenSelector('CM_SpaCy_truecased')
token_replacer = BERTReplacer(bert_model_path='bert_masked_lm_full_model',
                             bert_tokenizer="bert-large-uncased-whole-word-masking",
                             verbose=True, k=3)
humourizer = Humourizer(token_selector, token_replacer, verbose=True, score=True)

In [2]:
df = pd.read_excel("thousand_lines_truecased.xlsx", engine="openpyxl",
                  index_col='Unnamed: 0').dropna(subset=['headline'])

In [4]:
df.head()

Unnamed: 0,id,title_orig_case,content,source,headline
58347,52354,Seattle synagogue vandalism denies Holocaust,[ (CNN)A synagogue in Seattle has become the l...,AllTheNewsComponentsOne,Seattle synagogue vandalism DENIES holocaust
71341,65485,Ford just invested $1 billion in a secretive A...,"['', 'Ford is investing $1 billion in a secret...",AllTheNewsComponentsOne,Ford just invested$ 1 billion in a secretive a...
55118,49079,'Star Wars' Actress Carrie Fisher Dies at Age ...,"Carrie Fisher, the actress best known for play...",AllTheNewsComponentsOne,' Star wars' actress Carrie Fisher dies at age...
102753,108886,Here Are All Of The Victims In The Orlando Ni...,[' At least 50 people were killed and 53 injur...,AllTheNewsComponentsOne,Here are all of the victims in the Orlando nig...
328613,168132,"Despite The Math, Bernie Sanders Has Already Won",Bernie Sanders scored victories Saturday in ...,AllTheNewsKaggle,"Despite the math, Bernie Sanders has already won"


In [6]:
headlines = df['headline'].sample(n=100, random_state=42)

In [11]:
%%time
df_out = humourizer.vandalize_headlines(headlines, return_pandas=True)

Making predictions...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 621/621 [07:35<00:00,  1.36it/s]


Scoring examples...


  0%|                                                                                                                                                  | 0/635 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 635/635 [09:38<00:00,  1.10it/s]


Wall time: 21min 11s


А с BERTSelector'ом?

In [4]:
token_selector = BERTTokenSelector(bert_selector_model='bert-large-uncased-whole-word-masking',
                                   bert_tokenizer='bert-large-uncased-whole-word-masking',
                                   colloc_thresh=0.3, verbose=True)
humourizer = Humourizer(token_selector, token_replacer, verbose=True, score=True)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
%%time
df_out = humourizer.vandalize_headlines(headlines, return_pandas=True)

  2%|█▎                                                                            | 17/999 [01:13<1:10:44,  4.32s/it]


KeyboardInterrupt: 

In [12]:
df_out.columns

Index(['headline', 'masked', 'tokenized', 'span_index', 'span', 'predicted',
       'new span', 'predicted_score'],
      dtype='object')

In [13]:
df_out['predicted_score'].mean()

0.04627715354510295

In [17]:
for i in np.arange(0, 5, 0.5):
    print(i)

0.0
0.5
1.0
1.5
2.0
2.5
3.0
3.5
4.0
4.5


Долго ждать - часа 4...

In [7]:
%%time
cross_tab = []

for i in np.arange(0, 10, 0.5):
    token_selector = CollocateTokenSelector('CM_SpaCy_truecased', thresh=i)
    humourizer = Humourizer(token_selector, token_replacer, verbose=True, score=True)
    df_out = humourizer.vandalize_headlines(headlines, return_pandas=True)
    cross_tab.append({'colloc_thresh':i, 'mean_colbert_score': df_out['predicted_score'].mean(),
                      'n': len(df_out['predicted'].notna()),
                       '0.5': df_out['predicted_score'].quantile(0.5),
                       '0.6': df_out['predicted_score'].quantile(0.6),
                       '0.7': df_out['predicted_score'].quantile(0.7),
                       '0.8': df_out['predicted_score'].quantile(0.8),
                       '0.9': df_out['predicted_score'].quantile(0.9)})

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 73/73 [00:52<00:00,  1.40it/s]


Scoring examples...


  0%|                                                                                                                                                                  | 0/73 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 73/73 [01:18<00:00,  1.08s/it]
  0%|                                                                                                                                                                  | 0/68 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 68/68 [01:04<00:00,  1.05it/s]


Scoring examples...


  0%|                                                                                                                                                                  | 0/69 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 69/69 [01:46<00:00,  1.54s/it]
  0%|                                                                                                                                                                  | 0/67 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [01:34<00:00,  1.41s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/68 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 68/68 [01:08<00:00,  1.01s/it]
  0%|                                                                                                                                                                  | 0/67 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [01:25<00:00,  1.28s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/68 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 68/68 [01:13<00:00,  1.08s/it]
  0%|                                                                                                                                                                  | 0/66 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 66/66 [01:26<00:00,  1.32s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/67 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [01:28<00:00,  1.32s/it]
  0%|                                                                                                                                                                  | 0/64 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [01:52<00:00,  1.76s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/65 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 65/65 [01:18<00:00,  1.20s/it]
  0%|                                                                                                                                                                  | 0/62 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62/62 [01:26<00:00,  1.39s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/63 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [01:20<00:00,  1.28s/it]
  0%|                                                                                                                                                                  | 0/60 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60/60 [01:19<00:00,  1.33s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/61 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 61/61 [01:10<00:00,  1.16s/it]
  0%|                                                                                                                                                                  | 0/58 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [01:52<00:00,  1.94s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/59 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 59/59 [00:58<00:00,  1.00it/s]
  0%|                                                                                                                                                                  | 0/55 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 55/55 [01:27<00:00,  1.59s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/56 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [01:23<00:00,  1.48s/it]
  0%|                                                                                                                                                                  | 0/52 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52/52 [01:49<00:00,  2.11s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/53 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 53/53 [00:58<00:00,  1.10s/it]
  0%|                                                                                                                                                                  | 0/48 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [01:54<00:00,  2.39s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/49 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49/49 [01:09<00:00,  1.42s/it]
  0%|                                                                                                                                                                  | 0/44 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [02:23<00:00,  3.25s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/45 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [01:01<00:00,  1.37s/it]
  0%|                                                                                                                                                                  | 0/39 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [02:03<00:00,  3.16s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/41 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [00:51<00:00,  1.26s/it]
  0%|                                                                                                                                                                  | 0/34 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 34/34 [01:29<00:00,  2.64s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/36 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:44<00:00,  1.24s/it]
  0%|                                                                                                                                                                  | 0/29 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [01:08<00:00,  2.35s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/31 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:43<00:00,  1.41s/it]
  0%|                                                                                                                                                                  | 0/23 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [01:58<00:00,  5.15s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/25 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:33<00:00,  1.34s/it]
  0%|                                                                                                                                                                  | 0/20 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [01:16<00:00,  3.84s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/21 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:42<00:00,  2.01s/it]
  0%|                                                                                                                                                                  | 0/16 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:42<00:00,  2.64s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/17 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:35<00:00,  2.07s/it]
  0%|                                                                                                                                                                  | 0/13 [00:00<?, ?it/s]

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [01:01<00:00,  4.73s/it]


Scoring examples...


  0%|                                                                                                                                                                  | 0/14 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:24<00:00,  1.75s/it]


Wall time: 2h 4min 19s


In [8]:
cross_tab

[{'colloc_thresh': 0.0,
  'mean_colbert_score': 0.034910054397423666,
  'n': 100,
  '0.5': 1.1549821465450805e-05,
  '0.6': 1.2704665823548567e-05,
  '0.7': 1.417527246303507e-05,
  '0.8': 2.3697155120316894e-05,
  '0.9': 0.0003856647119391761},
 {'colloc_thresh': 0.5,
  'mean_colbert_score': 0.034978800929095104,
  'n': 99,
  '0.5': 1.1536075362528209e-05,
  '0.6': 1.2787745254172476e-05,
  '0.7': 1.4262633885664399e-05,
  '0.8': 2.266867668367923e-05,
  '0.9': 0.00020716196158900865},
 {'colloc_thresh': 1.0,
  'mean_colbert_score': 0.034978800929095104,
  'n': 99,
  '0.5': 1.1536075362528209e-05,
  '0.6': 1.2787745254172476e-05,
  '0.7': 1.4262633885664399e-05,
  '0.8': 2.266867668367923e-05,
  '0.9': 0.00020716196158900865},
 {'colloc_thresh': 1.5,
  'mean_colbert_score': 0.034978799495235365,
  'n': 99,
  '0.5': 1.1518969586177263e-05,
  '0.6': 1.2787745254172476e-05,
  '0.7': 1.4262633885664399e-05,
  '0.8': 2.266867668367923e-05,
  '0.9': 0.00020716196158900865},
 {'colloc_thresh

In [5]:
token_selector = CollocateTokenSelector('CM_SpaCy_truecased', colloc_thresh=-5)
token_replacer = BERTReplacer(bert_model_path='bert_masked_lm_full_model',
                             bert_tokenizer="bert-large-uncased-whole-word-masking",
                             verbose=True, k=3)
humourizer = Humourizer(token_selector, token_replacer, verbose=True, score=True)

In [6]:
df_out = humourizer.vandalize_headlines(headlines, return_pandas=True)

Making predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:45<00:00,  1.41it/s]


Scoring examples...


  0%|                                                                                                                                                                  | 0/65 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 65/65 [01:27<00:00,  1.35s/it]


In [7]:
df_out["predicted_score"].mean()

0.03627962686207866