# Test datasets for BertNet

Yao Fu. University of Edinburgh<br />
yao.fu@ed.ac.uk<br />
Jun 2021

# 20 Newsgroups

In [51]:
from collections import Counter
from pathlib import Path

import numpy as np
import spacy
from sklearn.datasets import fetch_20newsgroups
from tqdm import tqdm
from transformers import BertTokenizer

# Use spaCy to split documents into sentences

In [3]:
# Load the small English spaCy pipeline used for sentence splitting below.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

In [4]:
# Show the (name, component) pairs of the loaded pipeline, in processing order.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fd6fd696e60>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fd6fd522f68>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fd6fd2eaad8>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fd6fd2eac10>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fd6fd560848>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fd6fd54b1c8>)]

In [5]:
# Append the rule-based sentence splitter to the pipeline.
# Guarded so the cell can be re-run: calling add_pipe with a component name
# that is already in the pipeline raises a ValueError.
if 'sentencizer' not in nlp.pipe_names:
    nlp.add_pipe('sentencizer')

# Display the component, whether it was just added or was already present.
nlp.get_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7fd6fd444ac8>

In [6]:
# Download (or load from the local cache) the 20 Newsgroups corpus with
# subset='all' and keep only the raw post texts.
newsgroups = fetch_20newsgroups(subset='all')
data = newsgroups.data

In [8]:
# Run every post through spaCy and keep the resulting Doc objects.
#
# Only sentence boundaries (doc.sents, produced by the sentencizer) are used
# by the rest of this notebook, so every other component is switched off.
# NOTE(review): the original disabled only tok2vec/parser/ner; in spaCy v3
# pipelines the tagger listens to tok2vec, so presumably the tagger,
# attribute_ruler and lemmatizer output was not meaningful anyway - confirm
# that nothing outside this notebook relies on tags or lemmas of these docs.
UNUSED_COMPONENTS = ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

docs = list(
    tqdm(
        nlp.pipe(data, disable=UNUSED_COMPONENTS),
        total=len(data),  # nlp.pipe is a generator; give tqdm the length so it can show progress/ETA
    )
)

18846it [04:51, 64.75it/s] 


In [85]:
# Sanity check: look at the third sentence of the second post.
sample_sents = list(docs[1].sents)
sample_sents[2]

 Does anyone have suggestions/ideas on:

  - Diamond Stealth Pro Local Bus

  - Orchid Farenheit 1280

  - ATI Graphics Ultra Pro

  - Any other high-performance VLB card


Please post or email.

In [19]:
# Load the pretrained BERT tokenizer used to tokenize the sentences.
BERT_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

In [20]:
# Show how the BERT tokenizer splits the third sentence of the second post.
sample_text = str(list(docs[1].sents)[2])
sample_ids = tokenizer(sample_text)['input_ids']
' '.join(tokenizer.convert_ids_to_tokens(sample_ids))

'[CLS] does anyone have suggestions / ideas on : - diamond stealth pro local bus - orchid fare ##nh ##eit 128 ##0 - at ##i graphics ultra pro - any other high - performance v ##lb card please post or email . [SEP]'

In [25]:
# Tokenize every sentence of every document into BERT input ids.
# The result is one flat list with an id list per sentence.
tokenized_sentences = [
    tokenizer(str(sent))['input_ids']
    for parsed in tqdm(docs)
    for sent in parsed.sents
]

100%|██████████| 18846/18846 [03:03<00:00, 102.45it/s]


In [107]:
# Inspect the tokens of the very first tokenized sentence.
first_tokens = tokenizer.convert_ids_to_tokens(tokenized_sentences[0])
' '.join(first_tokens)

'[CLS] from : mama ##tha devin ##eni rat ##nam < mr ##47 + @ andrew . cm ##u . ed ##u > subject : pens fans reactions organization : post office , carnegie mellon , pittsburgh , pa lines : 12 n ##nt ##p - posting - host : po ##4 . andrew . cm ##u . ed ##u i am sure some bash ##ers of pens fans are pretty confused about the lack of any kind of posts about the recent pens massacre of the devils . [SEP]'

In [70]:
# Decode one sentence back to text, dropping its first and last id
# (the [CLS] / [SEP] markers seen in the token listings above).
picked_ids = tokenized_sentences[1000]
tokenizer.decode(picked_ids[1:-1])

"then you can use asynch sndplay's all you want."

# Tokenized Sentences Statistics

In [38]:
# Total number of tokenized sentences collected from all documents.
len(tokenized_sentences)

311862

## Sentence Length

In [52]:
# Length (number of token ids) of every tokenized sentence, as a NumPy array.
lens = np.array([len(sent_ids) for sent_ids in tokenized_sentences])

In [65]:
# Total token count: all sentences vs. only sentences shorter than 50 ids.
is_short = lens < 50
lens.sum(), lens[is_short].sum()

(11004425, 5547067)

In [64]:
# Sentence count: all sentences vs. only sentences shorter than 50 ids.
is_short = lens < 50
len(lens), is_short.sum()

(311862, 265640)

## Effective Vocabulary

In [108]:
# Count how often each token id occurs across all sentences; the number of
# distinct ids is the effective vocabulary size of the corpus.
vocab = Counter(tok_id for sent_ids in tokenized_sentences for tok_id in sent_ids)
print(len(vocab))

25807


# Store sentences in a file

In [102]:
# Write the short sentences to a plain-text file, one cleaned sentence per line.

# Keep sentences with strictly fewer than MAX_SENT_LEN and strictly more than
# MIN_SENT_LEN token ids (the ids include the leading [CLS] and trailing [SEP]).
MAX_SENT_LEN = 50
MIN_SENT_LEN = 2
# Characters removed from the decoded text.
STRIP_CHARS = str.maketrans('', '', '*><-|^')

out_path = Path('../data/news/20news.txt')
# open(..., 'w') fails if the target directory does not exist yet.
out_path.parent.mkdir(parents=True, exist_ok=True)

with open(out_path, 'w', encoding='utf-8') as fd:
    for s in tqdm(tokenized_sentences):
        if not (MIN_SENT_LEN < len(s) < MAX_SENT_LEN):
            continue
        # Drop the first/last id ([CLS]/[SEP]) before decoding back to text.
        text = tokenizer.decode(s[1:-1]).translate(STRIP_CHARS)
        # Collapse the whitespace left behind by the removed characters.
        text = ' '.join(text.split())
        if not text:
            # A sentence made only of stripped characters (e.g. "-----")
            # would otherwise be written as a blank line.
            continue
        fd.write(text + '\n')

100%|██████████| 311862/311862 [00:19<00:00, 15684.28it/s]


In [105]:
# `lines` is not defined by any cell visible in this notebook, so this cell
# only worked thanks to leftover kernel state. Reload the sentences from the
# file written above so the cell survives Restart & Run All.
# NOTE(review): assumes `lines` held the contents of ../data/news/20news.txt,
# one sentence per line - confirm.
with open('../data/news/20news.txt', encoding='utf-8') as fd:
    lines = fd.read().splitlines()

# Tokenize the first two stored sentences as a batch.
tokenizer(lines[:2])

{'input_ids': [[101, 2941, 1010, 1045, 2572, 2978, 14909, 2205, 1998, 1037, 2978, 7653, 1012, 102], [101, 2174, 1010, 1045, 2572, 2183, 2000, 2404, 2019, 2203, 2000, 2512, 6278, 2545, 1005, 4335, 2007, 1037, 2978, 1997, 8489, 2005, 1996, 25636, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [46]:
# Histogram of sentence lengths: length -> number of sentences with that length.
lens_cnt = Counter()
lens_cnt.update(lens)

In [18]:
# Tokenize the first five raw posts as one padded PyTorch batch and show the
# shape of the resulting id tensor.
raw_batch = tokenizer(data[0:5], padding=True, return_tensors='pt')
raw_batch['input_ids'].size()

torch.Size([5, 1007])