## 1. Data preprocess

The MIND dataset is highly unbalanced, so we only pick the categories with around 1000 news to perform classification

Dataset download from: https://msnews.github.io/

In [9]:
import csv
with open ("data/MINDlarge_train/news.tsv", encoding="utf8")  as f:
    read_tsv = csv.reader(f, delimiter="\t")
    news = [[new[1],new[2],new[3],new[4]] for new in read_tsv]
categories = set([new[1] for new in news])       

In [10]:
categories = set([new[0] for new in news])     
categories

{'autos',
 'entertainment',
 'finance',
 'foodanddrink',
 'games',
 'health',
 'kids',
 'lifestyle',
 'middleeast',
 'movies',
 'music',
 'news',
 'northamerica',
 'sports',
 'travel',
 'tv',
 'video',
 'weather'}

In [11]:
news_classified = []
for i,cat in enumerate(categories):
    if(cat == "kids" or cat =="northamerica" or cat == "games" or cat =="middleeast"):
        continue
    news_classified.append([new for new in news if new[0] == cat]) 

In [12]:
for new in news_classified:
    print(new[0][0],":",len(new))

foodanddrink : 4418
video : 4569
music : 1263
tv : 1323
news : 30478
travel : 4955
lifestyle : 4570
weather : 4255
autos : 3071
sports : 32020
health : 2929
finance : 5916
movies : 815
entertainment : 837


In [22]:
news_classified

['news',
 'newspolitics',
 "Giuliani's Ukraine Team: In Search of Influence, Dirt and Money",
 "When Rudolph W. Giuliani set out to dredge up damaging information on President Trump's rivals in Ukraine, he turned to a native of the former Soviet republic with whom he already had a lucrative business relationship. Lev Parnas, a Ukrainian-American businessman with a trail of debts and lawsuits, had known Mr. Giuliani casually for years."]

In [8]:
# store the training data
with open('data/BERT_VEC/train.csv', 'w',encoding="utf8",newline='') as f: 
      
    # using csv.writer method from CSV package 
    write = csv.writer(f) 
    fields = ['category','title','summary']  
    write.writerow(fields) 
    for new in news_classified:
        write.writerows(new)

Here we load the preprocessed dataset

In [23]:
import csv
with open ("data/BERT_VEC/train.csv",'r',encoding="utf8") as f:
    read_csv = csv.reader(f, delimiter=',')
    next(read_csv)
    news_train = [new for new in read_csv]
categories = set([new[0] for new in news_train])  

In [24]:
news_train[:5]

[['foodanddrink',
  'recipes',
  "This Roasted Squash Panzanella Is the Perfect Way to Start This Year's Christmas Dinner",
  'Introducing the perfect way to balance out your sugar cookie obsession.'],
 ['foodanddrink',
  'newstrends',
  'Nashville restaurants: Ms. Cheap rounds up lunch deals for $10 or less',
  'Ms. Cheap rounds up Nashville restaurant lunch deals with good food, good prices   and, in most cases, free parking.'],
 ['foodanddrink',
  'recipes',
  '25 Last-Minute Ideas That Will Absolutely Save Your Holiday Dinner',
  'These last-minute ideas will make you excited for the (sometimes-stressful) holiday. The post 25 Last-Minute Ideas That Will Absolutely Save Your Holidays appeared first on Taste of Home.'],
 ['foodanddrink',
  'recipes',
  '20 Must-Try Gravy Recipes for Your Thanksgiving Feast',
  "Turkey just isn't the same without it."],
 ['foodanddrink',
  'recipes',
  'These Cranberry Sauce Recipes Are Perfect for Thanksgiving Dinner',
  "You'll never want the store-

In [None]:
# for Colab
#!pip install transformers

The script of using bert is from https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/

In [25]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
#% matplotlib inline

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [26]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
         output_hidden_states = True, # Whether the model returns all hidden-states.
        )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [27]:
# read the text
def preprocess(text_a,text_b):
    #text = "Here is the sentence I want embeddings for."
    marked_text = "[CLS] " + text_a + " [SEP]"+text_b
    
    # Tokenize our sentence with the BERT tokenizer.
    tokenized_text = tokenizer.tokenize(marked_text)
    if len(tokenized_text) >521:
        tokenized_text = tokenized_text[:512]
    
    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    
    # Mark each of the 22 tokens as belonging to sentence "1".
    segments_ids = [1] * len(tokenized_text)
    
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    
    #if len(tokens_tensor[0]) > 512:
    #    tokens_tensor[0] = tokens_tensor[0][:512]
    #    segments_tensors[0] = segments_tensors[0][:512]
    
    return tokens_tensor, segments_tensors

In [28]:
# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 

def get_sen_vec(tokens_tensor, segments_tensors):
    
    with torch.no_grad():
    
        outputs = model(tokens_tensor, segments_tensors)
    
        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[2]
        
    # `token_vecs` is a tensor with shape [22 x 768]
    token_vecs = hidden_states[-2][0]
    
    # Calculate the average of all 22 token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)
    
    return sentence_embedding.numpy()

In [29]:
import numpy as np
vectors = []
for i in range(len(news_train)):
    tokens, segments = preprocess(news_train[i][1],news_train[i][2])
    sen_vec = get_sen_vec(tokens, segments)
    #assert(sen_vec.shape[0] ==768 )
    vectors.append(sen_vec)

In [30]:
vectors = np.array(vectors)
vectors.shape

(101419, 768)

In [31]:
np.save("train.npy", np.array(vectors))