In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from transformers import BertTokenizer
from transformers import TFAutoModel

In [2]:
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [5]:
df = pd.read_json('News_Category_Dataset_v2.json', lines=True)

In [6]:
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [7]:
df = df[['headline', 'category']]

In [8]:
df.sample(5)

Unnamed: 0,headline,category
87394,How Technology Is Disrupting Work and Educatio...,EDUCATION
61425,New York City's Public Advocate Just Told Hedg...,BUSINESS
8627,Judge Denies Rick Gates' Request To Celebrate ...,POLITICS
108100,Happy Birthday To Us,WOMEN
152364,"As Pay Cards Replace Paychecks, Bank Fees Hurt...",BUSINESS


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  200853 non-null  object
 1   category  200853 non-null  object
dtypes: object(2)
memory usage: 3.1+ MB


In [10]:
len(max((df['headline'])))

69

In [11]:
seq_len = 70
num_samples = len(df)

Xids = np.zeros((num_samples, seq_len))
Xmask = np.zeros((num_samples, seq_len))

Xids.shape

(200853, 70)

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

for i, text in enumerate(df['headline']):
    tokens = tokenizer.encode_plus(text, max_length=seq_len, truncation=True, padding='max_length', add_special_tokens=True, 
                                   return_tensors='tf')
    Xids[i, :] = tokens['input_ids']
    Xmask[i, :] = tokens['attention_mask']

In [13]:
Xids

array([[  101.,  1247.,  8640., ...,     0.,     0.,     0.],
       [  101.,  3100.,  2159., ...,     0.,     0.,     0.],
       [  101.,  6016.,  4468., ...,     0.,     0.,     0.],
       ...,
       [  101.,  7783.,  3278., ...,     0.,     0.,     0.],
       [  101.,  2586.,  3842., ...,     0.,     0.,     0.],
       [  101., 14868.,  4115., ...,     0.,     0.,     0.]])

In [14]:
Xmask

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [15]:
df['category'].unique()

array(['CRIME', 'ENTERTAINMENT', 'WORLD NEWS', 'IMPACT', 'POLITICS',
       'WEIRD NEWS', 'BLACK VOICES', 'WOMEN', 'COMEDY', 'QUEER VOICES',
       'SPORTS', 'BUSINESS', 'TRAVEL', 'MEDIA', 'TECH', 'RELIGION',
       'SCIENCE', 'LATINO VOICES', 'EDUCATION', 'COLLEGE', 'PARENTS',
       'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE', 'HEALTHY LIVING',
       'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST', 'FIFTY', 'ARTS',
       'WELLNESS', 'PARENTING', 'HOME & LIVING', 'STYLE & BEAUTY',
       'DIVORCE', 'WEDDINGS', 'FOOD & DRINK', 'MONEY', 'ENVIRONMENT',
       'CULTURE & ARTS'], dtype=object)

In [16]:
arr = df['category'].values
arr

array(['CRIME', 'ENTERTAINMENT', 'ENTERTAINMENT', ..., 'SPORTS', 'SPORTS',
       'SPORTS'], dtype=object)

In [17]:
label_dict = {'CRIME': 0, 'ENTERTAINMENT': 1, 'WORLD NEWS': 2, 'IMPACT': 3, 'POLITICS': 4,
       'WEIRD NEWS': 5, 'BLACK VOICES': 6, 'WOMEN': 7, 'COMEDY': 8, 'QUEER VOICES':9,
       'SPORTS':10, 'BUSINESS': 11, 'TRAVEL': 12, 'MEDIA': 13, 'TECH': 14, 'RELIGION': 14,
       'SCIENCE': 15, 'LATINO VOICES': 16, 'EDUCATION': 17, 'COLLEGE': 18, 'PARENTS': 19,
       'ARTS & CULTURE': 20, 'STYLE': 21, 'GREEN': 22, 'TASTE': 23, 'HEALTHY LIVING': 24,
       'THE WORLDPOST': 25, 'GOOD NEWS': 26, 'WORLDPOST': 27, 'FIFTY': 28, 'ARTS': 29,
       'WELLNESS': 30, 'PARENTING': 31, 'HOME & LIVING': 32, 'STYLE & BEAUTY': 33,
       'DIVORCE': 34, 'WEDDINGS': 35, 'FOOD & DRINK': 36, 'MONEY': 37, 'ENVIRONMENT': 38,
       'CULTURE & ARTS': 39}

for i, val in enumerate(arr):
    arr[i] = int(label_dict[val])

In [18]:
arr = arr.astype('int32')

In [19]:
arr

array([ 0,  1,  1, ..., 10, 10, 10])

In [20]:
labels = np.zeros((num_samples, arr.max()+1))
labels.shape

(200853, 40)

In [21]:
labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
labels[np.arange(num_samples), arr] = 1

In [23]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))
dataset.take(1)

<TakeDataset shapes: ((70,), (70,), (40,)), types: (tf.float64, tf.float64, tf.float64)>

In [24]:
labels[0, :].shape

(40,)

In [25]:
def map_func(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks}, labels

In [26]:
dataset = dataset.map(map_func)

In [27]:
dataset.take(1)

<TakeDataset shapes: ({input_ids: (70,), attention_mask: (70,)}, (40,)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [28]:
batch_size=16

dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)

dataset.take(1)

<TakeDataset shapes: ({input_ids: (16, 70), attention_mask: (16, 70)}, (16, 40)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [29]:
split = 0.9
size = int((num_samples / batch_size) * split)

In [30]:
train_df = dataset.take(size)
val_df = dataset.skip(size)

del dataset

In [29]:
bert = TFAutoModel.from_pretrained('bert-base-cased')

bert.summary()

Some weights of the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [30]:
input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

embeddings = bert.bert(input_ids, attention_mask=mask)[1]

x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(512, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)
y = tf.keras.layers.Dense(arr.max()+1, activation='softmax', name='outputs')(x)

In [31]:
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)
#model.layers[2].trainable = False
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 70)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 70)]         0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          ((None, 70, 768), (N 108310272   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dense (Dense)                   (None, 1024)         787456      bert[0][1]            

In [32]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [33]:
model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [31]:
history = model.fit(
    train_df,
    validation_data=val_df, 
    epochs=2
)

Epoch 1/2
Epoch 2/2


In [35]:
model.save('news_model.h5')

---

In [4]:
#import tensorflow as tf

model = tf.keras.models.load_model('news_model.h5')

model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 70)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 70)]         0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          ((None, 70, 768), (N 108310272   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dense (Dense)                   (None, 1024)         787456      bert[0][1]            