In [2]:
import tensorflow_hub as hub
import tensorflow_text as text

In [3]:
preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

In [4]:
preprocessed_bert = hub.KerasLayer(preprocess_url)
sample_text = ['this is a great world', 'the world is a planet']
prebert_text = preprocessed_bert(sample_text)
prebert_text.keys()

dict_keys(['input_mask', 'input_word_ids', 'input_type_ids'])

In [7]:
bert_model = hub.KerasLayer(encoder_url)
output = bert_model(prebert_text)
output.keys()

dict_keys(['default', 'pooled_output', 'encoder_outputs', 'sequence_output'])

In [13]:
output['pooled_output']

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.9027212 , -0.307875  , -0.00927441, ...,  0.07802372,
        -0.6742198 ,  0.91931635],
       [-0.8037597 , -0.18468803,  0.5547034 , ...,  0.4293008 ,
        -0.5631156 ,  0.7972579 ]], dtype=float32)>

# BBC NEWS CLASSIFICATION

In [15]:
import pandas as pd 

In [17]:
data = pd.read_csv('bbc_text.csv')
data.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [21]:
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

pt = PorterStemmer()

def simple_tokenize(doc):
    token = word_tokenize(doc) 
    stemed = [d for d in token if not d in string.punctuation]
    return stemed

data['cl_text']  = data['text'].apply(lambda x: simple_tokenize(x))

In [22]:
data.head()

Unnamed: 0,category,text,cl_text
0,tech,tv future in the hands of viewers with home th...,"[tv, future, in, the, hands, of, viewers, with..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, alone, former, w..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, of, farrell, gamble, leicester,..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, in, fa, cup, premie..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, s, twelve, raids, box, office, ocean, ..."


In [29]:
from sklearn.preprocessing import LabelEncoder

train_doc = data['cl_text']
label_encode = LabelEncoder()
y_data = label_encode.fit_transform(data['category'])
y_data

array([4, 0, 3, ..., 1, 2, 3])

In [53]:
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_doc , y_data)
X_train[:10]

563     [saab, to, build, cadillacs, in, sweden, gener...
1721    [tory, leader, quits, legal, position, david, ...
722     [rochus, shocks, coria, in, auckland, top, see...
1702    [call, to, overhaul, uk, state, pension, the, ...
1068    [angry, williams, rejects, criticism, serena, ...
764     [blunkett, tells, of, love, and, pain, david, ...
420     [small, firms, hit, by, rising, costs, rising,...
1661    [drink, remark, acts, as, diversion, the, firs...
1008    [corry, backs, skipper, robinson, england, for...
107     [lit, idol, begins, search, for, author, the, ...
Name: cl_text, dtype: object

In [41]:
def get_bert_embedding(sample):
    
#   preprocing using the url preprocessed.... 
    preprocessor = hub.KerasLayer(preprocess_url)
    preprocessed_bertdoc = preprocessor(sample)
    
#   bert model ....
    bert_encoder = hub.KerasLayer(encoder_url)
    output_encode = bert_encoder(preprocessed_bertdoc)
    output = output_encode['pooled_output']
    return output
    

In [47]:
dc =  ['mango','orange' , 'goat']
e  = get_bert_embedding(dc)
e

<tf.Tensor: shape=(3, 768), dtype=float32, numpy=
array([[-0.7128861 , -0.15463905,  0.38401678, ...,  0.35278732,
        -0.5099134 ,  0.73474085],
       [-0.8363078 , -0.23830141,  0.3845348 , ...,  0.45564705,
        -0.6078618 ,  0.82788914],
       [-0.8897006 , -0.3659945 , -0.31427732, ..., -0.04487294,
        -0.6401185 ,  0.91602564]], dtype=float32)>

In [49]:
from sklearn.metrics import pairwise

pairwise.cosine_similarity([e[0]], [e[1]])

array([[0.9800086]], dtype=float32)

In [50]:
preprocessor = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

# BERT MODEL FOR BBC NEWS 

In [59]:
import tensorflow as tf

# bert model layer.....
bbc_input = tf.keras.layers.Input(shape=(), dtype=tf. , name='text')
pre_bert = preprocessor(bbc_input)
output = bert_encoder(pre_bert)

# neural network layer....
drop = tf.keras.layers.Dropout(0.1 , name='dropout')
# dense1 = tf.keras.layers.Dense(200, activation='relu' , name='dense1')
final = tf.keras.layers.Dense(5, activation='sigmoid', name='final_output')

# specifying chain of model.
a = drop(output['pooled_output'])
# b = dense1(a)
final_output = final(a)

# construct final output
bbc_bert_model = tf.keras.Model(inputs=[bbc_input] , outputs=[final_output])


In [60]:
bbc_bert_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_19 (KerasLayer)    {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                

In [64]:
# COMPILLING THE MODEL...

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimize = tf.keras.optimizers.Adam(learning_rate=0.001)
METRIC = ['accuracy']
bbc_bert_model.compile(optimizer = optimize, loss = loss, metrics=METRIC )

In [134]:
import numpy as np
x = []
y = []

x = [data for data in X_train]
y = [data for data in y_train]


In [135]:
# TRAINING THE MODEL..... 
bbc_bert_model.fit(x, y, epochs=2)

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {"<class \'str\'>"})'}), (<class 'list'> containing values of types {"<class 'numpy.int32'>"})