In [0]:
!pip install tensorflow-gpu==2.0
!pip install bert-tensorflow
!pip install --upgrade bert
!pip install transformers

In [1]:
import tensorflow as tf
tf.gfile=tf.io.gfile
tf.test.gpu_device_name()

'/device:GPU:0'

In [4]:
from google.colab import files
files.upload()
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!kaggle datasets download -d uciml/sms-spam-collection-dataset
!unzip sms-spam-collection-dataset.zip

Saving kaggle.json to kaggle.json
Downloading sms-spam-collection-dataset.zip to /content
  0% 0.00/211k [00:00<?, ?B/s]
100% 211k/211k [00:00<00:00, 79.7MB/s]
Archive:  sms-spam-collection-dataset.zip
  inflating: spam.csv                


In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
nltk.download('punkt')
nltk.download('stopwords')
tokenizer = RegexpTokenizer(r'\w+')
tqdm.pandas()

MAX_LEN = 128
def make_clean(s) :
  for i in range(10) :
    s = s.replace(str(i), ' ')
  tokens = np.array(tokenizer.tokenize(s.lower()))
  tokens = tokens[~np.isin(tokens, stopwords.words())]
  return ' '.join(tokens)

df = pd.read_csv('spam.csv', encoding = "ISO-8859-1")
df['clean'] = df['v2'].progress_apply(make_clean)
df['label'] = (df['v1']=='ham').astype(int)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


HBox(children=(IntProgress(value=0, max=5572), HTML(value='')))




In [3]:
from transformers import BertTokenizer
btokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

MAX_LEN = 128
input_ids = []
attention_masks = []

for sent in tqdm(df['clean']):
    encoded_dict = btokenizer.encode_plus(
                        sent,
                        add_special_tokens = True,
                        max_length = MAX_LEN,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

df['input_ids'] = input_ids
df['attention_masks'] = attention_masks

HBox(children=(IntProgress(value=0, max=5572), HTML(value='')))




In [4]:
random_index = np.zeros(df.shape[0]).astype(bool)
random_index[np.random.choice(df.shape[0], int(0.2*df.shape[0]))] = True
train_df, test_df = df[~random_index], df[random_index]
train_df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,clean,label,input_ids,attention_masks
0,ham,"Go until jurong point, crazy.. Available only ...",,,,go jurong point crazy available bugis great wo...,1,"[[tensor(101), tensor(2175), tensor(18414), te...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
1,ham,Ok lar... Joking wif u oni...,,,,ok lar joking wif,1,"[[tensor(101), tensor(7929), tensor(2474), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,free entry wkly comp win fa cup final tkts st ...,0,"[[tensor(101), tensor(2489), tensor(4443), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
3,ham,U dun say so early hor... U c already then say...,,,,dun say early hor already say,1,"[[tensor(101), tensor(24654), tensor(2360), te...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,think goes usf lives around though,1,"[[tensor(101), tensor(2228), tensor(3632), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."


In [0]:
import numpy as np

# Create datasets (Only take up to MAX_LEN words)
train_ids = train_df['input_ids'].tolist()
train_ids = [t.tolist()[0:MAX_LEN] for t in train_ids]
train_ids = np.array(train_ids, dtype=int)[:, np.newaxis].reshape(train_df.shape[0],-1)
train_masks = train_df['attention_masks'].tolist()
train_masks = [t.tolist()[0:MAX_LEN] for t in train_masks]
train_masks = np.array(train_masks, dtype=int)[:, np.newaxis].reshape(train_df.shape[0],-1)
train_label = train_df['label'].tolist()

test_ids = test_df['input_ids'].tolist()
test_ids = [t.tolist()[0:MAX_LEN] for t in test_ids]
test_ids = np.array(test_ids, dtype=int)[:, np.newaxis].reshape(test_df.shape[0],-1)
test_masks = test_df['attention_masks'].tolist()
test_masks = [t.tolist()[0:MAX_LEN] for t in test_masks]
test_masks = np.array(test_masks, dtype=int)[:, np.newaxis].reshape(test_df.shape[0],-1)
test_label = test_df['label'].tolist()

In [0]:
from tensorflow.keras.models import Model
import tensorflow_hub as hub
import bert
from bert import tokenization
from bert.tokenization import FullTokenizer
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from transformers import *

In [13]:
input_word_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = layers.Input(shape=(MAX_LEN,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32,
                                    name="segment_ids")
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/1", trainable=True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
dense = layers.Dense(1024, activation='relu')(pooled_output)
# bert_layer = TFBertModel.from_pretrained("bert-base-uncased")
# sequence_output, pooled_output = bert_layer([input_word_ids, input_mask, segment_ids])
# dense = layers.Dense(1024, activation='relu')(sequence_output)
# rshp = layers.GlobalMaxPooling1D()(dense)
pred = layers.Dense(1, activation='sigmoid')(rshp)
model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=pred)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=2e-4), metrics=['accuracy'])
model.summary()

HBox(children=(IntProgress(value=0, description='Downloading', max=433, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=536063208, style=ProgressStyle(description_…


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 128, 768), ( 109482240   input_word_ids[0][0]             
                                                                 input_mask[0][0]          

In [0]:
from keras.callbacks import Callback
import numpy as np
import sklearn.metrics as sklm

class Metrics(Callback):
    def __init__(self, val_data, batch_size = 32) :
        super().__init__()
        self.validation_data = val_data
        self.batch_size = batch_size

    def on_train_begin(self, logs={}):
        self.loss = []
        self.precision = []
        self.recall = []
        self.f1s = []
        self.accuracy = []
        self.auc = []

    def on_epoch_end(self, epoch, logs={}):
        score = np.asarray(self.model.predict(self.validation_data[0]))
        predict = np.squeeze(score.round()).reshape(-1)
        targ = self.validation_data[1]
        self.loss.append(logs['val_loss'])
        self.auc.append(sklm.roc_auc_score(targ, score))
        self.precision.append(sklm.precision_score(targ, predict))
        self.recall.append(sklm.recall_score(targ, predict))
        self.f1s.append(sklm.f1_score(targ, predict))
        self.accuracy.append(sklm.accuracy_score(targ, predict))

        pd.DataFrame({
            'loss': self.loss,
            'precision': self.precision,
            'recall': self.recall,
            'f1s': self.f1s,
            'accuracy': self.accuracy,
            'auc': self.auc
        }).to_csv('recors.csv')

        return

In [15]:
def make_as_input(ids, masks) :
    return [
        ids.astype(np.int32),
        masks.astype(np.int32),
        np.zeros(masks.shape, dtype=np.int32)
    ]

batch_size = 4
val_data = (make_as_input(test_ids, test_masks), np.array(test_label))
metrics = Metrics(val_data=val_data, batch_size=batch_size)

model.fit(make_as_input(train_ids, train_masks),
          np.array(train_label),
          validation_data=val_data,
          epochs=50,
          callbacks=[metrics],
          batch_size=batch_size)

Train on 4557 samples, validate on 1015 samples
Epoch 1/50








Epoch 2/50
Epoch 3/50
 104/4557 [..............................] - ETA: 3:48 - loss: 0.3186 - accuracy: 0.9000

KeyboardInterrupt: ignored