In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSVs loaded below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def print_metrics(y_test, predictions):
  """Print accuracy, precision, recall and F1 for a set of predictions.

  Args:
    y_test: Ground-truth labels.
    predictions: Predicted labels, aligned with `y_test`.

  Both arguments are passed unchanged to the sklearn.metrics scorers, which
  are called with their default settings. Nothing is returned.
  """
  report_lines = (
      ('Accuracy score: {}', accuracy_score),
      ('Precision score: {}', precision_score),
      ('Recall score: {}', recall_score),
      ('F1 score: {}', f1_score),
  )
  # Score and print one metric at a time, in the order listed above.
  for template, scorer in report_lines:
    print(template.format(scorer(y_test, predictions)))

## Load Data

In [None]:
# Each CSV carries a header row (header=0). Both files are read from the mounted shared drive.
pollutor_tweets_df = pd.read_csv("/content/drive/Shareddrives/CSCI 5523 Group Project/Data/content_polluters_tweets_filtered.csv", header = 0)
legitimate_users_df = pd.read_csv("/content/drive/Shareddrives/CSCI 5523 Group Project/Data/legitimate_users_tweets_filtered.csv", header = 0)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported replacement. Row order (legitimate users first, then polluters) and the
# per-file index labels are kept exactly as append produced them.
dataset = pd.concat([legitimate_users_df, pollutor_tweets_df])

In [None]:
def write_df_to_dir(df, dir):
  """Write every tweet in `df` to its own text file inside `dir`.

  Args:
    df: DataFrame with a "tweet_id" column (used as the file name) and a
      "tweet" column (the text written to the file).
    dir: Path of the target directory. It is not created here, so it must
      already exist. Each tweet goes to `<dir>/<tweet_id>.txt`, replacing
      any file of the same name.
  """
  # Iterate the two needed columns directly. The previous version built a full
  # row Series with df.iloc[i] twice per tweet, which is very slow on large frames.
  id_and_text = zip(df["tweet_id"], df["tweet"])
  for tweet_id, tweet in tqdm(id_and_text, total=df.shape[0]):
    # `with` closes the file on exit, so no explicit close() is needed. Plain "w"
    # is enough because the file is never read back. UTF-8 is set explicitly so
    # non-ASCII tweets do not depend on the platform's default encoding.
    with open(dir+"/"+str(tweet_id)+".txt", "w", encoding="utf-8") as f:
      f.write(tweet)

# write_df_to_dir(legitimate_users_df, "legit")

In [None]:
# Preview the first rows of the combined frame.
dataset.head()

Unnamed: 0,user_id,tweet_id,tweet,created_at,label
0,614,5873834688,I wish I had more free time. I'd LOVE to see you!,2009-11-19 18:16:40,0
1,614,5873809295,"Tonight, tomorrow. On the plane at 5 pm.",2009-11-19 18:15:42,0
2,614,5291252160,"I'm at Carlucci's in Salt Lake City, UT http:/...",2009-10-30 11:24:52,0
3,614,5205651441,@spam @JannetteDavid,2009-10-27 12:17:35,0
4,1038,5762418891,@dialupkid Mijn vriendin en ik hebben een geza...,2009-11-16 05:08:29,0


## Data Preprocessing

Drop rows with null values and rows whose tweet text is empty

In [None]:
# Discard rows with a missing value in any column, then rows whose tweet text is the
# empty string. Rebinding (rather than mutating in place) keeps the cell safe to re-run.
dataset = dataset.dropna()
has_text = dataset.tweet != ''
dataset = dataset[has_text]
dataset.describe()

Unnamed: 0,user_id,tweet_id,label
count,4560727.0,4560727.0,4560727.0
mean,58215010.0,7809418000.0,0.4713784
std,38294530.0,3871802000.0,0.4991802
min,614.0,5218033.0,0.0
25%,24745340.0,5585464000.0,0.0
50%,49414590.0,5936497000.0,0.0
75%,84344060.0,9117617000.0,1.0
max,173767000.0,20145990000.0,1.0


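Shuffle the dataset. The original plan was to consider only 20,000 random tweets as a precaution for performance; the row cap is currently commented out, so the full shuffled dataset is used. Hughdan will adjust this later.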

In [None]:
# Earlier variant that also capped the shuffled data at 100000*5 rows (disabled):
#dataset_sample = dataset.sample(frac=1, random_state=1).head(100000*5)
# frac=1 draws every row, i.e. a full shuffle; random_state=1 makes the order reproducible.
dataset_sample = dataset.sample(frac=1, random_state=1)
dataset_sample.head()

Unnamed: 0,user_id,tweet_id,tweet,created_at,label
842049,26056810,5693671561,Closet Doors (Eastmoreland Heights): Closet do...,2009-11-13 16:54:34,0
1503267,103892446,9516790689,"Tessa Virtue, Scott Moir win Canada's first ic...",2010-02-23 01:44:15,1
1246729,37526413,5914025144,"Build failure for libarchive (u=0, c=0, b=50, ...",2009-11-21 01:51:01,0
1925994,63472113,5759543363,msh ngantukkk,2009-11-16 01:21:12,0
536452,38032306,13863317186,New Video-Mentions San Diego-Today on Good Mor...,2010-05-12 12:46:42,1


Remove Punctuation

In [None]:
import re

# Keep only ASCII letters, digits and whitespace.
# The previous character class used the range `A-z`, which also covers the six
# characters between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), so that punctuation was
# silently kept. The raw string avoids the invalid-escape warning for `\s`.
NON_ALNUM_PATTERN = re.compile(r'[^a-zA-Z0-9\s]')

# assign() returns a new frame instead of writing into the sampled one.
dataset_sample = dataset_sample.assign(
    tweet=dataset_sample['tweet'].apply(lambda x: NON_ALNUM_PATTERN.sub('', x))
)
# Stripping can leave a tweet empty (e.g. one made only of punctuation); drop those rows.
dataset_sample = dataset_sample.dropna()
dataset_sample = dataset_sample[dataset_sample.tweet != '']
dataset_sample.describe()

Unnamed: 0,user_id,tweet_id,label
count,4559301.0,4559301.0,4559301.0
mean,58215440.0,7809652000.0,0.4714523
std,38296470.0,3871788000.0,0.4991844
min,614.0,5218033.0,0.0
25%,24741970.0,5585527000.0,0.0
50%,49414590.0,5936591000.0,0.0
75%,84348700.0,9118277000.0,1.0
max,173767000.0,20145990000.0,1.0


Convert to lowercase

In [None]:
# dataset_sample['tweet'] = dataset_sample['tweet'].apply(lambda x: x.lower())

# Create 2-grams

In [None]:
# words = []
# for ii in range(0,len(dataset_sample)):
#     words.append(str(dataset_sample.iloc[ii]['tweet']).split(" "))

# n_gram_all = []

# for word in words:
#     # get n-grams for the instance
#     n_gram = []
#     for i in range(len(word)-2+1):
#         n_gram.append("".join(word[i:i+2]))
#     n_gram_all.append(n_gram)
    
# n_gram_all[0][:10]


# Vectorizing with Hashing Vectorizer

In [None]:
# from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer

# # hash vectorizer instance
# #hvec = HashingVectorizer(lowercase=False, analyzer=lambda l:l, n_features=2**12, alternate_sign= False)

# # features matrix X
# #X = hvec.fit_transform(n_gram_all)

# # alternative
# hvec = HashingVectorizer(lowercase=False, n_features=2**13, alternate_sign= False, stop_words='english', norm = None, ngram_range=(1,2))
# X = hvec.fit_transform(dataset_sample['tweet'])

# print(X[0])

## Alternative tokenization using CountVectorizer()

In [None]:
# ## Count Vectorizer 

# cvec = CountVectorizer(ngram_range=(1,2), stop_words = 'english', max_features=2**12)

# # Fit the data and then return the matrix
# X = cvec.fit_transform(dataset_sample['tweet'])

## TF-IDF

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer(ngram_range=(1,1), stop_words='english', max_features=2**12)

# X = tfidf.fit_transform(dataset_sample['tweet'])

In [None]:
# tfidf.vocabulary_

## Split data into training and testing splits

In [None]:
# from sklearn.model_selection import train_test_split

# # test set size of 20% of the data and the random seed 1
# X_train, X_test, y_train, y_test = train_test_split(X.toarray(), dataset_sample['label'], test_size=0.2, random_state=1)

## Multinomial Naive Bayes Classifier

Baseline Implementation - Train/Test Split

In [None]:
# from sklearn.naive_bayes import GaussianNB, MultinomialNB
# naive_bayes = MultinomialNB()
# naive_bayes.fit(X_train, y_train)

In [None]:
# predictions = naive_bayes.predict(X_test)

In [None]:
# print_metrics(y_test, predictions)

Baseline Implementation - 5-fold CV

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.svm import SVC

Optimization

In [None]:
# parameters = {'kernel':('linear',), 'C':[1]}
# svc = SVC()
# clf = GridSearchCV(svc, parameters)
# clf.fit(X_train, y_train)

## Decision Tree Classifier

Baseline Implementation

In [None]:
# from sklearn.tree import DecisionTreeClassifier
# tree = DecisionTreeClassifier(random_state=17)

Optimization

In [None]:
# tree_params = {'max_depth': range(4,11),
#                'max_features': range(6,19)}

# tree_grid = GridSearchCV(tree, tree_params, cv=5, n_jobs=-1, verbose=True)

# tree_grid.fit(X_train, y_train)

# print(tree_grid.best_params_, tree_grid.best_score_)
# print_metrics(y_test, pred)

## kNN Classifier

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# #knn = KNeighborsClassifier(n_neighbors=5)frac=1

## Finding optimal neighbors with 5-fold CV

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler

# knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_jobs=-1))])

# knn_params = {'knn__n_neighbors': range(1, 2)}

# knn_grid = GridSearchCV(knn_pipe, knn_params,
#                         cv=5, n_jobs=-1, verbose=True)

# knn_grid.fit(X_train, y_train)

# knn_grid.best_params_, knn_grid.best_score_

In [None]:
# pred = knn_grid.predict(X_test)
# print_metrics(y_test, pred)

# RNN

In [None]:
import tensorflow as tf


In [None]:
# Pair each tweet text with its label as one (text, label) dataset element.
tfd = tf.data.Dataset.from_tensor_slices((dataset_sample["tweet"], dataset_sample["label"]))
# On the earlier shuffle attempts: Dataset.shuffle returns a new dataset rather than
# reordering in place, so a bare `tfd.shuffle(...)` has no effect unless its result is
# assigned. No tf.data shuffle is applied here; the rows were already shuffled when
# dataset_sample was drawn from the DataFrame.

BUFFER_SIZE = 10000  # shuffle buffer size; not used by any active line in this cell
BATCH_SIZE = 512
# NOTE(review): the fit cell's recorded output is a ResourceExhaustedError in epoch 1;
# a smaller BATCH_SIZE may be required — confirm on the target hardware.

# The first 80% of the elements form the training set, the remainder the test set.
train_size = int(len(tfd)*.8)

train_dataset = tfd.take(train_size)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

test_dataset = tfd.skip(train_size)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# for example, label in train_dataset.take(1):
#   print('text: ', example.numpy())
#   print('label: ', label.numpy())


In [None]:
# VOCAB_SIZE = 2**10
# encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
#     max_tokens=VOCAB_SIZE)
# encoder.adapt(tfd.map(lambda text, label: text))

In [None]:
# vocab = np.array(encoder.get_vocabulary())
# vocab[:20]

In [None]:
# model = tf.keras.Sequential([
#     encoder,
#     tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 64, mask_zero=True),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dropout(0.5),
#     tf.keras.layers.Dense(1, activation='sigmoid')
# ])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Path under which the best model seen so far is checkpointed.
filepath = "model"

# Validation metric watched by both callbacks. The BERT classifier is compiled with
# tf.metrics.BinaryAccuracy(), which Keras logs as "binary_accuracy" and, on the
# validation data, "val_binary_accuracy". Monitoring "val_accuracy" matched no logged
# metric, so early stopping and best-only checkpointing never took effect.
# If a model is compiled with the string metric 'accuracy' instead (as in the
# commented-out RNN cell), set this back to 'val_accuracy'.
MONITORED_METRIC = 'val_binary_accuracy'

# NOTE(review): patience=10 exceeds the 5 epochs used for BERT fine-tuning, so early
# stopping cannot trigger in that run — confirm this is intended.
earlystopper = EarlyStopping(monitor=MONITORED_METRIC, mode='max', patience=10, verbose=1)

# Keep only the checkpoint with the highest monitored value (mode='max').
checkpoint = ModelCheckpoint(filepath, monitor=MONITORED_METRIC, verbose=1,
                             save_best_only=True, mode='max')

callbacks_list = [earlystopper, checkpoint]


In [None]:
# model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#               optimizer=tf.keras.optimizers.Adam(1e-4),
#               metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

In [None]:
# history = model.fit(train_dataset, epochs=50,
#                     validation_data=test_dataset,
#                     validation_steps=30, callbacks=callbacks_list)

In [None]:
# tf.keras.models.save_model(model, "model.h5")

In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
!pip install tensorflow_text
import tensorflow_text as text
!pip install -q tf-models-official
from official.nlp import optimization  # to create AdamW optmizer

import matplotlib.pyplot as plt
tf.get_logger().setLevel('ERROR')



In [None]:
# TF-Hub model to fine-tune; it must be a key of both lookup tables built below.
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'

# The 24 "small BERT" variants follow a regular naming scheme: L (layers) in 2..12,
# H (hidden size) in {128, 256, 512, 768}, and A (attention heads) equal to H / 64.
_small_bert_names = [
    f'small_bert/bert_en_uncased_L-{num_layers}_H-{hidden_size}_A-{hidden_size // 64}'
    for num_layers in (2, 4, 6, 8, 10, 12)
    for hidden_size in (128, 256, 512, 768)
]

# Preprocessing handle shared by every entry that uses the uncased English vocabulary.
_uncased_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

# Model name -> TF-Hub handle of the encoder.
map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    # Every small-BERT handle is the model name under the tensorflow publisher, version 1.
    **{name: f'https://tfhub.dev/tensorflow/{name}/1' for name in _small_bert_names},
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

# Model name -> TF-Hub handle of the matching text-preprocessing model.
map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12': _uncased_preprocess,
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    **dict.fromkeys(_small_bert_names, _uncased_preprocess),
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    **dict.fromkeys(
        ('electra_small', 'electra_base', 'experts_pubmed',
         'experts_wiki_books', 'talking-heads_base'),
        _uncased_preprocess),
}

# Resolve the handles for the selected model; a KeyError here means the name is
# missing from one of the tables.
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [None]:
# Wrap the selected TF-Hub preprocessing handle as a Keras layer.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [None]:
# Smoke-test the preprocessing layer on one sentence and inspect what it returns.
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

word_ids = text_preprocessed["input_word_ids"]
input_mask = text_preprocessed["input_mask"]
type_ids = text_preprocessed["input_type_ids"]

# Only the first 12 positions of the single example are shown.
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {word_ids.shape}')
print(f'Word Ids   : {word_ids[0, :12]}')
print(f'Input Mask : {input_mask[0, :12]}')
print(f'Type Ids   : {type_ids[0, :12]}')

Keys       : ['input_word_ids', 'input_mask', 'input_type_ids']
Shape      : (1, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
# Wrap the selected TF-Hub encoder handle as a Keras layer.
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
# Run the encoder on the preprocessed sample sentence and inspect its two outputs.
bert_results = bert_model(text_preprocessed)

pooled_output = bert_results["pooled_output"]
sequence_output = bert_results["sequence_output"]

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{pooled_output.shape}')
print(f'Pooled Outputs Values:{pooled_output[0, :12]}')
print(f'Sequence Outputs Shape:{sequence_output.shape}')
print(f'Sequence Outputs Values:{sequence_output[0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.76262873  0.9928097  -0.18611881  0.36673862  0.15233737  0.6550447
  0.9681153  -0.9486272   0.00216161 -0.9877732   0.06842697 -0.9763058 ]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[-0.28946346  0.34321272  0.33231515 ...  0.21300808  0.71020764
  -0.05771098]
 [-0.28742087  0.31980985 -0.23018607 ...  0.58455014 -0.21329728
   0.72692114]
 [-0.6615697   0.6887685  -0.87432986 ...  0.10877222 -0.2617324
   0.4785539 ]
 ...
 [-0.22561133 -0.2892562  -0.07064445 ...  0.47565985  0.83277136
   0.40025374]
 [-0.29824236 -0.27473187 -0.05450562 ...  0.48849723  1.0955356
   0.1816333 ]
 [-0.4437813   0.00930756  0.07223685 ...  0.17290069  1.1833248
   0.07897997]]


In [None]:
def build_classifier_model():
  """Assemble the BERT-based binary classifier as a Keras functional model.

  Raw strings go through the TF-Hub preprocessing layer and the trainable TF-Hub
  encoder. The encoder's `pooled_output` is then passed through dropout
  (rate 0.1) into a single sigmoid unit.

  Returns:
    A tf.keras.Model mapping the string input named 'text' to the output of
    the sigmoid-activated 'classifier' layer.
  """
  raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  to_encoder_inputs = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  bert_encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  pooled = bert_encoder(to_encoder_inputs(raw_text))['pooled_output']

  regularized = tf.keras.layers.Dropout(0.1)(pooled)
  probability = tf.keras.layers.Dense(1, activation="sigmoid", name='classifier')(regularized)
  return tf.keras.Model(raw_text, probability)

In [None]:
classifier_model = build_classifier_model()
# The classifier's final Dense layer already applies a sigmoid, so the model output is
# the probability itself. Wrapping it in tf.sigmoid again would squash it a second time
# and print a misleading value.
bert_raw_result = classifier_model(tf.constant(text_test))
print(bert_raw_result)

tf.Tensor([[0.69455534]], shape=(1, 1), dtype=float32)


In [None]:
# build_classifier_model() ends in a sigmoid-activated Dense layer, so the model emits
# probabilities, not logits. from_logits must therefore be False; with True the loss
# applied a second sigmoid to the already-squashed outputs.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
# All three metrics are created with their default settings.
metrics = [tf.metrics.BinaryAccuracy(), tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]

In [None]:
epochs = 5
init_lr = 3e-5

# Number of batches in one pass over the training dataset.
steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = steps_per_epoch * epochs
# 10% of all training steps are handed to the optimizer as warm-up steps.
num_warmup_steps = int(0.1*num_train_steps)

optimizer_settings = dict(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)
optimizer = optimization.create_optimizer(**optimizer_settings)

In [None]:
# Attach the optimizer, loss and metrics defined in the cells above.
compile_settings = dict(optimizer=optimizer, loss=loss, metrics=metrics)
classifier_model.compile(**compile_settings)

In [None]:
print(f'Training model with {tfhub_handle_encoder}')

# Train on the training batches and validate on the held-out batches after every epoch;
# the callbacks handle early stopping and checkpointing.
fit_settings = dict(
    x=train_dataset,
    validation_data=test_dataset,
    epochs=epochs,
    callbacks=callbacks_list,
)
history = classifier_model.fit(**fit_settings)

Training model with https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Epoch 1/5


ResourceExhaustedError: ignored

In [None]:
# Save the classifier to "bert.h5" without the optimizer state.
classifier_model.save("bert.h5", include_optimizer=False)


In [None]:
# classifier_model.evaluate(test_dataset, verbose=2)

In [None]:
# from sklearn.metrics import classification_report

# # ypreds=[]
# ypredsbool=np.array([])
# ytrue=np.array([])
# for i in range(len([item[0] for item in test_dataset.take(-1)])-20):
#   y_pred = classifier_model.predict([item[0] for item in test_dataset.take(-1)][i], batch_size=64, verbose=1)
#   y_pred_bool = np.argmax(y_pred, axis=1)

#   # ypreds=ypreds.append(y_pred)
#   ypredsbool = np.concatenate((ypredsbool, y_pred_bool))
#   ytrue = np.concatenate((ytrue, [item[1] for item in test_dataset.take()][i]))

# print(classification_report(ytrue, ypredsbool))

In [None]:
# # from sklearn.metrics import classification_report

# y_pred = classifier_model.predict(test_dataset, verbose=1)
# y_pred_bool = np.argmax(y_pred, axis=1)
# # y_pred_bool

# # ypreds=ypreds.append(y_pred)
# #   ypredsbool = np.concatenate((ypredsbool, y_pred_bool))
# ytrue = [item[1] for item in test_dataset.as_numpy_iterator()]
# ytrue = np.array([])
# for i in test_dataset.as_numpy_iterator():
#   # print(i[1].shape)
#   ytrue = np.concatenate((ytrue, i[1]))

# ytrue.shape

# print(classification_report(ytrue, ypredsbool))

In [None]:
# Number of elements (batches) yielded by test_dataset, counted without keeping them in a list.
sum(1 for _ in test_dataset.as_numpy_iterator())

In [None]:
# Shape of the label array of the first test batch.
# `i` was only ever bound by a commented-out loop, so on a fresh kernel the bare
# print raised NameError. Fetch one batch explicitly instead.
for i in test_dataset.take(1).as_numpy_iterator():
  print(i[1].shape)

In [None]:
# `ypredsbool` and `y_pred_bool` were only assigned in commented-out cells, so this
# cell failed with NameError on a fresh kernel. Rebuild both from the model.
y_pred = classifier_model.predict(test_dataset, verbose=1)
# The model emits one sigmoid probability per tweet, so threshold at 0.5 to get the
# predicted class. (np.argmax over that single column would always return 0.)
y_pred_bool = (y_pred.ravel() >= 0.5).astype(int)
ypredsbool = np.concatenate((np.array([]), y_pred_bool))
ypredsbool

In [None]:
# ytrue.shape
# Shape of `ypredsbool` — presumably the accumulated class predictions; verify it is defined before running this cell.
ypredsbool.shape

In [None]:
# Save the classifier again under the path "bert" (no file extension, optimizer state included by default).
classifier_model.save("bert")