In [None]:
# Install the keras_bert wrapper and fetch a pretrained BERT checkpoint archive.
# NOTE(review): the package version is not pinned, so a later re-run may install a different release.
!pip -q install keras_bert
!wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
# -o overwrites files left by a previous extraction without prompting.
!unzip -o uncased_L-12_H-768_A-12.zip

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Map every file name found under /kaggle/input to its full path.
# If two directories contain a file with the same name, the one walked last wins.
path_dict = {
    name: os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
}

# Any results you write to the current directory are saved as output.
path_dict

In [None]:
# Read the train/test CSV tables through the file-name -> path lookup.
train_df = pd.read_csv(path_dict['train.csv'])
test_df = pd.read_csv(path_dict['test.csv'])
# Show both shapes and the training columns as a quick sanity check.
train_df.shape, test_df.shape, train_df.columns

In [None]:
# Keep an untouched copy of the test rows for the final submission file.
pred_df = test_df.copy()


def _build_model_text(df):
    """Return the model input text: the review title three times, then the review body.

    A missing title is treated as an empty string, so the row keeps its review
    text instead of turning the whole 'text' value into NaN (NaN texts are
    dropped by the later groupby and therefore never trained on or predicted).
    The 'Review Title' column itself is left unmodified, which keeps this cell
    idempotent when it is re-run.

    NOTE(review): the title is presumably repeated to up-weight it relative to
    the body; a missing 'Review Text' still yields NaN, exactly as before.
    """
    title = df['Review Title'].fillna('') + ' '
    return 3 * title + df['Review Text']


train_df['text'] = _build_model_text(train_df)
test_df['text'] = _build_model_text(test_df)

In [None]:
# Collapse rows sharing the same text into one row whose label is every topic
# seen for that text, joined with '|'.
train_multi_label = (
    train_df
    .groupby('text')['topic']
    .apply('|'.join)
    .reset_index()
)
n_label_combos = train_multi_label['topic'].nunique()
print('Total combinations of multi-labels in the train using the 21 classes:',
      n_label_combos)

In [None]:
# One row per distinct test text, together with the number of test reviews
# (non-null 'Review Text' entries) that share it.
test_multi_label = (
    test_df
    .groupby('text')['Review Text']
    .count()
    .rename('review_count')
    .reset_index()
)

In [None]:
# Attach each text's duplicate count back onto the individual test rows.
# NOTE(review): mdf is not referenced again in the visible notebook — looks exploratory.
mdf = test_df.merge(test_multi_label[['text', 'review_count']], how='left', on = 'text')
mdf.shape

In [None]:
# Number of distinct texts in train and in test after de-duplication.
train_multi_label.shape, test_multi_label.shape

In [None]:
import re

def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the new string.

    The substitutions run in a fixed order: the irregular forms ("won't",
    "can't") come first so the generic "n't" rule does not rewrite them as
    "wo not" / "ca not".
    """
    substitutions = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in substitutions:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

from tqdm import tqdm, tqdm_notebook
# Hook tqdm into pandas; the later .progress_apply(...) calls rely on this.
# NOTE(review): this instantiates a notebook progress bar just to call .pandas() on it — confirm
# against the installed tqdm version whether a class-level registration is preferred.
tqdm_notebook().pandas()

_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop-word set, reading the corpus only on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        from nltk.corpus import stopwords
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalise one review string for tokenisation.

    Steps: expand contractions, blank out literal escape sequences, replace
    every run of characters other than letters, digits and ':' with a single
    space, lower-case, and drop English stop words.

    The stop-word set is loaded once and reused; previously it was re-read and
    rebuilt for every row processed by ``progress_apply``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Lower-cased, space-separated tokens with surrounding whitespace removed.
    """
    stop_words = _english_stopwords()

    sent = decontracted(text)
    # These target the literal two-character sequences backslash+r, backslash+" and
    # backslash+n, not real control characters.
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9:]+', ' ', sent)
    # https://gist.github.com/sebleier/554280

    words = [word.lower() for word in sent.split(' ')]
    return ' '.join(word for word in words if word not in stop_words).strip()

# Clean the de-duplicated train texts in place. The cleaned test texts go into a
# separate Series, so test_multi_label['text'] keeps the raw text that is later
# used as the merge key against test_df.
train_multi_label['text'] = train_multi_label['text'].progress_apply(preprocess_text)
test_ml_text = test_multi_label['text'].progress_apply(preprocess_text)

In [None]:
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer

# Features: a one-column frame holding the cleaned text.
X_train_val = train_multi_label[['text']].copy()

# Targets: split each '|'-joined topic string back into a list and binarise it
# into one indicator column per distinct topic.
lb = MultiLabelBinarizer()
topic_lists = train_multi_label['topic'].str.split('|')
y_train_val = lb.fit_transform(topic_lists)
num_classes = len(lb.classes_)
num_classes, X_train_val.shape, y_train_val.shape

In [None]:
# Column groups by kind; only the free-text column is populated here.
num_feat, cat_feat, text_feat = [], [], ['text']

features = [*num_feat, *cat_feat, *text_feat]
label = 'topic'

In [None]:
import codecs
from keras_bert import load_trained_model_from_checkpoint

# Locations of the files inside the extracted checkpoint directory.
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path, checkpoint_path, vocab_path = (
    os.path.join(pretrained_path, leaf)
    for leaf in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

# Build the token -> id mapping: each stripped line is assigned the size the
# dictionary has at the moment the line is read.
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as reader:
    for raw_line in reader:
        token_dict[raw_line.strip()] = len(token_dict)

In [None]:
# from sklearn.model_selection import train_test_split
# df_train, df_val, = train_test_split(train_df, stratify = train_df[label],
#                                     test_size = 0.1, random_state = 2019)
# df_train.shape, df_val.shape

In [None]:
from tqdm import tqdm
from keras_bert import Tokenizer
from keras.utils import to_categorical

# keras_bert tokenizer built from the vocabulary mapping; load_data_from_df reads it as a global.
tokenizer = Tokenizer(token_dict)

def load_data_from_df(X,y, maxlen = 256):
    """Tokenise ``X['text']`` and return shuffled model inputs with matching labels.

    Fixes a label-misalignment bug: the token ids were shuffled but the
    original, unshuffled ``y`` was returned, so every input was paired with
    another row's target. Inputs and labels are now reordered with the same
    permutation.

    Parameters
    ----------
    X : object whose ``X['text'].tolist()`` yields the texts (e.g. a DataFrame)
    y : array-like with one label row per text, in the same order as ``X``
    maxlen : int
        Forwarded to ``tokenizer.encode`` as ``max_len``.

    Returns
    -------
    tuple
        ``([token_ids, segment_ids], labels)`` where ``segment_ids`` is an
        all-zero array shaped like ``token_ids`` and ``labels`` holds the rows
        of ``y`` in the same shuffled order as ``token_ids``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of rows.
    """
    # Uses the module-level ``tokenizer``; only the ids are kept, the segment
    # ids returned by encode() are replaced by zeros below.
    token_ids = np.array([tokenizer.encode(text, max_len=maxlen)[0]
                          for text in X['text'].tolist()])
    labels = np.asarray(y)
    if len(token_ids) != len(labels):
        raise ValueError('X and y must contain the same number of rows')

    # One permutation applied to both arrays keeps each text with its own label.
    order = np.random.permutation(len(token_ids))
    token_ids = token_ids[order]
    labels = labels[order]
    return [token_ids, np.zeros_like(token_ids)], labels

In [None]:
# Encode the full training set; the train/validation split cell is commented out, so nothing is held out.
Xtv, ytv = load_data_from_df(X_train_val, y_train_val)

In [None]:
# train_x, train_y = load_data_from_df(df_train)
# val_x, val_y = load_data_from_df(df_val)

# from sklearn.preprocessing import LabelEncoder,LabelBinarizer
# lb = LabelBinarizer()
# train_y = lb.fit_transform(train_y)
# val_y = lb.transform(val_y)
# num_classes = len(lb.classes_)

In [None]:
# Training hyper-parameters.
# NOTE(review): SEQ_LEN repeats the 256 used as maxlen when encoding the texts; keep them in sync.
SEQ_LEN = 256
BATCH_SIZE = 16
EPOCHS = 9
LR = 1e-4

# Load the pretrained checkpoint with trainable weights.
# NOTE(review): training=True presumably keeps the pre-training heads, which is what makes the
# 'NSP-Dense' layer available to the classification head — confirm against keras_bert docs.
model = load_trained_model_from_checkpoint(config_path,checkpoint_path, training=True,
                                           trainable=True,seq_len=SEQ_LEN,)

In [None]:
# @title Build Custom Model
import keras
from keras_bert import AdamWarmup, calc_train_steps

# Classification head: one sigmoid unit per topic on top of the 'NSP-Dense'
# layer output, fed by the first two model inputs (token ids and segment ids).
inputs = model.inputs[:2]
dense = model.get_layer('NSP-Dense').output
outputs = keras.layers.Dense(units=num_classes, activation='sigmoid')(dense)

# Size the warmup/decay schedule from the shared hyper-parameters instead of
# repeating the literals 16 and 9.
decay_steps, warmup_steps = calc_train_steps(y_train_val.shape[0],
                                             batch_size=BATCH_SIZE,
                                             epochs=EPOCHS,)

model = keras.models.Model(inputs, outputs)
# The targets are multi-hot vectors (several topics can be active at once) and
# the outputs are independent sigmoids, so each class is scored as its own
# binary problem. categorical_crossentropy / categorical_accuracy assume a
# single active class per sample and do not fit this setup.
model.compile(AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=LR),
              loss='binary_crossentropy',
              metrics=['binary_accuracy'],)

In [None]:
# @title Initialize Variables
import tensorflow as tf
import keras.backend as K

# Initialise only the variables TensorFlow reports as uninitialised; variables that
# already hold values are left untouched.
# NOTE(review): presumably this protects the weights loaded from the checkpoint while
# initialising the new Dense head and optimizer state — confirm.
sess = K.get_session()
# The reported names come back as byte strings, hence the decode.
uninitialized_variables = set([i.decode('ascii') for i in sess.run(tf.report_uninitialized_variables())])
init_op = tf.variables_initializer(
    # Compare on the part of v.name before ':' so it matches the reported names.
    [v for v in tf.global_variables() if v.name.split(':')[0] in uninitialized_variables]
)
sess.run(init_op)

In [None]:
# NOTE(review): this trains for 3 epochs, while the warmup/decay step counts passed to the
# optimizer were computed for 9 epochs — confirm the shorter run is intentional.
model.fit(Xtv, ytv, epochs = 3, batch_size=16)

In [None]:
# Encode the cleaned test texts: keep only the token ids from encode() and pair
# them with an all-zero segment array of the same shape.
tokenizer = Tokenizer(token_dict)
maxlen = 256
indices = np.array([
    tokenizer.encode(text, max_len=maxlen)[0]
    for text in test_ml_text.tolist()
])
x_test = [indices, np.zeros_like(indices)]

In [None]:
# Per-class scores for every distinct test text.
y_test_probas = model.predict(x_test)
# Threshold-based label sets (score > 0.1), joined with '|'.
y_test_labels = lb.inverse_transform(y_test_probas > 0.1)
test_predictions = ['|'.join(pr) for pr in y_test_labels]
# x_test is the two-element list [token_ids, segment_ids], so len(x_test) is
# always 2; compare the number of encoded texts instead.
print(len(x_test[0]), len(test_predictions))

In [None]:
# For each text keep the six highest-scoring classes; argsort is ascending, so
# the best class is the last element of each array.
top_6_predictions = [
    lb.classes_[np.argsort(proba)[-6:]]  # from back
    for proba in y_test_probas
]

In [None]:
# For a text that occurs `count` times in test, keep the last `count` entries of
# its top-6 array and join them with '|'.
test_row_preds = [
    '|'.join(prediction[-count:])
    for count, prediction in zip(test_multi_label['review_count'].values, top_6_predictions)
]

In [None]:
def get_preds(test_multi_label):
    """Explode the '|'-joined ``topics`` column into one row per (text, topic).

    The split columns are taken from the data instead of assuming exactly six
    (0..5): when no text carried six topics, ``melt`` raised a KeyError for the
    missing columns, and any column beyond the sixth was silently ignored.

    Parameters
    ----------
    test_multi_label : DataFrame
        Must contain 'text' and a string column 'topics'. It is not modified.

    Returns
    -------
    DataFrame
        Columns 'text', 'variable' (position of the topic within the joined
        string) and 'value' (the topic). Only rows whose value is one of
        ``lb.classes_`` are kept, which drops the padding added by the split.
    """
    topic_cols = test_multi_label['topics'].str.split('|', expand=True)
    wide = test_multi_label.drop(columns='topics').join(topic_cols)
    long = wide.melt(['text'], value_vars=list(topic_cols.columns))
    # Uses the module-level binarizer ``lb`` to recognise valid topic names.
    return long[long['value'].isin(lb.classes_)]

In [None]:
# Store the joined per-text predictions, then explode them into one row per (text, topic).
test_multi_label['topics'] = test_row_preds
res_df = get_preds(test_multi_label)
print(res_df.shape, res_df.columns)
res_df.head()

In [None]:
# Compare row counts: test rows vs exploded (text, topic) rows.
test_df.shape, res_df.shape

In [None]:
res_pred_df = test_df.merge(res_df[['text', 'value']], how='left', on = 'text')
res_pred_df.shape, res_pred_df.duplicated().sum()

In [None]:
res_pred_df = res_pred_df.drop_duplicates().reset_index(drop = 1)
res_pred_df.shape, test_df.shape

In [None]:
pred_df['topic'] = res_pred_df['value']
print(pred_df.shape, pred_df.columns)
pred_df['topic'].value_counts()

In [None]:
# Write the predictions to CSV and show a download link in the notebook.
filename = '6-amazon-reviews-mlbert.csv'
# Sanity check: both counts should match.
print(len(test_df), len(pred_df))
# index = None leaves the row index out of the file.
pred_df.to_csv(filename, index = None)
from IPython.display import FileLink
FileLink(filename)

In [None]:
# # res_pred = res_pred_df[~(res_pred_df[['text', 'value']].duplicated())]
# # res_pred.shape
# pred_df['topic'] = res_pred['value']

In [None]:
# res_pred_df = res_pred_df['text','values']'.drop_duplicates().reset_index(drop = 1)
# res_pred_df.shape, test_df.shape

In [None]:
# from sklearn.metrics import accuracy_score, f1_score

# print("Accuarcy:", accuracy_score(val_y, y_val_pred>0.5))
# print("F1:", f1_score(val_y, (y_val_pred>0.5).astype(int), average = 'weighted'))

In [None]:
# y_test_pred = model.predict(test_x)
# test_predictions = lb.inverse_transform(y_test_pred)

# # for i in range(0, len(test_df)):
# #     test_predictions.append(np.argmax(y_test_pred[i]))

In [None]:
# file = '3-amazon_bert_base.csv'
# pred_df['topic'] = test_predictions
# pred_df.to_csv(file, index = None)
# from IPython.display import FileLink
# FileLink(file)

In [None]:
# pred_val_y = model.predict([val_X], batch_size=1024, verbose=1)
# for thresh in np.arange(0.1, 0.501, 0.01):
#     thresh = np.round(thresh, 2)
#     print("F1 score at threshold {0} is {1}".format(thresh, f1_score(val_y,(pred_val_y>thresh).astype(int))))