In [0]:
import os
import pandas as pd
import pprint
import tensorflow as tf
import numpy as np
from tensorflow import keras
from matplotlib import pyplot as plt

%matplotlib inline

In [0]:
# Show the installed TensorFlow version. The cells below rely on tf.Session and
# tf.contrib, which are TF 1.x APIs.
print(tf.__version__)

# Setting up configuration for TPU

In [0]:
use_tpu = True #@param {type:"boolean"}

# Fail fast when a TPU was requested but the runtime does not advertise one
# through the COLAB_TPU_ADDR environment variable.
if use_tpu:
    assert os.environ.get('COLAB_TPU_ADDR') is not None, 'Missing TPU; did you request a TPU in Notebook Settings?'

# gRPC endpoint of the TPU worker when one is available, otherwise an empty
# target string.
TF_MASTER = (
    'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])
    if 'COLAB_TPU_ADDR' in os.environ
    else ''
)

# Open a short-lived session only to report which devices are reachable.
with tf.Session(TF_MASTER) as session:
    print('List of devices:')
    pprint.pprint(session.list_devices())

# Mount Google Drive to access the data files stored there

In [0]:
# Colab-only: make the user's Google Drive available under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')


In [0]:
# Sanity check: list the Drive root to confirm the mount succeeded and the
# expected CSV files are visible. (`os` is already imported in the first cell.)
import os
print(os.listdir("/content/gdrive/My Drive/"))

In [0]:
# Directory on the mounted Drive that holds train_file.csv and test_file.csv.
data_path = "/content/gdrive/My Drive/"



# Getting training and test files

In [0]:
# Load the training split. Later cells read its "comment_text" column and the
# label columns 'toxic' through 'identity_hate'.
import pandas as pd
train_file = pd.read_csv(os.path.join(data_path, "train_file.csv"))

In [0]:
# Load the test split; it is processed with the same columns as the training
# split in the cells below.
test_file = pd.read_csv(os.path.join(data_path, "test_file.csv"))

In [0]:
def undersampling(train_file, n_samples=16225, random_state=1):
    """Undersample the rows in which every label is 0.

    Rows with at least one non-zero label are all kept. From the rows whose
    six labels are all 0, at most ``n_samples`` are drawn at random. Rows
    containing missing values are dropped from the combined result.

    Args:
        train_file: DataFrame containing the columns 'toxic', 'severe_toxic',
            'obscene', 'threat', 'insult' and 'identity_hate'.
        n_samples: Maximum number of all-zero rows to keep. Capped at the
            number of such rows available so small frames do not raise.
        random_state: Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns:
        A new DataFrame: the labelled rows first, followed by the sampled
        all-zero rows, with NaN-containing rows removed. The input frame is
        not modified.
    """
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat',
                     'insult', 'identity_hate']
    # One boolean mask splits the frame; its negation replaces the slower
    # isin()-based row matching, which also rejected duplicate index labels.
    is_all_zero = (train_file[label_columns] == 0).all(axis=1)
    zeroes = train_file[is_all_zero]
    no_zeroes = train_file[~is_all_zero]

    n_keep = min(n_samples, len(zeroes))
    zeroes_filtered = zeroes.sample(n=n_keep, random_state=random_state)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([no_zeroes, zeroes_filtered]).dropna()


In [0]:
# train_file = undersampling(train_file)


# Create output labels for each data set

In [0]:
def create_labels(data):
    """Return the label columns of ``data`` as a NumPy array.

    The labels are every column from 'toxic' through 'identity_hate'
    (inclusive), selected by label-based slicing, one row per input row.
    """
    label_frame = data.loc[:, 'toxic':'identity_hate']
    # Go through nested Python lists so NumPy infers the dtype from the values.
    return np.array(label_frame.values.tolist())

In [0]:
import numpy as np
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score

In [0]:
# Drop incomplete rows before extracting labels so that the label arrays stay
# aligned with the comment texts tokenized later. Rebinding (instead of
# inplace=True) keeps the step explicit.
train_file = train_file.dropna()
test_file = test_file.dropna()

# create_labels already returns NumPy arrays, so no further conversion is needed.
train_y = create_labels(train_file)
test_y = create_labels(test_file)

# Tokenize the contents in training and test sets


In [0]:
# Use the tf.keras preprocessing utilities through the `keras` name bound by
# `from tensorflow import keras` in the first cell, so the notebook does not
# mix the standalone Keras package with tf.keras.
Tokenizer = keras.preprocessing.text.Tokenizer

content_train = train_file["comment_text"]
content_test = test_file["comment_text"]

# Vocabulary size: only the `max_features` most frequent words are kept when
# converting texts to integer sequences.
max_features = 20000
tok = Tokenizer(num_words=max_features)

# The vocabulary is fitted on the training comments only; the test comments
# are encoded with that same vocabulary.
tok.fit_on_texts(list(content_train))
train_token = tok.texts_to_sequences(content_train)
test_token = tok.texts_to_sequences(content_test)

In [0]:
# Distribution of comment lengths measured in tokens (one count per training
# comment), shown in buckets of 10 tokens up to 440.
totalNumWords = [len(i) for i in train_token]
plt.hist(totalNumWords, bins=np.arange(0, 450, 10))
# The histogram shows tokens per comment, not the length of individual words.
plt.xlabel("Number of tokens per comment")
plt.ylabel("Number of comments")
plt.title("Comment length distribution (training set)")
plt.show()

# Pad or truncate every sequence to the same length

In [0]:
# Take pad_sequences from tf.keras (the `keras` name bound in the first cell)
# rather than from the standalone Keras package, matching the model code.
pad_sequences = keras.preprocessing.sequence.pad_sequences

# Fixed sequence length fed to the model. With pad_sequences' defaults, shorter
# sequences are zero-padded at the front and longer ones are truncated at the
# front.
max_len = 150
train_x = pad_sequences(train_token, maxlen=max_len)
test_x = pad_sequences(test_token, maxlen=max_len)


# Create LSTM model

In [0]:
def model_lstm(embedding_dim=200, lstm_units=75, dropout_rate=0.1, num_labels=6):
    """Build the (uncompiled) bidirectional-LSTM multi-label classifier.

    Architecture: Embedding -> Bidirectional LSTM (returning the full
    sequence) -> global max pooling -> Dropout -> Dense(50) -> Dropout ->
    LeakyReLU -> Dense(num_labels) with a sigmoid per label.

    The input length and vocabulary size are read from the notebook-level
    ``max_len`` and ``max_features``.

    Args:
        embedding_dim: Size of each word embedding vector.
        lstm_units: Number of units in each LSTM direction.
        dropout_rate: Rate used for the LSTM input/recurrent dropout and for
            both Dropout layers.
        num_labels: Number of independent sigmoid outputs.

    Returns:
        The constructed ``keras.Model``. Its layer summary is printed as a
        side effect.
    """
    inp = keras.Input(shape=(max_len, ))
    embedding = keras.layers.Embedding(max_features, embedding_dim)(inp)
    lstm = keras.layers.Bidirectional(
        keras.layers.LSTM(lstm_units, name="LSTM_1", return_sequences=True,
                          dropout=dropout_rate,
                          recurrent_dropout=dropout_rate))(embedding)
    # Collapse the time axis by keeping the strongest activation per feature.
    max_pool = keras.layers.GlobalMaxPooling1D()(lstm)
    dropout1 = keras.layers.Dropout(dropout_rate)(max_pool)
    dense = keras.layers.Dense(50, name="dense1")(dropout1)
    dropout2 = keras.layers.Dropout(dropout_rate)(dense)
    leaky = keras.layers.LeakyReLU(alpha=0.1)(dropout2)
    output = keras.layers.Dense(num_labels, activation=tf.nn.sigmoid, name="sigmoid")(leaky)
    model = keras.Model(inputs=inp, outputs=output)
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    model.summary()
    return model


  

In [0]:


# Build the network with its default settings; model_lstm() also prints the
# layer summary.
model = model_lstm()


# Compile the model

In [0]:
# Training configuration. binary_crossentropy is applied to the model's
# sigmoid outputs, treating each label as an independent yes/no decision.
learning_rate = 0.01
loss = "binary_crossentropy"
# NOTE(review): a TF-native optimizer (tf.train.*) is used instead of a Keras
# optimizer, presumably because the TPU conversion below requires it - confirm.
chosen_opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
# NOTE(review): "accuracy" here is element-wise over all labels; if most labels
# are 0 it will look high even for a weak model - rely on F1 / ROC AUC below.
model.compile(loss=loss, optimizer=chosen_opt, metrics=["accuracy"])


# Convert the Keras model to a TPU-compatible model

In [0]:
import tensorflow as tf


# TF_MASTER is '' when no TPU address was found during configuration. Only
# attempt the TPU conversion when there is an endpoint to resolve; otherwise
# keep training possible by using the plain Keras model under the same name.
if TF_MASTER:
    tpu_model = tf.contrib.tpu.keras_to_tpu_model(
        model,
        strategy=tf.contrib.tpu.TPUDistributionStrategy(
            tf.contrib.cluster_resolver.TPUClusterResolver(TF_MASTER)))
else:
    print('No TPU address configured; using the plain Keras model instead.')
    tpu_model = model

tpu_model.summary()

# Fit the model

In [0]:
def fit_model_sampled(model, x_train, y_train, num_batch_size,
                      num_epochs=5, patience=5, val_split=0.1,
                      stop_early=True, checkpoint_path="best.h5"):
    """Train ``model``, plot the learning curves and return the best checkpoint.

    Args:
        model: Compiled model exposing a Keras-style ``fit``.
        x_train: Training inputs.
        y_train: Training targets.
        num_batch_size: Batch size passed to ``fit``.
        num_epochs: Maximum number of training epochs.
        patience: Epochs without ``val_loss`` improvement tolerated before
            early stopping. With the defaults (patience == num_epochs) early
            stopping can never trigger; pass a smaller value to enable it.
        val_split: Fraction of the training data held out for validation.
        stop_early: Whether to add the EarlyStopping callback.
        checkpoint_path: File the best model (lowest ``val_loss``) is saved to
            and reloaded from.

    Returns:
        The model reloaded from ``checkpoint_path`` via
        ``tf.keras.models.load_model``, not the ``model`` object passed in.
    """
    print("\n-----FITTING THE MODEL-----")

    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='val_loss',
                                                    save_best_only=True)
    # Only real callbacks go in the list: the previous code put None in it
    # whenever early stopping was disabled.
    callbacks = []
    if stop_early:
        callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0,
                                                          patience=patience, verbose=2))
    callbacks.append(checkpoint)

    print("Fitting the model:")
    history_object = model.fit(x_train, y_train, epochs=num_epochs,
                               batch_size=num_batch_size,
                               validation_split=val_split, shuffle=True,
                               callbacks=callbacks)
    my_model = tf.keras.models.load_model(checkpoint_path)

    history = history_object.history
    # The accuracy metric is recorded as 'acc' by TF 1.x and as 'accuracy' by
    # newer releases; accept either.
    acc_key = 'acc' if 'acc' in history else 'accuracy'

    plt.figure(figsize=(30, 15))
    plt.plot(history['loss'], color='g', label='loss')
    plt.plot(history['val_loss'], color='r', label='val_loss')
    plt.plot(history[acc_key], color='y', label='acc')
    plt.plot(history['val_' + acc_key], color='black', label='val_acc')
    plt.legend()
    plt.grid()
    plt.show()
    return my_model

In [0]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
  


In [0]:
# Train with a batch size of 320. `fit` is the model reloaded from the best
# checkpoint on disk (see fit_model_sampled), not a training-history object.
fit = fit_model_sampled(tpu_model, train_x, train_y,320)

# Make predictions on the test set

In [0]:
# Work on independent copies of the first 63896 test rows.
# NOTE(review): the reason for the 63896 cut-off is not visible here -
# presumably a size the batch/TPU setup requires; confirm.
test_X = test_x[:63896].copy()
test_Y = test_y[:63896].copy()

# Raw sigmoid scores, one column per label.
predictions = fit.predict(test_X, batch_size = 2560)

# Hard 0/1 decisions at a 0.5 threshold, kept in the same dtype as the scores.
prediction_zeroes = (predictions >= 0.5).astype(predictions.dtype)

# Calculate the performance metrics

In [0]:
# Element-wise accuracy: the share of (comment, label) cells where the
# thresholded prediction equals the true label. np.sum / .size do this in one
# vectorized pass instead of the Python-level sum(sum(...)) over every row.
true_predictions = np.sum(test_Y == prediction_zeroes)
accuracy = true_predictions / test_Y.size

In [0]:
# F1 for each label column on the thresholded predictions, followed by the
# unweighted mean across labels.
f1_s = [
    f1_score(label_true, label_pred)
    for label_true, label_pred in zip(test_Y.T, prediction_zeroes.T)
]
average_f1 = sum(f1_s) / len(f1_s)

In [0]:
# Per-label ROC AUC. ROC AUC ranks examples by score, so it must be given the
# raw sigmoid outputs; feeding it the thresholded 0/1 predictions collapses the
# curve to a single operating point and understates the model.
aucs = [roc_auc_score(test_Y[:, i], predictions[:, i]) for i in range(predictions.shape[1])]
average_aucsy = sum(aucs) / len(aucs)

In [0]:
# Display the mean of the per-label ROC AUC values.
average_aucsy

In [0]:
# Inspect the thresholded 0/1 predictions.
prediction_zeroes

In [0]:
# Display the mean of the per-label F1 scores.
average_f1

In [0]:
# ROC AUC over all labels at once, computed on the raw prediction scores
# (roc_auc_score's default averaging across labels is 'macro').
# Note: this rebinds `aucs`, which previously held the per-label list.
aucs = roc_auc_score(test_Y, predictions)


In [0]:
# Display the overall ROC AUC computed in the previous cell.
aucs

In [0]:
import pickle
# Persist the raw (un-thresholded) prediction scores next to the input data.
# The location is built from data_path rather than repeating the Drive path as
# a literal; with the current data_path the resulting file path is unchanged.
with open(os.path.join(data_path, "predictions.p"), "wb") as output_file:
    pickle.dump(predictions, output_file)