In [None]:
import pandas as pd
import numpy as np
from sklearn import model_selection
import joblib
import os


%matplotlib inline

In [None]:
# Export the location of the service-account key file through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
_credentials_file = './ai-moderator-b18e81abdc4a.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = _credentials_file

#### Constants

In [None]:
# Seed passed as `random_state` to both dataset splits.
RANDOM_SEED = 42
# Not referenced anywhere in the visible part of the notebook.
DICT_SIZE = 10_000
# Number of rows held out for the test and dev sets respectively.
TEST_SIZE = 10_000
DEV_SIZE = 10_000
# Prefix prepended to the blob names of the model artifacts.
PATH = '/communities/default/versions/bert/'

# Switches: TRAIN gates the model cells, UPLOAD gates the upload step.
TRAIN = True
UPLOAD = True

## Preparations

TODO: Package `GoogleCloudStorageWrapper` as a pip-installable Python package.

In [None]:
import base64
import hashlib
from os import path

from google.api_core import exceptions as g_exceptions
from google.cloud import storage as g_storage


class GoogleCloudStorageWrapper:
    """Static helpers that move single files to and from a Google Cloud Storage
    bucket, skipping the transfer when the local file and the blob already have
    the same MD5 hash."""

    @staticmethod
    def md5_base64(filename, chunk_size=4096):
        """Returns the MD5 digest of a file, base64-encoded as text.

        The file is read `chunk_size` bytes at a time, so it never has to be
        held in memory as a whole. The result is what the methods below compare
        with a blob's `md5_hash`.

        :param filename: path of the local file to hash
        :param chunk_size: number of bytes read per iteration
        :return: base64 string of the MD5 digest
        """
        hash_md5 = hashlib.md5()
        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hash_md5.update(chunk)
        return base64.b64encode(hash_md5.digest()).decode('utf-8')

    @staticmethod
    def lazy_upload_blob(bucket_name, source_file_name, destination_blob_name):
        """Uploads a file to the bucket unless a blob with the same hash exists.

        :param bucket_name: name of the target bucket
        :param source_file_name: path of the local file to upload
        :param destination_blob_name: name of the blob inside the bucket
        :raises google.api_core.exceptions.DataLoss: if the blob is missing
            after the upload or its hash differs from the local file's hash
        """
        storage_client = g_storage.Client()
        bucket = storage_client.get_bucket(bucket_name)
        blob = bucket.blob(destination_blob_name)

        # get_blob() yields None when the blob does not exist yet.
        remote_blob = bucket.get_blob(destination_blob_name)

        local_md5 = GoogleCloudStorageWrapper.md5_base64(source_file_name)

        if remote_blob is not None:
            if remote_blob.md5_hash == local_md5:
                print(f'Blob `{destination_blob_name}` is '
                      f'already in bucket `{bucket_name}`')
                return

            print(f'Updating blob `{destination_blob_name}` in '
                  f'bucket `{bucket_name}` from `{source_file_name}`')

        blob.upload_from_filename(source_file_name)

        # check for integrity of uploaded file: re-read the blob metadata and
        # compare its hash with the hash of the local file
        uploaded_blob = bucket.get_blob(destination_blob_name)
        if uploaded_blob is None or uploaded_blob.md5_hash != local_md5:
            raise g_exceptions.DataLoss('Uploaded file differs from local')

        print(f'File `{source_file_name}` successfully uploaded '
              f'to `{destination_blob_name}` of bucket `{bucket_name}`')

    @staticmethod
    def lazy_download_blob(bucket_name, source_blob_name, destination_file_name):
        """Downloads a blob from the bucket if the local version of file differs
        from the remote version (calculated using md5 hash).

        :param bucket_name: name of the source bucket
        :param source_blob_name: name of the blob inside the bucket
        :param destination_file_name: local path the blob is written to
        :raises google.api_core.exceptions.NotFound: if the blob does not exist
        :raises google.api_core.exceptions.DataLoss: if the downloaded file's
            hash differs from the blob's hash
        """
        storage_client = g_storage.Client()
        bucket = storage_client.get_bucket(bucket_name)
        blob = bucket.blob(source_blob_name)
        remote_blob = bucket.get_blob(source_blob_name)

        # Fail with a descriptive error instead of an AttributeError on None.
        if remote_blob is None:
            raise g_exceptions.NotFound(
                f'Blob `{source_blob_name}` not found in bucket `{bucket_name}`')

        remote_md5 = remote_blob.md5_hash

        if path.exists(destination_file_name):
            local_md5 = GoogleCloudStorageWrapper.md5_base64(destination_file_name)
            if remote_md5 == local_md5:
                print(f'Blob {source_blob_name} is already downloaded to {destination_file_name}')
                return

        blob.download_to_filename(destination_file_name)

        # check for integrity of downloaded file
        downloaded_md5 = GoogleCloudStorageWrapper.md5_base64(destination_file_name)
        if remote_md5 != downloaded_md5:
            raise g_exceptions.DataLoss('Downloaded file differs from remote')

        print(f'Blob {source_blob_name} successfully downloaded to {destination_file_name}')


Create temporary folder for files

In [None]:
# Scratch directory for the downloaded dataset and model artifacts.
# `exist_ok=True` keeps the cell re-runnable without a separate existence
# check, and raises if the path exists but is not a directory.
os.makedirs('./temp-files/', exist_ok=True)

Download the dataset from Google Cloud Storage.

In [None]:
# Fetch the dataset; the wrapper skips the download when the local copy
# already has the same MD5 hash as the blob.
GoogleCloudStorageWrapper.lazy_download_blob(
    bucket_name='communities-models',
    source_blob_name='/data/toxic-comment/core.csv',
    destination_file_name='./temp-files/core.csv',
)

Load dataset

In [None]:
# Read the dataset and lower-case the comment text in one chain.
core = (
    pd.read_csv("./temp-files/core.csv")
    .assign(comment_text=lambda frame: frame['comment_text'].str.lower())
)
core.info()

In [None]:
# Character length of every comment, stored in a new `len` column.
comment_lengths = core['comment_text'].str.len()
core = core.assign(len=comment_lengths)
core.head(2)

In [None]:
# First carve out a holdout of TEST_SIZE + DEV_SIZE rows, then divide that
# holdout into dev and test; both splits are stratified on `base_class`.
holdout_size = TEST_SIZE + DEV_SIZE
train, holdout = model_selection.train_test_split(
    core,
    test_size=holdout_size,
    random_state=RANDOM_SEED,
    stratify=core['base_class'],
)
dev, test = model_selection.train_test_split(
    holdout,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=holdout['base_class'],
)
print(len(train), len(test), len(dev))
train.head(3)

## Juicy part

## Model

initialization

In [None]:
if TRAIN:
    # Template placeholder: nothing is initialized here yet. The body is only
    # executed when the TRAIN switch is on.
    pass

fit (and save)

In [None]:
# Template for training: the whole cell is skipped when TRAIN is False.
if TRAIN:
    # The flag becomes True only after the fitting step below has been passed.
    training_successfully_finished = False
    # fit your model here
    training_successfully_finished = True

    if training_successfully_finished:
        # Save your model on local drive here:
        # Example:
        # joblib.dump(tokenizer, './temp-files/tokenizer.joblib')
        # model.save("./temp-files/model.h5", overwrite=True)
        pass

        # Uploading is additionally gated by the UPLOAD switch.
        if UPLOAD:
            # Upload your model to cloud here:
            # Example:
            # GoogleCloudStorageWrapper.lazy_upload_blob(bucket_name='communities-models',
            #                                            source_file_name='./temp-files/tokenizer.joblib',
            #                                            destination_blob_name=PATH + 'tokenizer.joblib')
            # GoogleCloudStorageWrapper.lazy_upload_blob(bucket_name='communities-models',
            #                                            source_file_name='./temp-files/model.h5',
            #                                            destination_blob_name=PATH + 'model.h5')
            pass

## Model evaluation

In [None]:
# Fetch the tokenizer first and the model second; each blob lives under PATH
# and is stored locally under the same file name.
for artifact_name in ('tokenizer.joblib', 'model.h5'):
    GoogleCloudStorageWrapper.lazy_download_blob(
        bucket_name='communities-models',
        source_blob_name=PATH + artifact_name,
        destination_file_name='./temp-files/' + artifact_name,
    )

In [None]:
# NOTE(review): `models` is not imported anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel. Presumably it is
# `keras.models` — confirm and add the import to the first cell.
model = models.load_model("./temp-files/model.h5")
# joblib.load unpickles the file; only load artifacts from a trusted source.
tokenizer = joblib.load('./temp-files/tokenizer.joblib')

In [None]:
from sklearn import metrics

In [None]:
# NOTE(review): neither `sequence` nor `MESSAGE_LEN_CHAR` is defined in the
# visible notebook, so this cell fails on a fresh kernel. Presumably
# `keras.preprocessing.sequence` and a constant missing from the constants
# cell — confirm and define both.
# Converts the dev comments to sequences with the loaded tokenizer and passes
# them through pad_sequences with maxlen=MESSAGE_LEN_CHAR.
dev_ready = sequence.pad_sequences(
    tokenizer.texts_to_sequences(dev['comment_text']), maxlen=MESSAGE_LEN_CHAR
)

In [None]:
# Score the dev set, then drop the singleton dimensions of the model output.
raw_scores = model.predict(dev_ready, verbose=1)
predicted_probas = raw_scores.squeeze()

In [None]:
# Scores strictly above 0.5 become class 1, everything else class 0.
predicted = np.where(predicted_probas > 0.5, 1, 0)
real = dev['base_class']

##### Single number metrics

* Accuracy

In [None]:
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the documented order avoids copy-paste mistakes with
# metrics that are not symmetric.
acc = metrics.accuracy_score(real, predicted)
acc

* Balanced accuracy

In [None]:
# balanced_accuracy_score(y_true, y_pred) averages the recall obtained on each
# TRUE class, so it is not symmetric: the ground-truth labels must come first.
b_acc = metrics.balanced_accuracy_score(real, predicted)
b_acc

* ROC-AUC

In [None]:
# ROC-AUC is computed from the raw scores, not from the thresholded labels.
roc_auc = metrics.roc_auc_score(y_true=real, y_score=predicted_probas)
roc_auc

##### Confusion matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Divide every row of the count matrix by its row total, so that each row sums
# to 1 before it is drawn as a heatmap.
raw_counts = metrics.confusion_matrix(real, predicted)
row_totals = raw_counts.sum(axis=1, keepdims=True)
confusion_matrix = raw_counts.astype('float') / row_totals
plt.figure(figsize=(8, 6))
_ = sns.heatmap(confusion_matrix, cmap='BuGn')
plt.ylabel('True label')
plt.xlabel('Predicted label')

##### Curves

* ROC

In [None]:
def plot_roc(fpr, tpr, auc=None):
    """Plot a ROC curve together with the diagonal reference line.

    :param fpr: false positive rates, used as x values
    :param tpr: true positive rates, used as y values
    :param auc: area under the curve shown in the legend. When omitted it is
        computed from the given points with `metrics.auc`, so the label always
        describes the plotted curve instead of whatever the notebook-global
        `roc_auc` happens to hold.
    """
    if auc is None:
        auc = metrics.auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    lw = 2
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % auc)
    # Diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    # Small margins keep the curve from being drawn on the frame itself.
    plt.xlim([-0.02, 1.0])
    plt.ylim([0.0, 1.02])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Compute the ROC points from the raw scores and draw them.
roc_points = metrics.roc_curve(y_true=real, y_score=predicted_probas)
fpr, tpr, thresholds_roc = roc_points
plot_roc(fpr, tpr)

* Precision-recall curve

In [None]:
def plot_prc(precision, recall):
    """Draw a precision-recall curve.

    Precision goes on the x-axis and recall on the y-axis, as the axis labels
    state.

    :param precision: precision values, used as x values
    :param recall: recall values, used as y values
    """
    line_width = 2
    plt.figure(figsize=(8, 6))
    plt.plot(precision, recall, color='darkorange', lw=line_width, label='PR curve')
    plt.title('Precision-recall curve')
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    # Upper limits slightly above 1 keep the curve off the frame.
    plt.xlim([0.0, 1.01])
    plt.ylim([0.0, 1.02])
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Compute the precision-recall points from the raw scores and draw them.
pr_points = metrics.precision_recall_curve(real, predicted_probas)
prec, rec, thresholds_prc = pr_points
plot_prc(prec, rec)