# Model from WikiMedia

In [1]:
import sys,os
sys.path.append('ClonedModel/wmModel/wiki-detox/src/modeling/')

import ngram
from baselines import load_comments_and_labels, assemble_data, one_hot
from deep_learning import make_mlp, DenseTransformer


from sklearn.pipeline import Pipeline
from sklearn.grid_search import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from keras.wrappers.scikit_learn import KerasClassifier
from serialization import save_pipeline, load_pipeline
import joblib
import copy
import pandas as pd

Using TensorFlow backend.


### Load Training Data

In [2]:
# Some Helper Functions with WikiMedia

def empirical_dist(l, w = 0.0, index = None):
    """
    Compute the empirical distribution over all classes
    using all labels that share the same index value (rev_id).

    Parameters
    ----------
    l : pandas.Series
        Individual labels. Rows are grouped by the Series index, so every
        distinct index value yields one row of the result.
    w : float, optional
        Constant added to every class count before normalising
        (additive smoothing). Defaults to 0.0, i.e. no smoothing.
    index : sequence, optional
        Class values to use as columns, in that order. When None or empty,
        the sorted distinct non-null values of ``l`` are used.

    Returns
    -------
    pandas.DataFrame
        One row per distinct index value of ``l`` and one column per class;
        each row is divided by its own sum.
    """
    # An explicit check is required: `not index` raises "truth value is
    # ambiguous" for numpy arrays and pandas Index objects.
    if index is None or len(index) == 0:
        index = sorted(set(l.dropna().values))

    data = {}
    for k, g in l.groupby(l.index):
        # reindex() inserts NaN for classes absent from this group; count them as 0
        data[k] = g.value_counts().reindex(index).fillna(0) + w

    # dict of per-group Series -> columns are groups; transpose to one row per group
    labels = pd.DataFrame(data).T
    labels = labels.fillna(0)
    labels = labels.div(labels.sum(axis=1), axis=0)
    return labels


def load_and_parse_training(data_dir, task):
    """
    Load the annotated comments and their annotations for ``task`` and
    return them as aligned arrays.

    Parameters
    ----------
    data_dir : str
        Directory containing ``<task>_annotated_comments.tsv`` and
        ``<task>_annotations.tsv``.
    task : str
        File-name prefix and name of the annotation column to use as label.

    Returns
    -------
    (X, y)
        ``X`` is the array of comment strings and ``y`` the array of
        per-comment empirical label distributions, both ordered by the
        first column of their file (used as index).

    Raises
    ------
    AssertionError
        If comments and labels do not cover exactly the same index values.
    """
    COMMENTS_FILE = "%s_annotated_comments.tsv" % task
    LABELS_FILE = "%s_annotations.tsv" % task
    comments = pd.read_csv(os.path.join(data_dir, COMMENTS_FILE), sep = '\t', index_col = 0)

    # remove special newline and tab tokens (single pass over the column)
    comments['comment'] = comments['comment'].apply(
        lambda x: x.replace("NEWLINE_TOKEN", " ").replace("TAB_TOKEN", " "))

    annotations = pd.read_csv(os.path.join(data_dir, LABELS_FILE),  sep = '\t', index_col = 0)
    labels = empirical_dist(annotations[task])

    comments = comments.sort_index()
    labels = labels.sort_index()

    assert comments.shape[0] == labels.shape[0], \
        "number of comments and number of labelled ids differ"
    # Equal row counts alone do not guarantee that row i of X belongs to
    # row i of y; the sorted indexes must match element by element.
    assert comments.index.equals(labels.index), \
        "comments and labels are indexed by different ids"

    X = comments['comment'].values
    y = labels.values
    return X, y

In [3]:
# Load dataset
# DATA_DIR must contain <task>_annotated_comments.tsv and <task>_annotations.tsv
DATA_DIR = 'TalkData/computed_dataset/'
# `task` selects both the file-name prefix and the annotation column used as label
task = 'attack'
# X: array of comment strings, y: array of per-comment label distributions
%time [X,y] = load_and_parse_training(DATA_DIR, task)

CPU times: user 1min 59s, sys: 648 ms, total: 2min
Wall time: 1min 59s


### Load Best Hyper-Parameters from WikiMedia

In [4]:
# This input needs to be changed to match where the wiki-detox repository was cloned.
# NOTE: despite the `_DIR` suffix this is the path of the cv_results.csv file itself,
# not of a directory; it is handed to pd.read_csv by load_best_params.
CV_RESULTS_DIR = 'ClonedModel/wmModel/wiki-detox/src/modeling/cv_results.csv'

In [5]:
import json
def load_best_params(cv_results_dir, model_type, ngram_type, label_type):
    '''
    Look up the stored hyper-parameters for one model configuration.

    Input:
    ======
    cv_results_dir: path (or file-like object) of the "cv_results" CSV file
                    of the WikiMedia model; it needs the columns
                    model_type, ngram_type, label_type and best_params
    model_type:     value to match in the model_type column
    ngram_type:     value to match in the ngram_type column
    label_type:     value to match in the label_type column

    Output:
    =======
    The JSON-decoded best_params entry of the first matching row.

    Raises:
    =======
    IndexError: if no row matches the requested combination
    '''
    cv_results = pd.read_csv(cv_results_dir)

    # Boolean masks instead of a string-built query: values that contain
    # quotes can no longer break the expression. str() mirrors the '%s'
    # formatting the query string used to apply to each argument.
    selected = cv_results[
        (cv_results['model_type'] == str(model_type))
        & (cv_results['ngram_type'] == str(ngram_type))
        & (cv_results['label_type'] == str(label_type))
    ]
    if selected.empty:
        # Same exception type that .iloc[0] on an empty selection raises,
        # but with a message that says what was being looked up.
        raise IndexError(
            "no cv result for model_type=%r, ngram_type=%r, label_type=%r"
            % (model_type, ngram_type, label_type))

    return json.loads(selected['best_params'].iloc[0])


# Keys identifying which row of cv_results.csv to read the hyper-parameters from.
MODEL_TYPE = 'mlp'
NGRAM_TYPE = 'char'
# NOTE(review): 'ed' presumably stands for "empirical distribution" labels -
# confirm against the wiki-detox cv_results.csv
LABEL_TYPE = 'ed'
# dict of pipeline parameters, later applied with PIPELINE.set_params(**best_params)
best_params = load_best_params(CV_RESULTS_DIR, MODEL_TYPE, 
                               NGRAM_TYPE, LABEL_TYPE)


### Train the Model

In [6]:
# the MLP model was set by WikiMedia
from sklearn.model_selection import train_test_split
# Text -> n-gram counts -> tf-idf weights -> dense matrix -> Keras MLP classifier.
# The step names are the prefixes that the keys of best_params must use.
PIPELINE = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('to_dense', DenseTransformer()), 
    ('clf', KerasClassifier(build_fn=make_mlp, 
                            output_dim = 2, 
                            verbose=True)),
]) 
# Overwrite the default step parameters with the stored best hyper-parameters.
PIPELINE.set_params(**best_params)
# *args is used to pass a non-keyworded, variable length argument list
# **kwargs is used to pass a keyworded, variable length argument list
# Hold out 15% of the data for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.15, random_state=0)
%time PIPELINE.fit(X_train, y_train)

Epoch 1/2
Epoch 2/2
CPU times: user 3min 56s, sys: 17.4 s, total: 4min 14s
Wall time: 2min 58s


Pipeline(steps=[('vect', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=[1, 5], preprocessor=None, stop_words=None,
        stri...t 0x7f970bf8e0b8>), ('clf', <keras.wrappers.scikit_learn.KerasClassifier object at 0x7f970bf8e7b8>)])

### Evaluate the Model

In [7]:
# Using the roc_scorer and spearman_scorer from WikiMedia
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import numpy as np
import pandas as pd
from scipy.stats import pearsonr,spearmanr
from scipy.stats import entropy as kl
from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error

def expectation(y):
    """
    Return, for every row of ``y``, the sum of the column positions
    0..n-1 weighted by that row's entries (the expected class position
    when a row is a distribution over classes).
    """
    n_classes = y.shape[1]
    class_positions = np.arange(n_classes)
    return y.dot(class_positions)

def multi_class_roc_auc(true, pred, average = 'macro'):
    """
    ROC AUC of ``pred`` against ``true``.

    ``true`` is first converted with ``one_hot`` (imported from the
    wiki-detox ``baselines`` module); the result and ``pred`` are passed to
    sklearn's ``roc_auc_score`` using the given ``average`` mode.
    """
    true_one_hot = one_hot(true)
    score = roc_auc_score(true_one_hot, pred, average = average)
    return score

def multi_class_spearman(true, pred):
    """
    Spearman rank correlation between the per-row expectations of ``true``
    and ``pred``. Returns whatever ``scipy.stats.spearmanr`` returns
    (correlation and p-value).
    """
    expected_true = expectation(true)
    expected_pred = expectation(pred)
    return spearmanr(expected_true, expected_pred)

def roc_scorer(clf, X, y):
    """Score ``clf`` on ``X`` by the multi-class ROC AUC of its predicted probabilities against ``y``."""
    return multi_class_roc_auc(y, clf.predict_proba(X))

def spearman_scorer(clf, X, y):
    """Score ``clf`` on ``X`` by the Spearman correlation of its predicted probabilities against ``y``."""
    probabilities = clf.predict_proba(X)
    result = multi_class_spearman(y, probabilities)
    return result

def eval_multiclass_classifier(model, X, y, 
                               plot = False, verbose = True):
    """
    Evaluate a fitted classifier on ``X`` against the labels ``y``.

    Parameters
    ----------
    model : object with a ``predict_proba`` method
    X : inputs passed to ``model.predict_proba``
    y : true labels, one row per input
    plot : bool
        When True, only the ROC plotter is called and nothing is returned.
    verbose : bool
        When True (and ``plot`` is False), print both scores.

    Returns
    -------
    (roc, spearman) when ``plot`` is False, where ``spearman`` is the full
    result of ``multi_class_spearman`` (correlation and p-value);
    None when ``plot`` is True.
    """
    # The labels arrive as the parameter `y`; the previous code read an
    # undefined name `true` and failed with NameError.
    true_oh = one_hot(y)
    pred = model.predict_proba(X)

    if plot:
        # NOTE(review): multi_class_roc_plotter is neither defined nor imported
        # in the visible part of this notebook - confirm it is in scope before
        # calling with plot=True.
        multi_class_roc_plotter(true_oh, pred, plot = plot)
    else:
        roc = multi_class_roc_auc(true_oh, pred, average = 'macro')
        spearman = multi_class_spearman(y, pred)

        if verbose:
            print('\tROC: %.3f' % roc)
            # spearmanr yields (correlation, pvalue); '%.3f' accepts a single
            # number, so format the correlation only.
            print('\tSpearman: %.3f' % spearman[0])
        return roc, spearman

In [8]:
# Evaluate the model on the held-out split.
# Each scorer calls PIPELINE.predict_proba(X_test) itself, so the
# prediction is computed once per metric.

%time spearman = spearman_scorer(PIPELINE, X_test, y_test)
print ("\n spearman: ", spearman)

%time roc = roc_scorer(PIPELINE, X_test, y_test)
print ("\n roc: ", roc)

Wall time: 24.5 s

 spearman:  SpearmanrResult(correlation=0.6684288488622987, pvalue=0.0)
Wall time: 24.7 s

 roc:  0.964303025205


**Explanation for the Inconsistency with WikiMedia's Reported Results**  
The model and the hyper-parameters are exactly the same,  
but the train-test split is different, so the scores do not match exactly.

# Test Model

In [15]:
# prediction format: one row per input, one probability column per class
PIPELINE.predict_proba(X_test)



array([[ 0.93885863,  0.06114138],
       [ 0.91706741,  0.08293263],
       [ 0.983253  ,  0.01674701],
       ..., 
       [ 0.67994553,  0.3200545 ],
       [ 0.97207707,  0.027923  ],
       [ 0.87985152,  0.12014853]], dtype=float32)

In [21]:
# label format for comparison: the true labels of the test split after one_hot()
one_hot(y_test)

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       ..., 
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]])

In [59]:
# Score a single hand-written sentence with the trained pipeline.
sentence = 'Why the Spring break is so short???'
# Display names for the two probability columns, in column order.
classes = ['notAttack', 'Attack']
# predict_proba expects a collection of documents, hence the one-element list
prediction = PIPELINE.predict_proba([sentence])



In [60]:
# Show the sentence followed by the two class names and their probabilities
# (row 0 of `prediction`, rounded to two decimals).
print('Prediction for: \"{}\" \n\n{} \t{} \n{:.2f} \t\t{:.2f}'.format(sentence,
                                              classes[0], classes[1], 
                                              prediction[0,0], prediction[0,1]))

Prediction for: "Why the Spring break is so short???" 

notAttack 	Attack 
0.84 		0.16
