# Experimenting with FastText

In [85]:
import sys,os
sys.path.append('ClonedModel/wmModel/wiki-detox/src/modeling/')

from sklearn.pipeline import Pipeline
from sklearn.grid_search import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import joblib
import copy
import pandas as pd

In [2]:
# Some Helper Functions with WikiMedia
import numpy as np

def empirical_dist(l, w = 0.0, index = None):
    """
    Compute empirical distribution over all classes
    using all labels with the same rev_id

    Parameters:
    ===========
    l = a pandas Series of labels; rows that share an index value are
        grouped together as annotations of the same item
    w = constant added to every class count before normalising
    index = optional list-like of class values used as the output columns;
        when omitted (or empty) the sorted distinct non-null labels of `l`
        are used

    Return:
    =======
    labels = a DataFrame with one row per distinct index value of `l` and
        one column per class, each row divided by its own sum
    """
    # `not index` raises "truth value is ambiguous" for numpy arrays and
    # pandas Indexes, so check for None / emptiness explicitly.
    if index is None or len(index) == 0:
        index = sorted(set(l.dropna().values))

    data = {}
    for k, g in l.groupby(l.index):
        # classes that never occur in this group get a count of 0 (+ w)
        data[k] = g.value_counts().reindex(index).fillna(0) + w

    labels = pd.DataFrame(data).T
    labels = labels.fillna(0)
    labels = labels.div(labels.sum(axis=1), axis=0)
    return labels

def plurality(l):
    """
    Majority vote: for every distinct index value (rev_id) keep the label
    that occurs most often among the rows sharing that index.

    Return:
    =======
    s = a Series named 'y' holding one label per distinct index value
    """
    def _most_frequent(votes):
        # value_counts() lists labels by descending frequency
        return votes.value_counts().index[0]

    winners = l.groupby(l.index).apply(_most_frequent)
    return winners.rename('y')

def one_hot(y):
    """
    Convert labels to one-hot vectors.

    Parameters:
    ===========
    y = array-like; either a 1-D sequence of integer class labels, or a
        2-D array of per-class scores (one row per sample), in which case
        the arg-max column of each row is used as the label

    Return:
    =======
    y_oh = an array of vectors (one-hot vectors)
    """
    # Accept lists / pandas objects as well as ndarrays.
    y = np.asarray(y)
    m = y.shape[0]

    if len(y.shape) == 1:
        idxs = y.astype(int)
        n = len(set(y.ravel()))
        if m > 0:
            # The number of distinct labels is too narrow when the labels
            # present do not cover 0..k-1 (e.g. [1, 1] or [0, 2]); make sure
            # the largest label always has a column.
            n = max(n, int(idxs.max()) + 1)
    else:
        idxs = y.argmax(axis = 1)
        n = y.shape[1]

    y_oh = np.zeros((m, n))
    y_oh[np.arange(m), idxs] = 1
    return y_oh

def load_and_parse_training(data_dir, task, data_type):
    """
    Load the annotated comments and their labels for one task.

    Parameters:
    ===========
    data_dir = directory containing "<task>_annotated_comments.tsv" and
        "<task>_annotations.tsv"
    task = task name; used as the file-name prefix and as the name of the
        label column read from the annotations file
    data_type = 'empirical' (per-class label distribution from
        empirical_dist) or 'onehot' (most common label from plurality)

    Return:
    =======
    X = array of comment strings, ordered by the comments file's index
    y = for 'empirical', a 2-D array of label distributions ordered by
        index; for 'onehot', the Series returned by plurality

    Raises:
    =======
    ValueError if data_type is not one of the two supported values
    """
    # Fail before the (slow) file parsing; previously an unknown data_type
    # surfaced only at the end as an UnboundLocalError on `y`.
    if data_type not in ('empirical', 'onehot'):
        raise ValueError(
            "data_type must be 'empirical' or 'onehot', got %r" % (data_type,))

    COMMENTS_FILE = "%s_annotated_comments.tsv" % task
    LABELS_FILE = "%s_annotations.tsv" % task
    comments = pd.read_csv(os.path.join(data_dir, COMMENTS_FILE), sep = '\t', index_col = 0)

    # remove special newline and tab tokens
    comments['comment'] = comments['comment'].apply(
        lambda x: x.replace("NEWLINE_TOKEN", " ").replace("TAB_TOKEN", " "))

    annotations = pd.read_csv(os.path.join(data_dir, LABELS_FILE),  sep = '\t', index_col = 0)

    X = comments.sort_index()['comment'].values

    if data_type == 'empirical':
        labels = empirical_dist(annotations[task])
        y = labels.sort_index().values
    else:
        y = plurality(annotations[task])

    assert X.shape[0] == y.shape[0], \
        "comments and labels differ in length: %d vs %d" % (X.shape[0], y.shape[0])
    return X, y
    



In [3]:
# Load dataset
DATA_DIR = 'TalkData/computed_dataset/'
task = 'attack'
%time [X,yEmp] = load_and_parse_training(DATA_DIR, task, 'empirical')
yOneHot = one_hot(yEmp)


# [_,yPlurality] = load_and_parse_training(DATA_DIR, task, 'onehot') will disagree with yOneHot when prob = 0.5 ()
# this occurs 0.869%

CPU times: user 1min 59s, sys: 460 ms, total: 2min
Wall time: 2min


## Make Dataset

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the comments for evaluation; the fixed seed keeps the split repeatable.
split = train_test_split(X, yOneHot, test_size=0.15, random_state=0)
X_train, X_test, y_train, y_test = split

# Two-column frames: column 0 holds the comment text, column 1 its one-hot label vector.
DATA_TRAINING = pd.DataFrame([X_train, y_train]).T
DATA_TESTING = pd.DataFrame([X_test, y_test]).T

In [5]:
def label_name(y):
    """Map a one-hot label vector to its class name ('notAttack' / 'Attack').

    Values that are already strings are returned unchanged, so this cell can
    be re-run after the column has been converted without raising on
    `str.argmax`.
    """
    if isinstance(y, str):
        return y
    return 'notAttack' if y.argmax() == 0 else 'Attack'


# Same treatment for both splits: name the columns, then replace the one-hot
# vectors in 'Category' with readable class names.
for frame in (DATA_TRAINING, DATA_TESTING):
    frame.columns = ['Text', 'Category']
    frame['Category'] = frame['Category'].apply(label_name)

In [11]:
TRAINING_DATA_dir = 'ATTACK_TRAINING.txt'
TESTING_DATA_dir = 'ATTACK_TESING.txt'


def write_fasttext_file(frame, path):
    """Write one '<text>. __label__<category>' line to `path` per row of `frame`.

    `frame` must have 'Text' and 'Category' columns.
    """
    # Write UTF-8 explicitly instead of relying on the platform default
    # encoding, which may not be able to encode every comment.
    with open(path, 'w', encoding='utf-8') as f:
        for text, category in zip(frame['Text'], frame['Category']):
            f.write('{}. __label__{}\n'.format(text, category))


write_fasttext_file(DATA_TRAINING, TRAINING_DATA_dir)
write_fasttext_file(DATA_TESTING, TESTING_DATA_dir)

# make sure they are different (reduce human error)
with open(TRAINING_DATA_dir, 'r', encoding='utf-8') as f:
    training = f.readlines()
with open(TESTING_DATA_dir, 'r', encoding='utf-8') as f:
    testing = f.readlines()

assert len(training) != len(testing)

# Training and Evaluating FastText

In [83]:
# Training
import subprocess
FASTTEXT_dir = 'ClonedModel/fastText/'
assert(os.path.exists(FASTTEXT_dir))

OUTPUT_MODEL_dir = 'model'
# Pass the command as an argument list (no shell) so paths containing spaces
# or shell metacharacters cannot break or alter the command, and use
# check_call so a failed training run raises instead of being ignored.
# NOTE(review): `sudo` is kept from the original command; confirm it is
# actually required to run the fasttext binary.
subprocess.check_call(["sudo", os.path.join(FASTTEXT_dir, "fasttext"),
                       "supervised",
                       "-input", TRAINING_DATA_dir,
                       "-output", OUTPUT_MODEL_dir,
                       "-epoch", "2"])

0

In [84]:
# Testing
PRED_FILE_dir = 'PREDICTED.txt'
fasttext_bin = os.path.join(FASTTEXT_dir, "fasttext")
model_file = "{}.bin".format(OUTPUT_MODEL_dir)

# Argument lists instead of shell strings; check_call raises on a non-zero
# exit code so later cells never read a stale or empty prediction file.
# NOTE(review): `sudo` is kept from the original commands; confirm it is needed.
with open(PRED_FILE_dir, 'w') as pred_file:
    # stdout is redirected through the file handle (replaces the shell's `>`)
    subprocess.check_call(["sudo", fasttext_bin, "predict", model_file, TESTING_DATA_dir],
                          stdout=pred_file)
subprocess.check_call(["sudo", fasttext_bin, "test", model_file, TESTING_DATA_dir])

0

In [89]:
# Using the roc_scorer and spearman_scorer from WikiMedia
from scipy.stats import spearmanr
from sklearn.metrics import roc_auc_score

def expectation(y):
    """
    Weight every column of `y` by its column position (0, 1, 2, ...) and
    sum across each row.

    Return:
    =======
    one value per row of `y`
    """
    n_classes = y.shape[1]
    class_ids = np.arange(n_classes)
    return y.dot(class_ids)

def multi_class_roc_auc(true, pred, average = 'macro'):
    """
    ROC AUC of `pred` against the one-hot encoding of `true`.

    `true` is passed through one_hot() first; `average` is forwarded
    unchanged to sklearn's roc_auc_score.
    """
    true_one_hot = one_hot(true)
    return roc_auc_score(true_one_hot, pred, average = average)

def multi_class_spearman(true, pred):
    """
    Spearman rank correlation between the per-row expectation() of `true`
    and of `pred`; returns whatever scipy's spearmanr returns.
    """
    expected_true = expectation(true)
    expected_pred = expectation(pred)
    return spearmanr(expected_true, expected_pred)


In [90]:
LABEL_PREFIX = '__label__'


def parse_prediction(line):
    """Turn one line of the prediction file into a one-hot row.

    Returns [1, 0] for 'notAttack' and [0, 1] for any other label.
    """
    label = line.strip()
    # str.strip('__label__') strips any of the characters "_", "l", "a", "b",
    # "e" from both ends rather than removing the prefix; it only worked
    # because neither class name starts or ends with one of them. Remove the
    # prefix explicitly instead.
    if label.startswith(LABEL_PREFIX):
        label = label[len(LABEL_PREFIX):]
    return [1, 0] if label == 'notAttack' else [0, 1]


with open(PRED_FILE_dir, 'r') as f:
    preds = np.array([parse_prediction(line) for line in f])

In [91]:

# Score the categorical fastText predictions against the held-out one-hot labels.
roc = multi_class_roc_auc(y_test, preds)
spearman = multi_class_spearman(y_test, preds)

print("ROC:\n \t{}".format(roc))
print("SPEARMAN:\n \t{}".format(spearman))

ROC:
 	0.7772895349049003
SPEARMAN:
 	SpearmanrResult(correlation=0.65849963956608937, pvalue=0.0)


# Conclusions  
**FastText** performs worse on this dataset. The Spearman score is similar, but the ROC AUC is significantly worse. I suspect this is because the predictions are categorical rather than real-valued. Still, FastText offers no significant advantage other than being extremely fast (well under 1 s with FastText versus a few seconds with charCNN).