In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
import joblib

import datetime
import os
import numpy as np
import time
import multiprocessing as mp
import re 

In [2]:
import inspect  # kept so later cells that may rely on it still find the name
import os
import sys

# Make the parent directory importable so that sibling packages can be loaded.
# A notebook cell has no real source file: asking `inspect` for the current
# frame's file yields a synthetic cell name (or a temporary file), so the
# kernel's working directory is used as the notebook's location instead.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)

# Keep the parent directory at the front of the search path, but do not stack
# a duplicate entry every time this cell is re-run.
if not sys.path or sys.path[0] != parentdir:
    sys.path.insert(0, parentdir)
from data_generation.diff_utils import clean_and_filter

# Build Models

In [3]:
from ngram import *
from baselines import *
from deep_learning import *
from ngram import *

Using TensorFlow backend.


#### Load Training Data

In [4]:
# Pool the 'random' and 'blocked' annotation frames stored under the 'user'
# and 'article' keys into a single training frame.
annotations = load_annotations()
all_annotations = pd.concat([
    annotations[namespace][sample]
    for namespace in ('user', 'article')
    for sample in ('random', 'blocked')
])

# Only the concatenated frame is used from here on.
del annotations

#### Build NGram Feature Extractor

In [5]:
# Cap on the n-gram vocabulary; the classifiers below use the same value as
# their input dimension.
n_features = 10000

# Character 1- to 5-gram counts -> sublinear, L2-normalised TF-IDF ->
# DenseTransformer (project helper; presumably densifies the matrix -- confirm).
feature_pipeline = Pipeline([
    (
        'vect',
        CountVectorizer(
            analyzer='char',
            ngram_range=(1, 5),
            max_features=n_features,
        ),
    ),
    ('tfidf', TfidfTransformer(norm='l2', sublinear_tf=True)),
    ('to_dense', DenseTransformer()),
])

# One cleaned diff per distinct rev_id.
comments = all_annotations.drop_duplicates(subset=['rev_id'])['clean_diff']

# Fit the extractor on those comments, then featurise the same comments.
preprocessor = feature_pipeline.fit(comments)
X = preprocessor.transform(comments)

#### Train Recipient Model

In [6]:
# Result of empirical_dist for the 'recipient' annotations, restricted to and
# ordered by the index of `comments`, then unwrapped to a raw array.
recipient_ed = (
    empirical_dist(all_annotations['recipient'])
    .loc[comments.index]
    .values
)

In [7]:
# Recipient model trained directly on the `recipient_ed` targets.
# `layers=[]` asks make_MLP for no extra layers; `l` is presumably a
# regularisation strength -- confirm against make_MLP.
m_rec_ed = KerasClassifier(
    build_fn=make_MLP,
    input_dim=n_features,
    output_dim=2,
    layers=[],
    l=1e-7,
    nb_epoch=8,
    batch_size=100,
)
m_rec_ed.fit(X, recipient_ed)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x1561c5f98>

In [8]:
# Logistic-regression recipient model fit on hard labels: the index of the
# largest entry in each row of `recipient_ed`.
m_rec_oh = (
    LogisticRegression(solver='sag', C=5.0)
    .fit(X, recipient_ed.argmax(axis=1))
)

#### Train Aggression Model

In [9]:
# Result of empirical_dist for the 'aggression' annotations, restricted to and
# ordered by the index of `comments`, then unwrapped to a raw array.
aggression_ed = (
    empirical_dist(all_annotations['aggression'])
    .loc[comments.index]
    .values
)

In [10]:
# Aggression model trained directly on the `aggression_ed` targets
# (three output units, four epochs).
# `layers=[]` asks make_MLP for no extra layers; `l` is presumably a
# regularisation strength -- confirm against make_MLP.
m_agg_ed = KerasClassifier(
    build_fn=make_MLP,
    input_dim=n_features,
    output_dim=3,
    layers=[],
    l=1e-7,
    nb_epoch=4,
    batch_size=100,
)
m_agg_ed.fit(X, aggression_ed)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1562da9b0>

In [11]:
# Logistic-regression aggression model fit on hard labels: the index of the
# largest entry in each row of `aggression_ed`.
m_agg_oh = (
    LogisticRegression(solver='sag', C=5.0)
    .fit(X, aggression_ed.argmax(axis=1))
)

In [12]:
# Training is finished: drop the feature matrix and the pooled annotations.
del X, all_annotations

#### Load annotated diffs

In [13]:
def apply_models(df):
    """Score every row of `df` with all four trained models.

    The 'clean_diff' column is featurised once with the fitted
    `preprocessor`, and the same features are fed to each model.

    Columns added, in this order: 'pred_aggression_score',
    'pred_aggression_score_oh', 'pred_recipient_score',
    'pred_recipient_score_oh'.

    Returns the frame handed back by the last scoring helper; the helpers
    write their column onto the frame they are given.
    """
    features = preprocessor.transform(df['clean_diff'])

    # (scoring helper, model, column suffix) -- order fixes the column order.
    scoring_plan = (
        (apply_aggression_model, m_agg_ed, ''),
        (apply_aggression_model, m_agg_oh, '_oh'),
        (apply_recipient_model, m_rec_ed, ''),
        (apply_recipient_model, m_rec_oh, '_oh'),
    )
    for score_with, model, suffix in scoring_plan:
        df = score_with(df, features, model, suffix)
    return df
       
def apply_aggression_model(df, features, model, suffix):
    """Add a scalar aggression score column to `df`.

    `model.predict_proba(features)` must give three values per row. They are
    combined with weights (1, 0, -1), so the score is the first value minus
    the third. The result is stored in 'pred_aggression_score<suffix>'.

    `df` is modified in place and also returned.
    """
    # Weight per probability column; the middle column does not contribute.
    class_weights = np.array([1, 0, -1])
    class_probs = model.predict_proba(features)
    df['pred_aggression_score%s' % suffix] = class_probs.dot(class_weights)
    return df
    
def apply_recipient_model(df, features, model, suffix):
    """Add a recipient score column to `df`.

    The score is the second column (index 1) of
    `model.predict_proba(features)`, stored in
    'pred_recipient_score<suffix>'.

    `df` is modified in place and also returned.
    """
    class_probs = model.predict_proba(features)
    score_column = 'pred_recipient_score%s' % suffix
    df[score_column] = class_probs[:, 1]
    return df
    

In [14]:
# Write one aggregated-label file per namespace from its 'random' annotations.
annotations = load_annotations()

for ns in ('user', 'article'):
    d_annotations = annotations[ns]['random']

    # Remap the raw aggression label x to (x - 1) * -1, i.e. 0 -> 1, 1 -> 0,
    # 2 -> -1. This overwrites the column on the loaded frame itself.
    d_annotations['aggression'] = d_annotations['aggression'].sub(1).mul(-1)

    recipient_labels = d_annotations['recipient'].dropna()
    aggression_labels = d_annotations['aggression'].dropna()

    # One row per rev_id, with the label columns replaced by the outputs of
    # the plurality / average helpers.
    d_annotated = d_annotations.drop_duplicates(subset=['rev_id']).assign(
        recipient=plurality(recipient_labels),
        recipient_score=average(recipient_labels),
        aggression=plurality(aggression_labels),
        aggression_score=average(aggression_labels),
    )

    d_annotated.to_csv('../../data/samples/%s/clean/d_annotated.tsv' % ns, sep='\t')

del annotations

#### Load samples and apply models

We take various diff datasets from Hive, apply the clean-and-filter function, and then score the cleaned diffs using the models.

In [15]:
def pred_helper(df):
    """Normalise column types on a chunk of diffs and score it.

    Returns None for a chunk with no rows. Otherwise a copy of `df` is made
    with 'rev_timestamp' parsed by `pd.to_datetime` and 'clean_diff' cast to
    str, and that copy is passed through `apply_models`.
    """
    if not len(df):
        return None

    prepared = df.assign(
        rev_timestamp=pd.to_datetime(df.rev_timestamp),
        clean_diff=df['clean_diff'].astype(str),
    )
    return apply_models(prepared)

    
def prep_in_parallel(path, k = 8):
    """Read a tab-separated diff file, score it in chunks and return the result.

    Rows are assigned at random to roughly one group per 10,000 rows; each
    group is scored with `pred_helper` and the scored groups are concatenated.
    Rows therefore come back grouped by chunk rather than in file order, and
    the helper column 'key' is left on the returned frame.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded TSV file to score.
    k : int
        Intended worker count. Currently unused: the multiprocessing path is
        disabled and chunks are scored sequentially in this process.

    Returns
    -------
    pandas.DataFrame
        The scored rows of every chunk.

    Raises
    ------
    ValueError
        If the file contains no rows.
    """
    df = pd.read_csv(path, sep = '\t', encoding = 'utf-8')
    m = df.shape[0]
    if m == 0:
        raise ValueError('no rows to score in %s' % path)

    # Files with fewer than 10,000 rows used to give n_groups == 0, which made
    # np.random.randint(0, high=0) raise; always keep at least one group.
    n_groups = max(1, m // 10000)
    df['key'] = np.random.randint(0, high=n_groups, size=m)

    scored = [pred_helper(chunk) for _, chunk in df.groupby('key')]
    # pred_helper returns None for an empty chunk; skip those explicitly.
    scored = [chunk for chunk in scored if chunk is not None]
    return pd.concat(scored)

In [16]:
# Root of the sample data; inputs are read from <base>/<ns>/clean/ and the
# scored outputs are written under the same file name to <base>/<ns>/scored/.
base = '../../data/samples/'
nss = ['user', 'article']
samples = [
    'd_annotated.tsv',
    'talk_diff_no_admin_sample.tsv',
    'talk_diff_no_admin_2015.tsv',
    'all_blocked_user.tsv',
]

# Columns written for every sample.
base_cols = [
    'rev_id',
    'clean_diff',
    'rev_timestamp',
    'pred_aggression_score',
    'pred_aggression_score_oh',
    'pred_recipient_score',
    'pred_recipient_score_oh',
    'page_title',
    'user_text',
    'user_id',
]
# Label columns, written only for the annotated sample.
extra_cols = ['recipient', 'recipient_score', 'aggression', 'aggression_score']

for ns in nss:
    for s in samples:
        inf = os.path.join(base, ns, 'clean', s)
        print(inf)
        outf = os.path.join(base, ns, 'scored', s)
        cols = base_cols + extra_cols if s == 'd_annotated.tsv' else base_cols
        scored = prep_in_parallel(inf, k = 4)
        scored[cols].to_csv(outf, sep = '\t', index = False)

../../data/samples/user/clean/d_annotated.tsv
../../data/samples/user/clean/talk_diff_no_admin_sample.tsv
../../data/samples/user/clean/all_blocked_user.tsv
../../data/samples/article/clean/d_annotated.tsv
../../data/samples/article/clean/talk_diff_no_admin_sample.tsv