In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext autotime

from ngram import tune, roc_scorer,spearman_scorer
from baselines import load_comments_and_labels, assemble_data, one_hot
from deep_learning import make_mlp, DenseTransformer
from deep_learning import make_lstm, make_conv_lstm, SequenceTransformer


from sklearn.pipeline import Pipeline
from sklearn.grid_search import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from keras.wrappers.scikit_learn import KerasClassifier
from serialization import save_pipeline, load_pipeline
import joblib
import copy
import pandas as pd

Using TensorFlow backend.


In [2]:
import keras
keras.__version__

'1.1.0'

time: 11.8 ms


### Helpers

In [3]:
def get_best_estimator(cv):
    """
    Build an estimator configured with the best hyper-parameters of a search.

    Parameters
    ----------
    cv : object
        A fitted search object exposing `estimator` (the base estimator that
        was searched over) and `best_params_` (dict of winning parameters).

    Returns
    -------
    A copy of `cv.estimator` with `best_params_` applied. The copy is only as
    fitted as `cv.estimator` was; callers fit it themselves.
    """
    # `set_params` modifies the estimator it is called on and returns that same
    # object. Applying it to `cv.estimator` directly would silently reconfigure
    # the search's base estimator and make every caller share one instance, so
    # we configure an independent copy instead.
    # NOTE(review): deepcopy assumes the base estimator is copyable (true for an
    # unfitted pipeline); confirm if a fitted Keras model can ever be attached.
    model = copy.deepcopy(cv.estimator)
    return model.set_params(**cv.best_params_)
    
def save_best_estimator(cv, directory, name):
    """
    Persist the best-parameter estimator of a finished search.

    The estimator is obtained from `get_best_estimator(cv)` and handed to
    `save_pipeline` together with the target `directory` and file `name`.
    Nothing is returned.
    """
    save_pipeline(get_best_estimator(cv), directory, name)

time: 8.31 ms


### Load Annotated Data

In [4]:
# Name of the annotation task to load; passed unchanged to the loader below.
task = 'attack'
# Slow step: the recorded run of this cell took about five minutes.
data = load_comments_and_labels(task)

time: 5min 1s


### Params

In [5]:
# Directory handed to save_best_estimator for the tuned pipelines.
path = '../../models/cv/'
# Upper bound on the number of rows kept per split (used as a slice limit
# in the data-prep cell); lower it for a quick smoke run.
n_max = 10 ** 7
# Value passed as the `n_iter` argument of every tune() call.
n_iter = 15

time: 2 ms


### Prep Data

In [6]:
# Comments plus 'plurality' labels for the train and dev splits.
X_train, y_train_ohv = assemble_data(data, 'comments', 'plurality', splits=['train'])
X_dev, y_dev_ohv = assemble_data(data, 'comments', 'plurality', splits=['dev'])

# Same splits with 'empirical_dist' labels; the feature half of each result
# is discarded because it was already captured above.
_, y_train_ed = assemble_data(data, 'comments', 'empirical_dist', splits=['train'])
_, y_dev_ed = assemble_data(data, 'comments', 'empirical_dist', splits=['dev'])

# one_hot() is applied to the full empirical-distribution labels, before
# any truncation.
y_train_ohm, y_dev_ohm = one_hot(y_train_ed), one_hot(y_dev_ed)

# Cap every array at n_max rows so features and all label variants stay aligned.
X_train, X_dev = X_train[:n_max], X_dev[:n_max]
y_train_ohv, y_dev_ohv = y_train_ohv[:n_max], y_dev_ohv[:n_max]
y_train_ed, y_dev_ed = y_train_ed[:n_max], y_dev_ed[:n_max]
y_train_ohm, y_dev_ohm = y_train_ohm[:n_max], y_dev_ohm[:n_max]

time: 165 ms


In [7]:
# Accumulates one summary dict per (model, ngram, label) run in the Keras
# sections below. Re-running this cell discards everything collected so far.
results_list = []

time: 1.86 ms


# Sklearn Experiments

Let's run some quick experiments in sklearn so that we have baselines for the Keras models built below. We will only fit logistic regressions with one-hot labels. These runs will also show whether tf-idf weighting and normalization are worth using.

In [8]:
# Candidate vocabulary sizes for CountVectorizer, shared by every sklearn search below.
max_features = (5000, 10000, 50000, 100000)
# Candidate values for LogisticRegression's C (inverse regularization strength).
C = (0.0001, 0.001, 0.01, 0.1, 1, 10)

time: 2.87 ms


### No tfidf

In [9]:
# Raw n-gram counts fed directly into a logistic regression (no tf-idf step).
# The step names 'vect' and 'clf' are the prefixes used by the param_grid keys
# in the two search cells that follow.
alg = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])

time: 2.5 ms


In [10]:
# linear char-gram, no tfidf
# Search space: character 1- to 5-grams, varying vocabulary size and C.
param_grid = dict(
    vect__max_features=max_features,
    vect__ngram_range=((1, 5),),
    vect__analyzer=('char',),
    clf__C=C,
)

m = tune(
    X_train, y_train_ohv, X_dev, y_dev_ohv,
    alg, param_grid, n_iter, roc_scorer,
    n_jobs=6, verbose=True,
)


Best parameters set found:
{'clf__C': 10, 'vect__analyzer': 'char', 'vect__max_features': 100000, 'vect__ngram_range': (1, 5)} 0.916608436706


Grid scores:
0.90034 (+/-0.00000) for {'clf__C': 0.1, 'vect__analyzer': 'char', 'vect__max_features': 5000, 'vect__ngram_range': (1, 5)}
0.91030 (+/-0.00000) for {'clf__C': 0.1, 'vect__analyzer': 'char', 'vect__max_features': 100000, 'vect__ngram_range': (1, 5)}
0.89931 (+/-0.00000) for {'clf__C': 10, 'vect__analyzer': 'char', 'vect__max_features': 10000, 'vect__ngram_range': (1, 5)}
0.87261 (+/-0.00000) for {'clf__C': 0.0001, 'vect__analyzer': 'char', 'vect__max_features': 5000, 'vect__ngram_range': (1, 5)}
0.88324 (+/-0.00000) for {'clf__C': 0.0001, 'vect__analyzer': 'char', 'vect__max_features': 100000, 'vect__ngram_range': (1, 5)}
0.91456 (+/-0.00000) for {'clf__C': 0.001, 'vect__analyzer': 'char', 'vect__max_features': 100000, 'vect__ngram_range': (1, 5)}
0.90246 (+/-0.00000) for {'clf__C': 0.01, 'vect__analyzer': 'char', 'vect__max_featu

In [11]:
# linear word-gram, no tfidf
# Search space: word uni- and bi-grams, varying vocabulary size and C.
param_grid = dict(
    vect__max_features=max_features,
    vect__ngram_range=((1, 2),),
    vect__analyzer=('word',),
    clf__C=C,
)

m = tune(
    X_train, y_train_ohv, X_dev, y_dev_ohv,
    alg, param_grid, n_iter, roc_scorer,
    n_jobs=6, verbose=True,
)


Best parameters set found:
{'clf__C': 1, 'vect__analyzer': 'word', 'vect__max_features': 50000, 'vect__ngram_range': (1, 2)} 0.94125639939


Grid scores:
0.93139 (+/-0.00000) for {'clf__C': 0.1, 'vect__analyzer': 'word', 'vect__max_features': 5000, 'vect__ngram_range': (1, 2)}
0.93967 (+/-0.00000) for {'clf__C': 0.1, 'vect__analyzer': 'word', 'vect__max_features': 100000, 'vect__ngram_range': (1, 2)}
0.93643 (+/-0.00000) for {'clf__C': 10, 'vect__analyzer': 'word', 'vect__max_features': 10000, 'vect__ngram_range': (1, 2)}
0.80624 (+/-0.00000) for {'clf__C': 0.0001, 'vect__analyzer': 'word', 'vect__max_features': 5000, 'vect__ngram_range': (1, 2)}
0.80694 (+/-0.00000) for {'clf__C': 0.0001, 'vect__analyzer': 'word', 'vect__max_features': 100000, 'vect__ngram_range': (1, 2)}
0.89202 (+/-0.00000) for {'clf__C': 0.001, 'vect__analyzer': 'word', 'vect__max_features': 100000, 'vect__ngram_range': (1, 2)}
0.92744 (+/-0.00000) for {'clf__C': 0.01, 'vect__analyzer': 'word', 'vect__max_features

### With tfidf

In [None]:
# Same linear baseline as before, with a TfidfTransformer inserted between the
# count vectorizer and the classifier. This rebinds `alg`, so the no-tfidf
# pipeline is no longer reachable after this cell runs.
alg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

time: 5.76 ms


In [None]:
# linear char-gram, tfidf
# Search space: character 1- to 5-grams, with the tf-idf options
# (sublinear tf on/off, no norm vs. l2) added to vocabulary size and C.
param_grid = dict(
    vect__max_features=max_features,
    vect__ngram_range=((1, 5),),
    vect__analyzer=('char',),
    tfidf__sublinear_tf=(True, False),
    tfidf__norm=(None, 'l2'),
    clf__C=C,
)

m = tune(
    X_train, y_train_ohv, X_dev, y_dev_ohv,
    alg, param_grid, n_iter, roc_scorer,
    n_jobs=6, verbose=True,
)

In [None]:
# linear word-gram, tfidf
# Search space: word uni- and bi-grams, with the tf-idf options
# (sublinear tf on/off, no norm vs. l2) added to vocabulary size and C.
param_grid = dict(
    vect__max_features=max_features,
    vect__ngram_range=((1, 2),),
    vect__analyzer=('word',),
    tfidf__sublinear_tf=(True, False),
    tfidf__norm=(None, 'l2'),
    clf__C=C,
)

m = tune(
    X_train, y_train_ohv, X_dev, y_dev_ohv,
    alg, param_grid, n_iter, roc_scorer,
    n_jobs=6, verbose=True,
)

TF-IDF improves the ROC score for both types of n-gram models, although it gives a bigger boost to the char-ngram models.

# Tensorflow/Keras

Now we will cross-validate over model architectures (linear, mlp, lstm, conv-lstm), ngram type (word, char), and label type (one-hot or empirical distribution).

### Linear and MLP
The MLP model class also covers linear models: set `hidden_layer_sizes` to an empty tuple.

In [None]:
# tf-idf n-gram features pass through DenseTransformer before reaching the
# Keras classifier built by make_mlp with a 2-unit output.
# NOTE(review): DenseTransformer presumably converts the sparse tf-idf matrix
# to a dense array for Keras; confirm in deep_learning.py.
alg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('to_dense', DenseTransformer()), 
    ('clf', KerasClassifier(build_fn=make_mlp, output_dim = 2, verbose=False)),
])

# (source param, target param) pairs forwarded to tune().
# NOTE(review): presumably tune() copies the sampled value of the first
# parameter into the second so the network input size matches the vocabulary
# size; confirm in ngram.tune.
dependencies = [( 'vect__max_features', 'clf__input_dim')]

In [None]:

# Vectorizer search spaces: character 1- to 5-grams vs. word uni/bi-grams.
char_vec_params = dict(
    vect__max_features=(5000, 10000, 30000),
    vect__ngram_range=((1, 5),),
    vect__analyzer=('char',),
)

word_vect_params = dict(
    vect__max_features=(5000, 10000, 30000),
    vect__ngram_range=((1, 2),),
    vect__analyzer=('word',),
)

# tf-idf options: l2 normalization is fixed, sublinear tf is searched.
tfidf_params = dict(
    tfidf__sublinear_tf=(True, False),
    tfidf__norm=('l2',),
)

# "Linear" is the MLP builder with no hidden layers (empty tuple).
linear_clf_params = dict(
    clf__alpha=(1e-9, 1e-7, 1e-5, 1e-3, 1e-2),
    clf__hidden_layer_sizes=((),),
    clf__nb_epoch=(2, 4, 8, 16),
    clf__batch_size=(200,),
)

# MLP proper: one to three hidden layers of 50 units each.
mlp_clf_params = dict(
    clf__alpha=(1e-9, 1e-7, 1e-5, 1e-3, 1e-2),
    clf__hidden_layer_sizes=((50,), (50, 50), (50, 50, 50)),
    clf__nb_epoch=(2, 4, 8, 16),
    clf__batch_size=(200,),
)


In [None]:
# Sweep every (architecture, n-gram type, label type) combination: run the
# hyper-parameter search, persist the winning configuration, fit it on the
# training split and record its dev-set scores in results_list.
for model in ['linear', 'mlp']:
    for gram in ['word', 'char']:
        for label in ['oh', 'ed']:
            # Assemble the search space: classifier grid, tf-idf grid and
            # vectorizer grid, merged in that order.
            params = dict(linear_clf_params if model == 'linear' else mlp_clf_params)
            params.update(tfidf_params)
            params.update(char_vec_params if gram == 'char' else word_vect_params)

            # Pick the label representation used for training and tuning.
            if label == 'oh':
                y_train, y_dev = y_train_ohm, y_dev_ohm
            else:
                y_train, y_dev = y_train_ed, y_dev_ed

            print('\n\n\n %s %s %s' % (model, gram, label))
            cv = tune(
                X_train, y_train, X_dev, y_dev,
                alg, params, n_iter, roc_scorer,
                n_jobs=1, verbose=True, dependencies=dependencies,
            )

            # NOTE(review): the estimator is persisted before est.fit() is
            # called below; confirm save_pipeline does not need a fitted model.
            save_best_estimator(cv, path, '%s_%s_%s' % (model, gram, label))
            est = get_best_estimator(cv)
            est.fit(X_train, y_train)

            # Spearman is always scored against the empirical-distribution
            # dev labels, whichever label type the model was trained on.
            best_spearman = spearman_scorer(est, X_dev, y_dev_ed) * 100
            print ("\n best spearman: ", best_spearman)
            # Element 1 of each grid_scores_ entry is the score being maximized.
            best_roc = max(score[1] for score in cv.grid_scores_) * 100
            print ("\n best roc: ", best_roc)

            results_list.append(dict(
                model_type=model,
                ngram_type=gram,
                label_type=label,
                cv=cv.grid_scores_,
                best_roc=round(best_roc, 3),
                best_spearman=round(best_spearman, 3),
            ))

In [None]:
# Tabulate the runs collected so far: one row per entry in results_list.
pd.DataFrame(results_list)

## LSTM

In [86]:
# Sequence model: SequenceTransformer output goes straight into the Keras
# classifier built by make_lstm. Rebinds `alg` and `dependencies` from the
# Linear/MLP section, so run the sections in document order.
alg = Pipeline([
    ('seq', SequenceTransformer()),
    ('clf', KerasClassifier(build_fn=make_lstm, output_dim = 2, verbose=True)),
])

# (source param, target param) pairs forwarded to tune(). In the recorded
# best-params output of the LSTM run, each clf__ target carries the same value
# as its seq__ source, which suggests tune() copies source to target.
# NOTE(review): confirm in ngram.tune.
dependencies = [( 'seq__max_features', 'clf__max_features'),
                ( 'seq__max_len', 'clf__max_len')]

time: 138 ms


In [87]:
# Sequence-encoding search space for word tokens.
word_seq_params = dict(
    seq__max_features=(5000, 10000, 30000),
    seq__max_len=(100, 200, 500),
    seq__analyzer=('word',),
)

# Sequence-encoding search space for characters: fixed vocabulary size of
# 100, longer sequences.
char_seq_params = dict(
    seq__max_features=(100,),
    seq__max_len=(200, 500, 1000),
    seq__analyzer=('char',),
)

# LSTM classifier search space, shared by both token types.
clf_params = dict(
    clf__dropout=(0.1, 0.2, 0.4),
    clf__embedding_size=(64, 128),
    clf__lstm_output_size=(64, 128),
    clf__nb_epoch=(2, 3, 4),
    clf__batch_size=(200,),
)

time: 104 ms


In [88]:
from pprint import pprint

time: 27.7 ms


In [89]:
# Sweep every (n-gram type, label type) combination for the LSTM: search,
# persist the winning configuration, fit it on the training split and record
# its dev-set scores in results_list.
# The recorded run of this cell was interrupted after almost 20 hours, so
# expect a full sweep to be very slow.
model = 'lstm'
for gram in ['word', 'char']:
    for label in ['oh', 'ed']:
        # Classifier grid first, then the sequence grid for this token type.
        params = dict(clf_params)
        params.update(char_seq_params if gram == 'char' else word_seq_params)

        # Pick the label representation used for training and tuning.
        if label == 'oh':
            y_train, y_dev = y_train_ohm, y_dev_ohm
        else:
            y_train, y_dev = y_train_ed, y_dev_ed

        pprint(params)

        print('\n\n\n %s %s %s' % (model, gram, label))
        cv = tune(
            X_train, y_train, X_dev, y_dev,
            alg, params, n_iter, roc_scorer,
            n_jobs=1, verbose=True, dependencies=dependencies,
        )

        # NOTE(review): the estimator is persisted before est.fit() is called
        # below; confirm save_pipeline does not need a fitted model.
        save_best_estimator(cv, path, '%s_%s_%s' % (model, gram, label))
        est = get_best_estimator(cv)
        est.fit(X_train, y_train)

        # Spearman is always scored against the empirical-distribution dev
        # labels, whichever label type the model was trained on.
        best_spearman = spearman_scorer(est, X_dev, y_dev_ed) * 100
        print ("\n best spearman: ", best_spearman)
        # Element 1 of each grid_scores_ entry is the score being maximized.
        best_roc = max(score[1] for score in cv.grid_scores_) * 100
        print ("\n best roc: ", best_roc)

        results_list.append(dict(
            model_type=model,
            ngram_type=gram,
            label_type=label,
            cv=cv.grid_scores_,
            best_roc=round(best_roc, 3),
            best_spearman=round(best_spearman, 3),
        ))

{'clf__batch_size': (200,),
 'clf__dropout': (0.1, 0.2, 0.4),
 'clf__embedding_size': (64, 128),
 'clf__lstm_output_size': (64, 128),
 'clf__nb_epoch': (2, 3, 4),
 'seq__analyzer': ('word',),
 'seq__max_features': (5000, 10000, 30000),
 'seq__max_len': (100, 200, 500)}



 lstm word oh
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/2
Epoch 2/2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/2
Epoch 2/2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/2
Epoch 2/2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/3
Epoch 2/3
Epoch 3/3

Best parameters set found:
{'clf__lstm_output_size': 128, 'seq__max_len': 100, 'seq__max_features': 30000, 'clf__embedding_size': 64, 'clf__dropout': 0.1, 'clf__nb_epoch': 2, 'clf__batch_size': 200, 'clf__max_len': 100, 'seq__analyzer': 'word', 'clf__max_features': 30000}

KeyboardInterrupt: 

time: 19h 51min 17s


## Conv LSTM

In [81]:
# Same two-step pipeline as the LSTM section, with the Keras model built by
# make_conv_lstm instead. Rebinds `alg` and `dependencies`, so the LSTM
# pipeline is no longer reachable after this cell runs.
alg = Pipeline([
    ('seq', SequenceTransformer()),
    ('clf', KerasClassifier(build_fn=make_conv_lstm, output_dim = 2, verbose=True)),
])

# (source param, target param) pairs forwarded to tune().
# NOTE(review): presumably tune() copies each sampled seq__ value into the
# matching clf__ parameter; confirm in ngram.tune.
dependencies = [( 'seq__max_features', 'clf__max_features'),
                ( 'seq__max_len', 'clf__max_len')]

time: 2.3 ms


In [84]:
# Word-level search space: sequence encoding plus convolution filter and
# pooling lengths (short windows).
word_seq_params = dict(
    seq__max_features=(5000, 10000, 30000),
    seq__max_len=(100, 200, 500),
    seq__analyzer=('word',),
    clf__filter_length=(2, 4, 6),
    clf__pool_length=(2, 4, 6),
)

# Character-level search space: fixed vocabulary size of 100, longer
# sequences and wider filter/pool windows.
char_seq_params = dict(
    seq__max_features=(100,),
    seq__max_len=(200, 500, 1000),
    seq__analyzer=('char',),
    clf__filter_length=(5, 10, 15),
    clf__pool_length=(5, 10, 15),
)

# Classifier search space shared by both token types. This rebinds the
# `clf_params` name used by the LSTM section and adds the filter count.
clf_params = dict(
    clf__dropout=(0.1, 0.2, 0.4),
    clf__embedding_size=(64, 128),
    clf__lstm_output_size=(64, 128),
    clf__nb_epoch=(2, 3, 4),
    clf__batch_size=(200,),
    clf__nb_filter=(64, 128),
)

time: 6.62 ms


In [85]:
# Sweep every (n-gram type, label type) combination for the conv-LSTM: search,
# persist the winning configuration, fit it on the training split and record
# its dev-set scores in results_list.
# NOTE(review): the output stored under this cell starts with the char grid
# and shows seq__max_len of (500,), which does not match this code (word
# first, three max_len values), so it came from an earlier revision. That
# run was also interrupted. Re-run before relying on it.
model = 'conv_lstm'
for gram in ['word', 'char']:
    for label in ['oh', 'ed']:
        # Classifier grid first, then the sequence/conv grid for this token type.
        params = dict(clf_params)
        params.update(char_seq_params if gram == 'char' else word_seq_params)

        # Pick the label representation used for training and tuning.
        if label == 'oh':
            y_train, y_dev = y_train_ohm, y_dev_ohm
        else:
            y_train, y_dev = y_train_ed, y_dev_ed

        pprint(params)

        print('\n\n\n %s %s %s' % (model, gram, label))
        cv = tune(
            X_train, y_train, X_dev, y_dev,
            alg, params, n_iter, roc_scorer,
            n_jobs=1, verbose=True, dependencies=dependencies,
        )

        # NOTE(review): the estimator is persisted before est.fit() is called
        # below; confirm save_pipeline does not need a fitted model.
        save_best_estimator(cv, path, '%s_%s_%s' % (model, gram, label))
        est = get_best_estimator(cv)
        est.fit(X_train, y_train)

        # Spearman is always scored against the empirical-distribution dev
        # labels, whichever label type the model was trained on.
        best_spearman = spearman_scorer(est, X_dev, y_dev_ed) * 100
        print ("\n best spearman: ", best_spearman)
        # Element 1 of each grid_scores_ entry is the score being maximized.
        best_roc = max(score[1] for score in cv.grid_scores_) * 100
        print ("\n best roc: ", best_roc)

        results_list.append(dict(
            model_type=model,
            ngram_type=gram,
            label_type=label,
            cv=cv.grid_scores_,
            best_roc=round(best_roc, 3),
            best_spearman=round(best_spearman, 3),
        ))

{'clf__batch_size': (200,),
 'clf__dropout': (0.1, 0.2, 0.4),
 'clf__embedding_size': (64, 128),
 'clf__filter_length': (5, 10, 15),
 'clf__lstm_output_size': (64, 128),
 'clf__nb_epoch': (2, 3, 4),
 'clf__nb_filter': (64, 128),
 'clf__pool_length': (5, 10, 15),
 'seq__analyzer': ('char',),
 'seq__max_features': (100,),
 'seq__max_len': (500,)}



 conv_lstm char oh
Epoch 1/2
 7800/69490 [==>...........................] - ETA: 535s - loss: 0.3815 - acc: 0.8628

KeyboardInterrupt: 

time: 13min 37s
