In [48]:
from __future__ import division
import json
import h5py as h5
from collections import Counter, defaultdict
import configparser
import sys
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.utils import shuffle

from joblib import Parallel, delayed

In [2]:
#%load_ext line_profiler

In [38]:
config = configparser.ConfigParser()
my_config = '../../Config/default.cfg'
with open(my_config, 'r', encoding='utf-8') as f:
    config.read_file(f)

corpora_base = config.get('DEFAULT', 'corpora_base')
dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')

preproc_path = dsgv_home + '/Preproc/PreprocOut/'
feats_path = dsgv_home + '/ExtractFeats/ExtractOut/'

# The first features in the image feature Xs encode the region ID
ID_FEATS = 3

In [39]:
with h5.File(feats_path + 'saiapr_bbdf_rsn50-max.hdf5') as f:
    n1 = np.array(f["img_feats"])

n1.shape


(99527, 2058)

In [40]:
with open(preproc_path + 'saiapr_90-10_splits.json', 'r') as f:
    s_splits = json.load(f)
    
# X = np.load(feats_path + 'mscoco_vgg19-fc2.npz')['arr_0']
with h5.File(feats_path + 'saiapr_bbdf_rsn50-max.hdf5') as f:
    X = np.array(f["img_feats"])

saiapr_refdf = pd.read_json(preproc_path + 'saiapr_refdf.json.gz',
                         typ='frame', orient='split', compression='gzip')

In [41]:
def filter_X_by_filelist(X, filelist):
    if type(X) == np.ndarray:
        tmp_df = pd.DataFrame(X)
        return np.array(tmp_df[tmp_df.iloc[:, 1].isin(filelist)])
    else:  # assume that X is a dask array
        image_id_list = X[:, 1].compute()
        train_mask = np.isin(image_id_list, filelist)
        return X[train_mask]

def filter_refdf_by_filelist(refdf, filelist):
    return pd.merge(refdf, pd.DataFrame(filelist, columns=['image_id']))

In [42]:
X_t = filter_X_by_filelist(X, s_splits['train'])
refdf_train = filter_refdf_by_filelist(saiapr_refdf, s_splits['train'])
refdf_train

Unnamed: 0,i_corpus,image_id,region_id,r_corpus,rex_id,refexp,tagged
0,0,8756,2,referit,0,sunray at very top,"[[sunray, NN], [at, IN], [very, RB], [top, JJ]]"
1,0,8756,10,referit,1515,guy in the middle in front,"[[guy, NN], [in, IN], [the, DT], [middle, NN],..."
2,0,8756,9,referit,2252,upper right corner,"[[upper, JJ], [right, NN], [corner, NN]]"
3,0,8756,6,referit,5631,any of the four people on left,"[[any, DT], [of, IN], [the, DT], [four, CD], [..."
4,0,8756,1,referit,10052,sky top left,"[[sky, NN], [top, NN], [left, VBD]]"
...,...,...,...,...,...,...,...
108066,0,24719,1,referit,119562,head,"[[head, NN]]"
108067,0,31705,1,referit,119566,jersey,"[[jersey, NN]]"
108068,0,40528,1,referit,119624,right middle,"[[right, RB], [middle, NN]]"
108069,0,30772,1,referit,120023,tall tree,"[[tall, DT], [tree, NN]]"


In [43]:
def create_word2den(refdf, refcol='refexp', regcol='region_id'):
    '''Given refdf, returns dict of occurences (id triples) of words from expressions.'''
    word2den = defaultdict(list)
    for _, row in refdf.iterrows():
        exprlist = row[refcol].split()
        # TODO: Could take filter function that filters out some occurences.
        #   E.g., tagger that tags whole expression & returns only the nouns.
        for word in exprlist:
            word_den_list = word2den[word].append((row['i_corpus'],
                                                   row['image_id'],
                                                   row[regcol]))
    return {k: list(set(v)) for k,v in word2den.items()}

In [44]:
%%time
word2den = create_word2den(refdf_train)

CPU times: user 7.12 s, sys: 43.9 ms, total: 7.16 s
Wall time: 7.16 s


In [10]:
# word2den is an index of vocab to list of occurrences (i_corpus, image_id, region_id)
word2den['gorilla']

[(0, 40630, 1), (0, 40628, 3), (0, 40630, 3), (0, 40628, 1), (0, 40630, 2)]

In [45]:
def make_X_id_index(X, id_feats=ID_FEATS):
    return dict(zip([tuple(e) for e in X[:,:id_feats].astype(int).tolist()], range(len(X))))

In [46]:
%%time
X_idx = make_X_id_index(X_t)

CPU times: user 74.5 ms, sys: 291 ms, total: 365 ms
Wall time: 480 ms


In [47]:
len(X_idx)

89536

In [14]:
def make_mask_matrix(X, X_idx, word2den, wordlist):
    mask_matrix = []
    for this_word in wordlist:
        this_word_vec = np.zeros(len(X))
        if this_word in word2den:
            this_word_vec[[X_idx[i] for i in word2den[this_word] if i in X_idx]] = 1
        mask_matrix.append(this_word_vec)
    mask_matrix = np.array(mask_matrix, dtype=bool)
    return mask_matrix

In [15]:
%%time
mask_matrix = make_mask_matrix(X_t, X_idx, word2den, word2den.keys())

CPU times: user 1.58 s, sys: 2.64 s, total: 4.22 s
Wall time: 6.46 s


In [16]:
mask_matrix

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [17]:
## N.B.: Replace with make_X_for_word from below! Can be used for extracting
##   test data as well..

def make_train_for_word(X, word2den, mask_matrix, word, neg_max=20000):
    if word not in word2den:
        #raise ValueError("No mask available for this word! (%s)" % (word))
        print("Error!! No mask available for this word! (%s)" % (word))
        return None
    this_mask = mask_matrix[list(word2den.keys()).index(word)]
    #this_mask = mask_matrix[list(word2den)[word]]
    X_pos = X[this_mask, ID_FEATS:]
    y_pos = np.ones(len(X_pos), dtype=int)
    
    neg_indx = np.arange(mask_matrix.shape[1])[~this_mask]
    np.random.shuffle(neg_indx)
    X_neg = X[neg_indx[:neg_max], ID_FEATS:]
    y_neg = np.zeros(len(X_neg), dtype=int)

    X_out = np.concatenate([X_pos, X_neg], axis=0)
    y_out = np.concatenate([y_pos, y_neg])
    return shuffle(X_out, y_out)

In [18]:
#%lprun -T prof1 -f make_train_for_word X_this_w, y_this_w = make_train_for_word(X_t, word2den, mask_matrix, 'cow')

Sped up `make_train_for_word` by limiting the size of the negative set. Was 40secs, now 3 secs. Still slower than I would like. But selecting a very large portion of the matrix with a boolean vector seems to be very slow. Maybe there is a more clever way to do it?

In [19]:
%%time
X_this_w, y_this_w = make_train_for_word(X_t, word2den, mask_matrix, 'cow')

CPU times: user 141 ms, sys: 429 ms, total: 571 ms
Wall time: 580 ms


In [20]:
mask_matrix.shape

(9497, 89545)

Reduce the set of words for which WAC is trained, by frequency:

In [21]:
min_freq = 40

counts = mask_matrix.sum(axis=1)

wordlist = np.array(list(word2den.keys()))[counts > min_freq]
len(wordlist)

512

In [22]:
X_t

array([[0.00000000e+00, 1.12000000e+02, 1.00000000e+00, ...,
        6.61273148e-01, 1.33333333e+00, 1.07406290e-01],
       [0.00000000e+00, 1.12000000e+02, 2.00000000e+00, ...,
        4.94444444e-02, 1.33333333e+00, 7.78918410e-01],
       [0.00000000e+00, 1.12000000e+02, 3.00000000e+00, ...,
        7.77719907e-02, 1.33333333e+00, 7.31304010e-01],
       ...,
       [0.00000000e+00, 4.06880000e+04, 1.00000000e+00, ...,
        5.06938657e-01, 1.33333333e+00, 1.13749237e-01],
       [0.00000000e+00, 4.06890000e+04, 1.00000000e+00, ...,
        1.53125000e-01, 1.33333333e+00, 2.45837571e-01],
       [0.00000000e+00, 4.06890000e+04, 2.00000000e+00, ...,
        2.12245370e-01, 1.33333333e+00, 5.66580876e-01]])

In [23]:
# Scaling
scaling = False
if scaling:
    from sklearn import preprocessing
    scaler = preprocessing.StandardScaler().fit(X_t)
    X_scaled = scaler.transform(X_t)
    X_scaled

In [90]:
def train_this_word(X, word2den, mask_matrix, this_word):
    X_this_w, y_this_w = make_train_for_word(X_t, word2den, mask_matrix, this_word)
    print(this_word, X_this_w.shape[0])
    classifier = linear_model.LogisticRegression(penalty='l2', warm_start=True, max_iter=400)
    this_wac = classifier.fit(X_this_w, y_this_w)
    return (this_word, y_this_w.sum(), len(X_this_w), this_wac)

In [None]:
%%time
wacs = [train_this_word(X, word2den, mask_matrix, this_word)\
        for this_word in wordlist[:10]]

In [None]:
%%time
wacs = Parallel(n_jobs=2, require='sharedmem', prefer='threads')\
               (delayed(train_this_word)(X, word2den, mask_matrix, this_word)\
                for this_word in wordlist[:10])

Distributing over two cores seems to be worth it. Diminishing returns for more cores.

Could still try to train on keras? https://gist.github.com/fchollet/b7507f373a3446097f26840330c1c378

In [80]:
wacs[0]

('at', 1471, 21471, LogisticRegression(max_iter=400, warm_start=True))

Remaining tasks:

* evaluation? Run models on training data (with smaller n_neg... maybe balanced? should be option in make_train... which might better be called make_word_dataset...)
* how to persist models.. Write out weight matrix and wordlist to disk, as numpy structures? scikit learn objects not very well serialisable?

In [36]:
def get_X_for_word(X, word2den, mask_matrix, word, neg_max=5):

    if word not in word2den:
        # raise ValueError("No mask available for this word! (%s)" % (word))
        print("Error!! No mask available for this word! (%s)" % (word))
        return None
    this_mask = mask_matrix[list(word2den.keys()).index(word)]
    X_pos = X[this_mask, ID_FEATS:]
    y_pos = np.ones(len(X_pos), dtype=int)

    # print('made it here!', X_pos.shape)

    if neg_max == 0:
        return X_pos, y_pos
    
    if neg_max == 'balanced':
        neg_max = len(y_pos)

    neg_indx = np.arange(mask_matrix.shape[1])[~this_mask]
    np.random.shuffle(neg_indx)
    X_neg = X[neg_indx[:neg_max], ID_FEATS:]
    y_neg = np.zeros(len(X_neg), dtype=int)

    X_out = np.concatenate([X_pos, X_neg], axis=0)
    y_out = np.concatenate([y_pos, y_neg])
    return shuffle(X_out, y_out)

In [29]:
def train_this_word(X, word2den, mask_matrix, classf_params, this_word):
    X_this_w, y_this_w = get_X_for_word(X, word2den, mask_matrix, this_word)
    print(this_word, X_this_w.shape[0])
    
    clf = make_pipeline(StandardScaler(),
                        SGDClassifier(**classf_params))
    clf.fit(X_this_w, y_this_w)
    return (this_word, y_this_w.sum(), len(X_this_w), clf)

In [32]:


classf_params = {'penalty': 'l1', 'warm_start': True}

In [37]:
%%time
wacs = Parallel(n_jobs=2, require='sharedmem', prefer='threads')\
               (delayed(train_this_word)(X, word2den, mask_matrix, classf_params, this_word)\
                for this_word in wordlist[:10])

IndexError: boolean index did not match indexed array along dimension 0; dimension is 99527 but corresponding boolean dimension is 89545

In [None]:
for this_word, npos, _, this_clsf in wacs:
    X_tst, y_tst = get_X_for_word(X_t, word2den, mask_matrix, this_word)
    print(this_word, npos, '\t', 
          this_clsf.score(X_tst, y_tst))

In [93]:
for this_word, npos, _, this_clsf in wacs:
    X_tst, y_tst = get_X_for_word(X_t, word2den, mask_matrix, this_word)
    print(this_word, npos, '\t', 
          this_clsf.score(X_tst, y_tst))

at 1471 	 0.5720598232494901
very 1127 	 0.6548358473824313
top 6202 	 0.766446307642696
guy 2804 	 0.7378744650499287
in 9546 	 0.6580243033731406
the 17007 	 0.6033104015993415
middle 4676 	 0.6408254918733961
front 2992 	 0.6415441176470589
upper 1086 	 0.7343462246777164
right 16454 	 0.8317430412057858


Performance on training data (!) unsuprisingly pretty good...

In [28]:
rc_all_test = rc_splits['testA'] + rc_splits['testB']
X_ts = filter_X_by_filelist(X, rc_all_test)
refdf_test = filter_refdf_by_filelist(refcoco_refdf, rc_all_test)

In [29]:
word2den_ts = create_word2den(refdf_test)
X_idx_ts = make_X_id_index(X_ts)
mask_matrix_ts = make_mask_matrix(X_ts, X_idx_ts, word2den_ts, word2den_ts.keys())

In [60]:
for this_word, npos, _, this_clsf in wacs:
    print this_word, npos, '\t',
    X_tst, y_tst = get_X_for_word(X_ts, word2den_ts, mask_matrix_ts, this_word, neg_max='balanced')
    print this_clsf.score(X_tst, y_tst)

yellow 896 	0.6483516483516484
wooden 53 	0.6
hanging 59 	0.5
second 1922 	0.6525
kids 56 	0.5
glass 533 	0.7073170731707317
hot 186 	0.65
wine 184 	0.71875
backpack 100 	0.5833333333333334
silver 123 	0.5


In [30]:
for this_word, npos, _, this_clsf in wacs:
    print this_word, npos, '\t',
    X_tst, y_tst = get_X_for_word(X_ts, word2den_ts, mask_matrix_ts, this_word, neg_max='balanced')
    print this_clsf.score(X_tst, y_tst)

yellow 896 	0.6813186813186813
wooden 53 	0.6
hanging 59 	0.5
second 1922 	0.5925
kids 56 	0.55
glass 533 	0.7195121951219512
hot 186 	0.675
wine 184 	0.75
backpack 100 	0.5833333333333334
silver 123 	0.5


It's looking not at all so great on the test set... (Although this is not the evaluation that is of ultimate interest here.)

TODO:
- persisting the trained model... As weight matrix? (Together with wordlist & other interesting data.)

In [39]:
this_wac = wacs[0][3]

In [42]:
this_wac.intercept_

array([-0.53120489])

In [45]:
np.stack([np.append(this_wac.coef_, this_wac.intercept_) \
          for this_wac in [w[3] for w in wacs]]).shape
    

(10, 2056)

In [46]:
sys.stdout.flush()

In [50]:
for this_wac in zip([e[:-1] for e in wacs]):
    print this_wac

((u'yellow', 896, 20896),)
((u'wooden', 53, 20053),)
((u'hanging', 59, 20059),)
((u'second', 1922, 21922),)
((u'kids', 56, 20056),)
((u'glass', 533, 20533),)
((u'hot', 186, 20186),)
((u'wine', 184, 20184),)
((u'backpack', 100, 20100),)
((u'silver', 123, 20123),)


In [53]:
wl = [e[:-1] for e in wacs]

In [55]:
    model = {
        'rcorp': 'refcoco',        # ref corpus
        'cnn': 'rsn50-flatten_1',  # CNN used for vision feats
        'rel':   'excl',           # exclude relational expressions
        'wrdl':  'min',            # wordlist: minimal n occurrences...
        'wprm':  40,               # ... 40 times
        'clsf':  'logreg-l1',      # logistic regression, l1 regularized
        'nneg':  20000,            # maximally 20k neg instances
        'nsrc':  'randmax',        # ... randomly selected
        'notes': ''
    }

In [56]:
json.dumps((model, wl))

'[{"nneg": 20000, "rcorp": "refcoco", "clsf": "logreg-l1", "rel": "excl", "cnn": "rsn50-flatten_1", "notes": "", "wrdl": "min", "nsrc": "randmax", "wprm": 40}, [["yellow", 896, 20896], ["wooden", 53, 20053], ["hanging", 59, 20059], ["second", 1922, 21922], ["kids", 56, 20056], ["glass", 533, 20533], ["hot", 186, 20186], ["wine", 184, 20184], ["backpack", 100, 20100], ["silver", 123, 20123]]]'

In [57]:
weightmatrix_ld = np.load('../ModelsOut/mod01_refcoco.npz')

In [58]:
weightmatrix = weightmatrix_ld['arr_0']

In [59]:
weightmatrix.shape

(10, 2056)

In [60]:
X.shape

(196118, 2058)