In [1]:
from __future__ import division
import sys
import argparse
import configparser
import json
import h5py as h5
import os
from os.path import isfile
from joblib import Parallel, delayed
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

sys.path.append('../../Utils')
from utils import print_timestamped_message
sys.path.append('../WAC_Utils')
from wac_utils import filter_X_by_filelist, filter_refdf_by_filelist
from wac_utils import filter_relational_expr
from wac_utils import create_word2den, make_X_id_index, make_mask_matrix
from wac_utils import train_this_word, get_X_for_word

In [2]:
config = configparser.ConfigParser()
my_config = '../../Config/default.cfg'
with open(my_config, 'r', encoding='utf-8') as f:
    config.read_file(f)

dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')
preproc_path = dsgv_home + '/Preproc/PreprocOut/'
feats_path = dsgv_home + '/ExtractFeats/ExtractOut/'

In [3]:
# Define classifier
"""
Options for classifier:     LogisticRegression
                            SGDClassifier
                            
Options for LogisticRegression solver:
        L1:  liblinear, saga
        L2:  lbfgs, liblinear, saga
"""
classifier = linear_model.LogisticRegression
classf_params = {
    'penalty': 'l2', 
    'warm_start': True,
    'solver': 'lbfgs',
    'max_iter': 500
}

# Model description:
model = {
    'wrdl':  'min', # wordlist: minimal n occurrences...
    'wprm':  40,   # ... 40 times
    'clsf':  'logreg',
    'params': classf_params,
    'scaled': True,
    'nneg':  'balanced', # maximum neg instances
    'nsrc':  'randmax',# ... randomly selected
    'notes': ''
}


ID_FEATS = 3 # The first features in the image feature Xs encode the region ID
N_JOBS = 2  # how many threads to run in parallel during training

In [4]:
# ========================= DATA =================================
print_timestamped_message('loading up data.', indent=4)

with open(preproc_path + 'saiapr_90-10_splits.json', 'r') as f:
    s_splits = json.load(f)

# Image features
with h5.File(feats_path + 'saiapr_bbdf_rsn50-max.hdf5') as f:
    X = np.array(f["img_feats"])
    X_t = filter_X_by_filelist(X, s_splits['train'])

# Referring expressions
saiapr_refdf = pd.read_json(preproc_path + 'saiapr_refdf.json.gz',
                            typ='frame', orient='split', compression='gzip')

saiapr_train = filter_refdf_by_filelist(saiapr_refdf, s_splits['train'])
saiapr_train = filter_relational_expr(saiapr_train)

# ======================= Intermediate ==============================
print_timestamped_message('creating intermediate data structures', indent=4)

word2den = create_word2den(saiapr_train)
X_idx = make_X_id_index(X_t)
mask_matrix = make_mask_matrix(X_t, X_idx, word2den, word2den.keys())

# ======================= Wordlist ==============================
print_timestamped_message('selecting words to train models for', indent=4)

min_freq = model['wprm']
counts = mask_matrix.sum(axis=1)
wordlist = np.array(list(word2den.keys()))[counts > min_freq]

     [ 2021-08-06 @ 21:56:44 ] loading up data.
     [ 2021-08-06 @ 21:56:49 ] creating intermediate data structures
     [ 2021-08-06 @ 21:56:57 ] selecting words to train models for


In [5]:
def train_this_word2(X, word2den, mask_matrix, neg_max,
                    classifier, classf_params, this_word):
    X_this_w, y_this_w = get_X_for_word(X, word2den, mask_matrix, this_word)
    print('.', end='')
    sys.stdout.flush()
    clf = make_pipeline(StandardScaler(),
                        classifier(**classf_params))
    clf.fit(X_this_w, y_this_w)
    return (this_word, int(y_this_w.sum()), len(X_this_w), clf)

In [6]:
# ======================= TRAIN ==============================
print_timestamped_message('and training the %d WACs!' % (len(wordlist)), indent=4)

wacs = Parallel(n_jobs=N_JOBS, require='sharedmem', prefer='threads')\
               (delayed(train_this_word2)(X_t, word2den, mask_matrix, 
                                          model['nneg'],
                                          classifier, classf_params, 
                                          this_word)
                for this_word in wordlist[:5])

print('')  # newline, because train_this_word prints . as progress bar
print_timestamped_message('DONE!')

     [ 2021-08-06 @ 21:56:58 ] and training the 408 WACs!
.....
 [ 2021-08-06 @ 21:57:35 ] DONE!


In [7]:
# Validation
print(model)

for this_word, npos, _, this_clsf in wacs:
    X_tst, y_tst = get_X_for_word(X_t, word2den, mask_matrix, this_word)
    print(this_word, npos, '\t', this_clsf.score(X_tst, y_tst))

{'wrdl': 'min', 'wprm': 40, 'clsf': 'logreg', 'params': {'penalty': 'l2', 'warm_start': True, 'solver': 'lbfgs', 'max_iter': 500}, 'scaled': True, 'nneg': 'balanced', 'nsrc': 'randmax', 'notes': ''}
at 1282 	 0.9308335682736585
very 1012 	 0.9434132876451552
top 5798 	 0.8446391193115745
guy 2033 	 0.9262923796123996
in 7223 	 0.7644638724607868


In [None]:
"""
{'wrdl': 'min', 'wprm': 40, 'clsf': 'logreg', 'penalty': 'L2', 'solver': 'lbfgs', 
'scaled': 'False', 'nneg': 5, 'nsrc': 'randmax', 'notes': ''}
at 1282 	 0.060426651630485856
very 1012 	 0.04825813820673901
top 5798 	 0.2247848670439569
guy 2033 	 0.09394998411473698
in 7223 	 0.2653638467472358
the 12257 	 0.3799795393247977
middle 3704 	 0.15630273371582856
front 1870 	 0.08564243255601281
upper 995 	 0.04743986663491308
right 13293 	 0.3993331931637281

{'wrdl': 'min', 'wprm': 40, 'clsf': logreg, 
'params': {'penalty': 'l2', 'warm_start': True, 'solver': 'lbfgs', 'max_iter': 500}, 
'scaled': False, 'nneg': 'balanced', 'nsrc': 'randmax', 'notes': ''}
at 1282 	 0.5616013532562729
very 1012 	 0.6421568627450981
top 5798 	 0.7731607101325684
guy 2033 	 0.8038850814687061
in 7223 	 0.6648422289975389
the 12257 	 0.5882444120655982
middle 3704 	 0.649510631117111
front 1870 	 0.6480566986739826
upper 995 	 0.6662538699690402
right 13293 	 0.8308653470699546

{'wrdl': 'min', 'wprm': 40, 'clsf': logreg, 
'params': {'penalty': 'l1', 'warm_start': True, 'solver': 'liblinear', 'max_iter': 500}, 
'scaled': False, 'nneg': 'balanced', 'nsrc': 'randmax', 'notes': ''}
at 1282 	 0.5699652288318767
very 1012 	 0.6271654292785075
top 5798 	 0.7765330645786495
guy 2033 	 0.7979394544546816
in 7223 	 0.6607647944752599
the 12257 	 0.5914995194841429
middle 3704 	 0.6650354370570368
front 1870 	 0.6750342935528121
upper 995 	 0.6997856632531555
right 13293 	 0.8350103625386718

{'wrdl': 'min', 'wprm': 40, 'clsf': 'logreg', 
'params': {'penalty': 'l1', 'warm_start': True, 'solver': 'saga', 'max_iter': 500}, 
'scaled': False, 'nneg': 'balanced', 'nsrc': 'randmax', 'notes': 'Took ages to run'}
at 1282 	 0.5761206653510008
very 1012 	 0.6339710641538169
top 5798 	 0.7405225211256686
guy 2033 	 0.8115553941814551
in 7223 	 0.6620504720273298
the 12257 	 0.5857023281768299
middle 3704 	 0.612470469119136
front 1870 	 0.6656149977137632
upper 995 	 0.6769707073112646
right 13293 	 0.8256390232180939

{'wrdl': 'min', 'wprm': 40, 'clsf': 'logreg', 'params': {'penalty': 'l2', 'warm_start': True, 'solver': 'lbfgs', 'max_iter': 500}, 'scaled': True, 'nneg': 'balanced', 'nsrc': 'randmax', 'notes': ''}
at 1282 	 0.9315853773141622
very 1012 	 0.942175899486008
top 5798 	 0.8487479649585239
guy 2033 	 0.9266554713384468
in 7223 	 0.7635087977078205
the 12257 	 0.6357069783302849
middle 3704 	 0.8515862301721229
front 1870 	 0.9093735711019661
upper 995 	 0.9451297928078114
right 13293 	 0.8444718108911783
"""

In [8]:
# Get test data
X_ts = filter_X_by_filelist(X, s_splits['test'])
refdf_test = filter_refdf_by_filelist(saiapr_refdf, s_splits['test'])

In [9]:
sys.path.append('../../ApplyModels')
from apply_model import *

wac_dicts = {}
for this_word, npos, n, this_clsf in wacs:
    wac_dicts[this_word] = {'npos': npos, 'n': n, 'clsf': this_clsf}
    
results = eval_testdf(refdf_test, wac_dicts, X_ts[:100])
results

 26%|██▌       | 3090/12010 [00:00<00:00, 15764.03it/s]

[0.10479095 0.04162125 0.47354669 0.12323037 0.84501008 0.31373362
 0.71916427 0.35411171 0.58238099 0.70882224 0.71068944]
[0.18122706 0.20565396 0.27723284 0.16185127 0.42484992 0.24658902
 0.35749493 0.17637187 0.19216327]


 54%|█████▎    | 6441/12010 [00:00<00:00, 16380.08it/s]

[0.02116639 0.12925639 0.10060524 0.13057926]
[0.73434393 0.35779622 0.69502023 0.46600537 0.69508736 0.69301789
 0.59582114 0.24955255 0.41891994 0.26335985 0.18370524 0.62799687
 0.55073586 0.49267701]
[0.73434393 0.35779622 0.69502023 0.46600537 0.69508736 0.69301789
 0.59582114 0.24955255 0.41891994 0.26335985 0.18370524 0.62799687
 0.55073586 0.49267701]
[2.47967347e-03 6.90331187e-03 4.22017035e-04 2.81918017e-03
 5.84378372e-02 4.37989182e-01 3.98848775e-05 4.29953695e-05
 4.70022078e-03 6.81779829e-03 6.41044312e-03 1.72987859e-03
 2.97007223e-02 1.45964657e-02]
[0.73434393 0.35779622 0.69502023 0.46600537 0.69508736 0.69301789
 0.59582114 0.24955255 0.41891994 0.26335985 0.18370524 0.62799687
 0.55073586 0.49267701]
[0.04205685 0.00038443 0.08887916 0.00381218 0.01976109 0.01758622
 0.00904325 0.15338806 0.23912345 0.10302183 0.29149572 0.10328845
 0.01502773 0.17893491]
[0.73434393 0.35779622 0.69502023 0.46600537 0.69508736 0.69301789
 0.59582114 0.24955255 0.41891994 0.2633

100%|██████████| 12010/12010 [00:00<00:00, 16637.71it/s]

[0.1080756  0.25413358 0.05266036 0.07015937 0.05016108 0.24989
 0.44593562 0.70423399 0.05130146 0.21329333 0.33834645]
[4.50019002e-02 2.12486660e-02 9.97385291e-03 1.49768106e-04
 1.75681967e-03 2.37252808e-03 1.75154405e-04 1.09539753e-02
 3.70290339e-01 2.14161450e-01 6.29315023e-01]





Unnamed: 0,i_corpus,image_id,region_id,r_corpus,rex_id,refexp,tagged,cov,suc,rnk,nob,is_rel
0,0,14576,1,referit,2,seal,"[[seal, NN]]",0.0,False,,,False
1,0,14576,3,referit,81312,rock left of seal,"[[rock, NN], [left, VBD], [of, IN], [seal, NN]]",0.0,False,,,True
2,0,14576,2,referit,96551,bottom left corner,"[[bottom, NN], [left, VBD], [corner, NN]]",0.0,False,,,False
3,0,20909,1,referit,49,kid,"[[kid, NN]]",0.0,False,,,False
4,0,20909,3,referit,35225,dirt ground,"[[dirt, NN], [ground, NN]]",0.0,False,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...
12005,0,40508,1,referit,116219,blue shirt right,"[[blue, JJ], [shirt, NN], [right, NN]]",0.0,False,,,False
12006,0,17169,1,referit,116923,outside the moon,"[[outside, IN], [the, DT], [moon, NN]]",0.0,False,,,False
12007,0,30950,1,referit,118084,water,"[[water, NN]]",0.0,False,,,False
12008,0,32289,1,referit,118143,group of people at bottom,"[[group, NN], [of, IN], [people, NNS], [at, IN...",,,,,False


In [14]:
summarise_eval(results)

Unnamed: 0,acc,mrr,acv,rnd
0,0.069192,0.563565,0.052249,0.177456


In [25]:
results.loc[results.suc==True]

Unnamed: 0,i_corpus,image_id,region_id,r_corpus,rex_id,refexp,tagged,cov,suc,rnk,nob,is_rel
12,0,2050,11,referit,24785,guy in black jacket first,"[[guy, NN], [in, IN], [black, JJ], [jacket, NN...",0.400000,True,1.0,12.0,False
36,0,19314,8,referit,80527,sky top left,"[[sky, NN], [top, NN], [left, VBD]]",0.333333,True,1.0,8.0,False
41,0,31573,1,referit,29544,guy,"[[guy, NN]]",1.000000,True,1.0,6.0,False
52,0,9794,5,referit,26917,the guy that might be about to take a picture,"[[the, DT], [guy, NN], [that, WDT], [might, MD...",0.100000,True,1.0,8.0,False
59,0,9794,1,referit,70210,top,"[[top, NN]]",1.000000,True,1.0,8.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...
11976,0,32378,1,referit,100283,top left,"[[top, JJ], [left, VBD]]",0.500000,True,1.0,1.0,False
11979,0,35884,2,referit,100643,top left,"[[top, JJ], [left, VBD]]",0.500000,True,1.0,2.0,False
11983,0,31967,2,referit,109702,grass very bottom,"[[grass, NN], [very, RB], [bottom, JJ]]",0.333333,True,1.0,2.0,False
12008,0,32289,1,referit,118143,group of people at bottom,"[[group, NN], [of, IN], [people, NNS], [at, IN...",0.200000,True,1.0,1.0,False


In [10]:
word2den_ts = create_word2den(refdf_test)
X_idx_ts = make_X_id_index(X_ts)
mask_matrix_ts = make_mask_matrix(X_ts, X_idx_ts, word2den_ts, word2den_ts.keys())

In [14]:
# Testing
for this_word, npos, _, this_clsf in wacs:
    X_tst, y_tst = get_X_for_word(X_ts, word2den_ts, mask_matrix_ts, this_word, neg_max='balanced')
    print(this_word, npos, '\t', 
          this_clsf.score(X_tst, y_tst))

at 1282 	 0.525
very 1012 	 0.5630252100840336
top 5798 	 0.7288135593220338
guy 2033 	 0.6478658536585366
in 7223 	 0.5981220657276995
the 12257 	 0.5431841831425598
middle 3704 	 0.5838206627680312
front 1870 	 0.5470085470085471
upper 995 	 0.5560344827586207
right 13293 	 0.814773980154355


In [50]:
wordinfo = [e[:-1] for e in wacs]
type(wordinfo[0][0])


numpy.str_

In [34]:
classf_params = {
        'penalty': 'l2',
        'warm_start': True,
        'solver': 'lbfgs',
        'max_iter': 500
}

# Model description:
model = {
        'rcorp': 'referit',        # ref corpus
        'cnn': 'rsn50-max',        # CNN used for vision feats
        'rel':   'excl',           # exclude relational expressions
        'wrdl':  'min',            # wordlist: minimal n occurrences...
        'wprm':  40,               # ... 40 times
        'clsf':  'logreg-l1',      # logistic regression, l1 regularized
        'params': classf_params,
        'scaled': True,
        'nneg':  'balanced',                # maximum neg instances
        'nsrc':  'randmax',        # ... randomly selected
        'notes': ''
}
    


In [51]:
with open('test.json', 'w') as f:
    json.dump((model, wordinfo), f)