In [1]:
from __future__ import division
import sys
import configparser
import json
import h5py as h5
import gzip
import pickle
import os
from os.path import isfile
from joblib import Parallel, delayed
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

sys.path.append('../../Utils')
from utils import print_timestamped_message
sys.path.append('../WAC_Utils')
from wac_utils import filter_X_by_filelist, filter_refdf_by_filelist
from wac_utils import filter_relational_expr
from wac_utils import create_word2den, make_X_id_index, make_mask_matrix
from wac_utils import train_this_word, get_X_for_word

In [2]:
config = configparser.ConfigParser()
my_config = '../../Config/default.cfg'
with open(my_config, 'r', encoding='utf-8') as f:
    config.read_file(f)

dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')
preproc_path = dsgv_home + '/Preproc/PreprocOut/'
feats_path = dsgv_home + '/ExtractFeats/ExtractOut/'
model_path = dsgv_home + '/WACs/ModelsOut/'

In [3]:
# Define classifier
"""
Options for classifier:     LogisticRegression
                            SGDClassifier
                            
Options for LogisticRegression solver:
        L1:  liblinear, saga
        L2:  lbfgs, liblinear, saga
"""
classifier = linear_model.LogisticRegression
classf_params = {
    'penalty': 'l2', 
    'warm_start': True,
    'solver': 'lbfgs',
    'max_iter': 500
}

# Model description:
model = {
    'wrdl':  'min', # wordlist: minimal n occurrences...
    'wprm':  40,   # ... 40 times
    'clsf':  'logreg',
    'params': classf_params,
    'scaled': True,
    'nneg':  2000, # maximum neg instances
    'nsrc':  'randmax',# ... randomly selected
    'notes': ''
}


ID_FEATS = 3 # The first features in the image feature Xs encode the region ID
N_JOBS = 2  # how many threads to run in parallel during training

In [4]:
# ========================= DATA =================================
print_timestamped_message('loading up data.', indent=4)

with open(preproc_path + 'fr_splits.json', 'r') as f:
    splits = json.load(f)

# Image features
with h5.File(feats_path + 'saiapr_bbdf_rsn50-max.hdf5') as f:
    X = np.array(f["img_feats"])
X_tr = filter_X_by_filelist(X, splits['train'])
print('X_tr shape:', X_tr.shape)

refdf = pd.read_pickle(preproc_path + 'FR_small_dataset.pkl')
print('refdf shape:', refdf.shape)

refdf_tr = filter_refdf_by_filelist(refdf, splits['train'])
print('Training dataset:')
refdf_tr

     [ 2021-08-13 @ 23:32:30 ] loading up data.
X_tr shape: (2569, 2058)
refdf shape: (10159, 7)
Training dataset:


Unnamed: 0,i_corpus,image_id,region_id,r_corpus,rex_id,refexp_EN,refexp
0,0,14576,1,referit,2,seal,phoque
1,0,14576,2,referit,96551,bottom left corner,coin inférieur gauche
2,0,25137,2,referit,71,bottom right con\rner,coin en bas à droite
3,0,25137,1,referit,83240,plane,avion
4,0,26897,1,referit,86,door,porte
...,...,...,...,...,...,...,...
2570,0,39495,1,referit,112761,sign,panneau
2571,0,31742,1,referit,114345,green part of court,partie verte de la cour
2572,0,10443,1,referit,115168,sky,ciel
2573,0,10443,2,referit,119956,any of the hills/cliffs,l'une des collines/falaises


In [5]:
X_tr[0]

array([ 0.        , 63.        ,  1.        , ...,  0.7905787 ,
        1.33333333,  0.12166667])

In [10]:
# Get activations from EN classifier for features
# - load classifier
with gzip.open(model_path + 'wac_EN_3.pklz', 'r') as rf:
    en_wacs = pickle.load(rf)

en_wacs

{'at': {'npos': 1279,
  'n': 21279,
  'clsf': Pipeline(steps=[('scaler', StandardScaler()),
                  ('clf', LogisticRegression(max_iter=500, warm_start=True))])},
 'very': {'npos': 1011,
  'n': 21011,
  'clsf': Pipeline(steps=[('scaler', StandardScaler()),
                  ('clf', LogisticRegression(max_iter=500, warm_start=True))])},
 'top': {'npos': 5794,
  'n': 25794,
  'clsf': Pipeline(steps=[('scaler', StandardScaler()),
                  ('clf', LogisticRegression(max_iter=500, warm_start=True))])},
 'guy': {'npos': 2032,
  'n': 22032,
  'clsf': Pipeline(steps=[('scaler', StandardScaler()),
                  ('clf', LogisticRegression(max_iter=500, warm_start=True))])},
 'in': {'npos': 7220,
  'n': 27220,
  'clsf': Pipeline(steps=[('scaler', StandardScaler()),
                  ('clf', LogisticRegression(max_iter=500, warm_start=True))])},
 'the': {'npos': 12231,
  'n': 32231,
  'clsf': Pipeline(steps=[('scaler', StandardScaler()),
                  ('clf', LogisticReg

In [12]:
response_vectors = []
for word in 'the brown hat'.split():
    response_vector = np.array(en_wacs[word]['clsf'].predict_proba(X=X_tst)[:,1])
response_vector

NameError: name 'X_tst' is not defined

In [None]:
# ======================= Intermediate ==============================
print_timestamped_message('creating intermediate data structures', indent=4)

word2den = create_word2den(refdf_tr)
X_idx = make_X_id_index(X_tr)
mask_matrix = make_mask_matrix(X_tr, X_idx, word2den, word2den.keys())

# ======================= Wordlist ==============================
print_timestamped_message('selecting words to train models for',
                          indent=4)
min_freq = model['wprm']
counts = mask_matrix.sum(axis=1)
wordlist = np.array(list(word2den.keys()))[counts > min_freq]

In [9]:
def train_this_word2(X, word2den, mask_matrix, neg_max,
                    classifier, classf_params, this_word):
    X_this_w, y_this_w = get_X_for_word(X, word2den, mask_matrix, this_word)
    print('.', end='')
    sys.stdout.flush()
    clf = make_pipeline(StandardScaler(),
                        classifier(**classf_params))
    clf.fit(X_this_w, y_this_w)
    return (this_word, int(y_this_w.sum()), len(X_this_w), clf)

In [10]:
# ======================= TRAIN ==============================
print_timestamped_message('and training the %d WACs!' % (len(wordlist)), indent=4)

wacs = Parallel(n_jobs=N_JOBS, require='sharedmem', prefer='threads')\
               (delayed(train_this_word2)(X_t, word2den, mask_matrix, 
                                          model['nneg'],
                                          classifier, classf_params, 
                                          this_word)
                for this_word in wordlist[:10])

print('')  # newline, because train_this_word prints . as progress bar
print_timestamped_message('DONE!')

     [ 2021-08-12 @ 12:04:31 ] and training the 406 WACs!
..........
 [ 2021-08-12 @ 12:05:48 ] DONE!


In [11]:
# Validation
print(model)

for this_word, npos, _, this_clsf in wacs:
    X_tst, y_tst = get_X_for_word(X_t, word2den, mask_matrix, this_word)
    print(this_word, npos, '\t', this_clsf.score(X_tst, y_tst))

{'wrdl': 'min', 'wprm': 40, 'clsf': 'logreg', 'params': {'penalty': 'l2', 'warm_start': True, 'solver': 'lbfgs', 'max_iter': 500}, 'scaled': True, 'nneg': 5, 'nsrc': 'randmax', 'notes': ''}
at 1279 	 0.9309648009774896
very 1011 	 0.9386987768311836
top 5794 	 0.8473676048693495
guy 2032 	 0.9252904865649964
in 7220 	 0.7632255694342396
the 12231 	 0.6369023610809469
middle 3703 	 0.8509049487406657
front 1870 	 0.9120713305898491
upper 995 	 0.9424148606811146
right 13291 	 0.8450331921540356


In [None]:
"""
{'wrdl': 'min', 'wprm': 40, 'clsf': 'logreg', 'penalty': 'L2', 'solver': 'lbfgs', 
'scaled': 'False', 'nneg': 5, 'nsrc': 'randmax', 'notes': ''}
at 1282 	 0.060426651630485856
very 1012 	 0.04825813820673901
top 5798 	 0.2247848670439569
guy 2033 	 0.09394998411473698
in 7223 	 0.2653638467472358
the 12257 	 0.3799795393247977
middle 3704 	 0.15630273371582856
front 1870 	 0.08564243255601281
upper 995 	 0.04743986663491308
right 13293 	 0.3993331931637281

{'wrdl': 'min', 'wprm': 40, 'clsf': logreg, 
'params': {'penalty': 'l2', 'warm_start': True, 'solver': 'lbfgs', 'max_iter': 500}, 
'scaled': False, 'nneg': 'balanced', 'nsrc': 'randmax', 'notes': ''}
at 1282 	 0.5616013532562729
very 1012 	 0.6421568627450981
top 5798 	 0.7731607101325684
guy 2033 	 0.8038850814687061
in 7223 	 0.6648422289975389
the 12257 	 0.5882444120655982
middle 3704 	 0.649510631117111
front 1870 	 0.6480566986739826
upper 995 	 0.6662538699690402
right 13293 	 0.8308653470699546

{'wrdl': 'min', 'wprm': 40, 'clsf': logreg, 
'params': {'penalty': 'l1', 'warm_start': True, 'solver': 'liblinear', 'max_iter': 500}, 
'scaled': False, 'nneg': 'balanced', 'nsrc': 'randmax', 'notes': ''}
at 1282 	 0.5699652288318767
very 1012 	 0.6271654292785075
top 5798 	 0.7765330645786495
guy 2033 	 0.7979394544546816
in 7223 	 0.6607647944752599
the 12257 	 0.5914995194841429
middle 3704 	 0.6650354370570368
front 1870 	 0.6750342935528121
upper 995 	 0.6997856632531555
right 13293 	 0.8350103625386718

{'wrdl': 'min', 'wprm': 40, 'clsf': 'logreg', 
'params': {'penalty': 'l1', 'warm_start': True, 'solver': 'saga', 'max_iter': 500}, 
'scaled': False, 'nneg': 'balanced', 'nsrc': 'randmax', 'notes': 'Took ages to run'}
at 1282 	 0.5761206653510008
very 1012 	 0.6339710641538169
top 5798 	 0.7405225211256686
guy 2033 	 0.8115553941814551
in 7223 	 0.6620504720273298
the 12257 	 0.5857023281768299
middle 3704 	 0.612470469119136
front 1870 	 0.6656149977137632
upper 995 	 0.6769707073112646
right 13293 	 0.8256390232180939

{'wrdl': 'min', 'wprm': 40, 'clsf': 'logreg', 'params': {'penalty': 'l2', 'warm_start': True, 'solver': 'lbfgs', 'max_iter': 500}, 'scaled': True, 'nneg': 'balanced', 'nsrc': 'randmax', 'notes': ''}
at 1282 	 0.9315853773141622
very 1012 	 0.942175899486008
top 5798 	 0.8487479649585239
guy 2033 	 0.9266554713384468
in 7223 	 0.7635087977078205
the 12257 	 0.6357069783302849
middle 3704 	 0.8515862301721229
front 1870 	 0.9093735711019661
upper 995 	 0.9451297928078114
right 13293 	 0.8444718108911783

Test (of 100):
	acc			mrr			acv			rnd
0	0.069192	0.563565	0.052249	0.177456

Max_neg=5, and added 'except' & 'other than' to relwords
{'wrdl': 'min', 'wprm': 40, 'clsf': 'logreg', 'params': {'penalty': 'l2', 'warm_start': True, 'solver': 'lbfgs', 'max_iter': 500}, 'scaled': True, 'nneg': 5, 'nsrc': 'randmax', 'notes': ''}
at 1279 	 0.9309648009774896
very 1011 	 0.9386987768311836
top 5794 	 0.8473676048693495
guy 2032 	 0.9252904865649964
in 7220 	 0.7632255694342396
the 12231 	 0.6369023610809469
middle 3703 	 0.8509049487406657
front 1870 	 0.9120713305898491
upper 995 	 0.9424148606811146
right 13291 	 0.8450331921540356

Test (of 50):
	acc			mrr			acv			rnd
0	0.448876	0.425598	0.001242	0.099596
"""

In [12]:
# Get test data
X_ts = filter_X_by_filelist(X, s_splits['test'])
refdf_test = filter_refdf_by_filelist(saiapr_refdf, s_splits['test'])

In [13]:
sys.path.append('../../ApplyModels')
from apply_model import *

wac_dicts = {}
for this_word, npos, n, this_clsf in wacs:
    wac_dicts[this_word] = {'npos': npos, 'n': n, 'clsf': this_clsf}
    
results = eval_testdf(refdf_test, wac_dicts, X_ts[:50])
results

100%|██████████| 12010/12010 [00:00<00:00, 15422.43it/s]


Unnamed: 0,i_corpus,image_id,region_id,r_corpus,rex_id,refexp,tagged,cov,suc,rnk,nob
0,0,14576,1,referit,2,seal,"[[seal, NN]]",0.0,False,,
1,0,14576,3,referit,81312,rock left of seal,"[[rock, NN], [left, VBD], [of, IN], [seal, NN]]",0.0,False,,
2,0,14576,2,referit,96551,bottom left corner,"[[bottom, NN], [left, VBD], [corner, NN]]",0.0,False,,
3,0,20909,1,referit,49,kid,"[[kid, NN]]",0.0,False,,
4,0,20909,3,referit,35225,dirt ground,"[[dirt, NN], [ground, NN]]",0.0,False,,
...,...,...,...,...,...,...,...,...,...,...,...
12005,0,40508,1,referit,116219,blue shirt right,"[[blue, JJ], [shirt, NN], [right, NN]]",,,,
12006,0,17169,1,referit,116923,outside the moon,"[[outside, IN], [the, DT], [moon, NN]]",,,,
12007,0,30950,1,referit,118084,water,"[[water, NN]]",0.0,False,,
12008,0,32289,1,referit,118143,group of people at bottom,"[[group, NN], [of, IN], [people, NNS], [at, IN...",,,,


In [14]:
summarise_eval(results)

Unnamed: 0,acc,mrr,acv,rnd
0,0.448876,0.425598,0.001242,0.099596


In [15]:
results.loc[results.suc==True]

Unnamed: 0,i_corpus,image_id,region_id,r_corpus,rex_id,refexp,tagged,cov,suc,rnk,nob
1805,0,63,7,referit,54754,any flag but the first one,"[[any, DT], [flag, NN], [but, CC], [the, DT], ...",0.166667,True,1.0,9.0
6066,0,60,5,referit,11634,car with the back open,"[[car, NN], [with, IN], [the, DT], [back, NN],...",0.2,True,1.0,12.0
6068,0,60,2,referit,18069,the building on the right,"[[the, DT], [building, NN], [on, IN], [the, DT...",0.6,True,1.0,12.0
6071,0,60,8,referit,54604,person middle,"[[person, NN], [middle, NN]]",0.5,True,1.0,12.0
6652,0,45,2,referit,14815,the bluish building to the right,"[[the, DT], [bluish, JJ], [building, NN], [to,...",0.5,True,1.0,4.0


In [10]:
word2den_ts = create_word2den(refdf_test)
X_idx_ts = make_X_id_index(X_ts)
mask_matrix_ts = make_mask_matrix(X_ts, X_idx_ts, word2den_ts, word2den_ts.keys())

In [14]:
# Testing
for this_word, npos, _, this_clsf in wacs:
    X_tst, y_tst = get_X_for_word(X_ts, word2den_ts, mask_matrix_ts, this_word, neg_max='balanced')
    print(this_word, npos, '\t', 
          this_clsf.score(X_tst, y_tst))

at 1282 	 0.525
very 1012 	 0.5630252100840336
top 5798 	 0.7288135593220338
guy 2033 	 0.6478658536585366
in 7223 	 0.5981220657276995
the 12257 	 0.5431841831425598
middle 3704 	 0.5838206627680312
front 1870 	 0.5470085470085471
upper 995 	 0.5560344827586207
right 13293 	 0.814773980154355


In [50]:
wordinfo = [e[:-1] for e in wacs]
type(wordinfo[0][0])


numpy.str_

In [34]:
classf_params = {
        'penalty': 'l2',
        'warm_start': True,
        'solver': 'lbfgs',
        'max_iter': 500
}

# Model description:
model = {
        'rcorp': 'referit',        # ref corpus
        'cnn': 'rsn50-max',        # CNN used for vision feats
        'rel':   'excl',           # exclude relational expressions
        'wrdl':  'min',            # wordlist: minimal n occurrences...
        'wprm':  40,               # ... 40 times
        'clsf':  'logreg-l1',      # logistic regression, l1 regularized
        'params': classf_params,
        'scaled': True,
        'nneg':  'balanced',                # maximum neg instances
        'nsrc':  'randmax',        # ... randomly selected
        'notes': ''
}
    


In [51]:
with open('test.json', 'w') as f:
    json.dump((model, wordinfo), f)