# Museums in the Pandemic - Extract indicators

**Authors**: Andrea Ballatore (KCL)

**Abstract**: Extract indicators from museum text.

## Setup
This is to check that your environment is set up correctly (it should print 'env ok', ignore warnings).

In [14]:
# Test geospatial libraries
# check environment
import os
print("Conda env:", os.environ['CONDA_DEFAULT_ENV'])
if os.environ['CONDA_DEFAULT_ENV'] != 'mip_v1':
    raise Exception("Set the environment 'mip_v1' on Anaconda. Current environment: " + os.environ['CONDA_DEFAULT_ENV'])

# spatial libraries 
import pandas as pd
import pickle
import spacy
from termcolor import colored
import sys
import numpy as np
from numpy import arange
#import tensorflow as tf
from bs4 import BeautifulSoup
from bs4.element import Comment
#import torch
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# import from `mip` project
print(os.getcwd())
fpath = os.path.abspath('../')
if not fpath in sys.path:
    sys.path.insert(0, fpath)

out_folder = '../../'

from museums import *
from utils import _is_number
from analytics.text_models import derive_new_attributes_matches, get_all_matches_from_db

print('env ok')

Conda env: mip_v1
/Users/andreaballatore/Dropbox/DRBX_Docs/Work/Projects/github_projects/museums-in-the-pandemic/mip/notebooks
env ok


## Connect to DB

It needs the DCS VPN active to work.

In [3]:
# open connection to DB
from db.db import connect_to_postgresql_db

db_conn = connect_to_postgresql_db()
print("DB connected")

DB connected


## Extract matches for all museums

Using the best deep learning model defined above, find indicators for all museums (from websites and social media).

### Load deep learning validation model

In [5]:
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler

# MODEL COLUMNS
valid_model_columns = ['token_n', 'lemma_n', 'ann_overlap_lemma', 'ann_overlap_token',
       'example_len', 'txt_overlap_lemma', 'txt_overlap_token',
       'ann_overlap_criticwords', 'lemmatoken_n', 'ann_overlap_tokenlemma',
       'txt_overlap_tokenlemma', 'sem_similarity',
       'indicator_code_closed_indef',
       'indicator_code_closed_perm', 'indicator_code_finance_health',
       'indicator_code_funding_did_not_get',
       'indicator_code_funding_fundraise', 'indicator_code_funding_gov_emer',
       'indicator_code_funding_other_emer', 'indicator_code_lang_difficulty',
       'indicator_code_online_engag', 'indicator_code_online_event',
       'indicator_code_online_exhib', 'indicator_code_open_cafe',
       'indicator_code_open_cur', 'indicator_code_open_onlineshop',
       'indicator_code_project_postpone', 'indicator_code_reopen_intent',
       'indicator_code_reopen_plan', 'indicator_code_staff_hiring',
       'indicator_code_staff_working']

def prep_match_data(df):
    df = df[valid_model_columns]
    assert len(df.columns) == 30
    num_df = df.select_dtypes(include=[np.number])
    scaler = MinMaxScaler()
    # fit and transform in one step
    cols = num_df.columns
    x_data = pd.DataFrame(scaler.fit_transform(num_df),columns=cols)
    return x_data

def convert_pred_to_bool(vals):
    pred_y = (vals > 0.5).astype("bool")
    # unpack results
    bool_vals = [item for sublist in pred_y for item in sublist]
    return bool_vals

valid_ann_df_fn = 'matches_valid_ann_df_v2.pik'
valid_ann_df = pd.read_pickle(out_folder+'data/annotations/'+valid_ann_df_fn)

valid_match_cnn_model = load_model(out_folder+"data/analysis/matching_validation/matching_validation_deep_learning_model.h5")
valid_match_cnn_model

x_data = prep_match_data(valid_ann_df)
assert len(x_data.columns) == 30
print(x_data)
pred_valid = convert_pred_to_bool(valid_match_cnn_model.predict(x_data))

valid_ann_df['predicted_valid'] = pred_valid

#valid_ann_df.to_excel(out_folder+"tmp/check_deeplearning.xlsx",index=False)
valid_ann_df.sample(5)


2021-10-18 14:35:56.328865: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-10-18 14:35:56.452092: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


     token_n  lemma_n  ann_overlap_lemma  ann_overlap_token  example_len  \
0      0.000    0.125            0.50000            0.00000     0.000000   
1      0.250    0.250            0.66667            0.66667     0.066667   
2      0.125    0.125            0.16667            0.16667     0.266667   
3      0.250    0.250            0.66667            0.66667     0.066667   
4      0.125    0.125            0.33333            0.33333     0.066667   
..       ...      ...                ...                ...          ...   
695    0.125    0.125            0.33333            0.33333     0.066667   
696    0.125    0.125            0.12500            0.12500     0.400000   
697    0.250    0.250            0.50000            0.50000     0.133333   
698    0.375    0.375            0.60000            0.60000     0.200000   
699    0.250    0.250            1.00000            1.00000     0.000000   

     txt_overlap_lemma  txt_overlap_token  ann_overlap_criticwords  \
0              0.

Unnamed: 0,muse_id,sentence_id,page_tokens,indicator_code,ann_ex_tokens,token_n,lemma_n,ann_overlap_lemma,ann_overlap_token,example_len,...,indicator_code_online_exhib,indicator_code_open_cafe,indicator_code_open_cur,indicator_code_open_onlineshop,indicator_code_project_postpone,indicator_code_reopen_intent,indicator_code_reopen_plan,indicator_code_staff_hiring,indicator_code_staff_working,predicted_valid
648,mm.aim.0254,mus_page263632_sent00015,what to see darwen darwen heritage centre comm...,lang_difficulty,extremely short money given what must done,1,1,0.14286,0.14286,7,...,0,0,0,0,0,0,0,0,0,False
46,mm.domus.SE137,mus_page423785_sent00034,charles burrell museum pleased to announce we ...,closed_cur,we ’re closed,1,1,0.33333,0.33333,3,...,0,0,0,0,0,0,0,0,0,False
351,mm.mgs.281,mus_page20009_sent00029,scotland ’s largest charities glasgow life hug...,funding_fundraise,please support us time donation,2,3,0.6,0.4,5,...,0,0,0,0,0,0,0,0,0,True
447,mm.misc.015,mus_page563788_sent00082,we ’re good,finance_health,we now exceeded original target,1,1,0.2,0.2,5,...,0,0,0,0,0,0,0,0,0,False
119,mm.New.66,mus_page420402_sent00028,follow visitor trail cross historic footbridge...,funding_fundraise,we urgently need to raise,1,1,0.2,0.2,5,...,0,0,0,0,0,0,0,0,0,False


### Load all matches from DB

- Dump all matches from DB, after running `an_text` on ALL museums for a given crawling session.

In [7]:
# DB columns:
""" 
example_id indicator_code lemma_n lemma_n_wdupl token_n token_n_wdupl criticalwords_n criticalwords_n_wdupl sentence_id  sent_len
example_len example_crit_len ann_overlap_lemma ann_overlap_token ann_overlap_criticwords txt_overlap_lemma
txt_overlap_token ann_ex_tokens ann_ex_tokens page_tokens session_id page_id muse_id keep_stopwords
"""

# load from DB - SLOW

sessions = ['20210304','20210404']
for session_id in sessions:
    get_all_matches_from_db(session_id)

get_all_matches_from_db 20210304
query results: (1159487, 16)
(700, 37)
	saved ../../tmp/matches_dump_df_20210304.pik
get_all_matches_from_db 20210404
query results: (860359, 16)
(700, 37)
	saved ../../tmp/matches_dump_df_20210404.pik


### Predict all matches

In [9]:
sessions = ['20210304'] # ,'20210404'


def select_valid_matches(df, model):
    """ use Deep Learning model to validate matches """
    x_data = prep_match_data(df)
    print('select_valid_matches', x_data.shape)
    # check column order
    assert valid_model_columns == x_data.columns.tolist()
    print(x_data.shape)
    # apply model for predictions
    valid_int = model.predict(x_data)
    pred_valid = convert_pred_to_bool(valid_int)
    #print(type(pred_valid),len(pred_valid))
    df['valid_match'] = pred_valid
    print(df.valid_match.value_counts())
    return df


for session_id in sessions:
    print('> session_id',session_id)
    matches_fn = out_folder+'tmp/matches_dump_df_{}.pik'.format(session_id)
    matchdf = pd.read_pickle(matches_fn)
    print("\t", matches_fn, matchdf.shape)
    # apply model to get valid matches
    validmatch_df = select_valid_matches(matchdf, valid_match_cnn_model)
    # save sample to inspect results
    validmatch_df.sample(110).to_csv(out_folder+'tmp/valid_matches_sample_{}.tsv'.format(session_id),sep='\t')
    

> session_id 20210304
	 ../../tmp/matches_dump_df_20210304.pik (1159487, 40)
select_valid_matches (1159487, 30)
(1159487, 30)
False    1057757
True      101730
Name: valid_match, dtype: int64


In [None]:
### extract stable results
681/9319

End of notebook