# Museums in the Pandemic - Extract indicators

**Authors**: Andrea Ballatore (KCL)

**Abstract**: Extract indicators from museum text.

## Setup
This is to check that your environment is set up correctly (it should print 'env ok', ignore warnings).

In [27]:
# Test geospatial libraries
# check environment
import os
print("Conda env:", os.environ['CONDA_DEFAULT_ENV'])
if os.environ['CONDA_DEFAULT_ENV'] != 'mip_v1':
    raise Exception("Set the environment 'mip_v1' on Anaconda. Current environment: " + os.environ['CONDA_DEFAULT_ENV'])

# spatial libraries 
import pandas as pd
import pickle
import spacy
from termcolor import colored
import sys
import numpy as np
from numpy import arange
#import tensorflow as tf
from bs4 import BeautifulSoup
from bs4.element import Comment
#import torch
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# import from `mip` project
print(os.getcwd())
fpath = os.path.abspath('../')
if not fpath in sys.path:
    sys.path.insert(0, fpath)

out_folder = '../../'

from museums import *
from utils import _is_number
from analytics.text_models import derive_new_attributes_matches, get_all_matches_from_db

print('env ok')

Conda env: mip_v1
/Users/andreaballatore/Dropbox/DRBX_Docs/Work/Projects/github_projects/museums-in-the-pandemic/mip/notebooks
env ok


## Connect to DB

It needs the DCS VPN active to work.

In [28]:
# open connection to DB
from db.db import connect_to_postgresql_db

db_conn = connect_to_postgresql_db()
print("DB connected")

DB connected


## Extract matches for all museums

Using the best deep learning model defined above, find indicators for all museums (from websites and social media).

### Load deep learning validation model

In [25]:
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler

# MODEL COLUMNS
cols_fn = out_folder+"data/analysis/matching_validation/matching_validation_deep_learning_model_columns.csv"
valid_model_columns = pd.read_csv(cols_fn).iloc[:, 0].tolist()

def prep_match_data(df):
    df = df[valid_model_columns]
    assert len(df.columns) == 33, len(df.columns)
    num_df = df.select_dtypes(include=[np.number])
    scaler = MinMaxScaler()
    # fit and transform in one step
    cols = num_df.columns
    x_data = pd.DataFrame(scaler.fit_transform(num_df),columns=cols)
    return x_data

def convert_pred_to_bool(vals):
    pred_y = (vals > 0.5).astype("bool")
    # unpack results
    bool_vals = [item for sublist in pred_y for item in sublist]
    return bool_vals

valid_ann_df_fn = 'matches_valid_ann_df_v3.pik'
valid_ann_df = pd.read_pickle(out_folder+'data/annotations/'+valid_ann_df_fn)

valid_match_cnn_model = load_model(out_folder+"data/analysis/matching_validation/matching_validation_deep_learning_model.h5")
valid_match_cnn_model

x_data = prep_match_data(valid_ann_df)
assert len(x_data.columns) == 33, len(x_data.columns)
print(x_data)
pred_valid = convert_pred_to_bool(valid_match_cnn_model.predict(x_data))

valid_ann_df['predicted_valid'] = pred_valid

#valid_ann_df.to_excel(out_folder+"tmp/check_deeplearning.xlsx",index=False)
valid_ann_df.sample(10)


     sem_similarity   token_n   lemma_n  ann_overlap_lemma  ann_overlap_token  \
0          0.659748  0.285714  0.166667           1.000000            1.00000   
1          0.703680  0.428571  0.333333           0.733333            0.75000   
2          0.759582  0.000000  0.000000           0.146667            0.00000   
3          0.726219  0.571429  0.500000           0.644448            0.66667   
4          0.667261  0.000000  0.000000           0.200000            0.00000   
..              ...       ...       ...                ...                ...   
695        0.830129  0.428571  0.333333           0.573333            0.60000   
696        0.712849  0.142857  0.166667           1.000000            0.50000   
697        0.398828  0.000000  0.000000           0.200000            0.00000   
698        0.647905  0.000000  0.000000           0.200000            0.00000   
699        0.661403  0.142857  0.000000           0.288885            0.33333   

     example_len  txt_overl

Unnamed: 0,muse_id,page_id,sentence_id,example_id,indicator_code,session_id,ann_ex_tokens,page_tokens,sem_similarity,token_n,...,indicator_code_project_postpone,indicator_code_reopen_intent,indicator_code_reopen_plan,indicator_code_staff_hiring,indicator_code_staff_restruct,indicator_code_staff_working,overlap_bin,valid_match,valid_match_b,predicted_valid
171,mm.domus.SC255,762062,mus_page762062_sent00030,ann_ex_00160,lang_difficulty,20210304,we move back what hope will stable new normal ...,please keep checking back updates once again t...,0.8752,2,...,0,0,0,0,0,0,"(0.0, 0.45]",T,True,False
169,mm.musa.392,116010,mus_page116010_sent00162,ann_ex_00036,closed_cur,20210304,line government restrictions remain closed now,please note interiors currently closed line go...,0.8649,3,...,0,0,0,0,0,0,"(0.45, 1.01]",T,True,True
673,mm.wiki.076,540396,mus_page540396_sent00020,ann_ex_00237,reopen_intent,20210304,we look forward seeing back museum,we look forward seeing you,0.9385,4,...,0,1,0,0,0,0,"(0.45, 1.01]",T,True,True
646,mm.domus.SC127,19970,mus_page19970_sent00008,ann_ex_00191,online_exhib,20210304,online gallery,museums explore our museums exhibitions online...,0.6228,1,...,0,0,0,0,0,0,"(0.45, 1.01]",T,True,False
520,mm.ace.1089,53136,mus_page53136_sent00133,ann_ex_00246,reopen_intent,20210304,look forward welcoming you again soon,place runs gift aid entry we 'll offer you cle...,0.7377,1,...,0,1,0,0,0,0,"(0.0, 0.45]",F,False,False
626,mm.domus.SE166,624306,mus_page624306_sent00015,ann_ex_00229,reopen_intent,20210304,when we open season,when we able to re open again we shall update ...,0.8202,3,...,0,1,0,0,0,0,"(0.45, 1.01]",T,True,True
664,mm.domus.NE048,811090,mus_page811090_sent00016,ann_ex_00204,open_onlineshop,20210304,our online shop remains open,our online shop now open,0.9688,4,...,0,0,0,0,0,0,"(0.45, 1.01]",T,True,True
111,mm.domus.SC194,784891,mus_page784891_sent00067,ann_ex_00189,online_exhib,20210304,online exhibition,what museum take tour exhibitions,0.7138,0,...,0,0,0,0,0,0,"(0.45, 1.01]",F,False,False
299,mm.domus.SE521,118643,mus_page118643_sent00073,ann_ex_00149,lang_difficulty,20210304,critical time,fenton house garden opening times,0.4696,0,...,0,0,0,0,0,0,"(0.45, 1.01]",F,False,False
695,mm.hha.121,760091,mus_page760091_sent00131,ann_ex_00002,closed_cur,20210304,closed members public further notice,house remains closed further notice,0.8577,3,...,0,0,0,0,0,0,"(0.45, 1.01]",T,True,True


### Load all matches from DB

- Dump all matches from DB, after running `an_text` on ALL museums for a given crawling session.

In [None]:
# DB columns:
""" 
example_id indicator_code lemma_n lemma_n_wdupl token_n token_n_wdupl criticalwords_n criticalwords_n_wdupl sentence_id  sent_len
example_len example_crit_len ann_overlap_lemma ann_overlap_token ann_overlap_criticwords txt_overlap_lemma
txt_overlap_token ann_ex_tokens ann_ex_tokens page_tokens session_id page_id muse_id keep_stopwords
"""

# load from DB - SLOW
sessions = ['20210304','20210404']
for session_id in sessions:
    get_all_matches_from_db(session_id, db_conn, out_folder)

get_all_matches_from_db 20210304
query results: (1136440, 17)
	saved ../../tmp/matches_dump_df_20210304.pik
get_all_matches_from_db 20210404


### Predict all matches

In [9]:
sessions = ['20210304'] # ,'20210404'

def select_valid_matches(df, model):
    """ use Deep Learning model to validate matches """
    x_data = prep_match_data(df)
    print('select_valid_matches', x_data.shape)
    # check column order
    assert valid_model_columns == x_data.columns.tolist()
    print(x_data.shape)
    # apply model for predictions
    valid_int = model.predict(x_data)
    pred_valid = convert_pred_to_bool(valid_int)
    #print(type(pred_valid),len(pred_valid))
    df['valid_match'] = pred_valid
    print(df.valid_match.value_counts())
    return df


for session_id in sessions:
    print('> session_id',session_id)
    matches_fn = out_folder+'tmp/matches_dump_df_{}.pik'.format(session_id)
    matchdf = pd.read_pickle(matches_fn)
    print("\t", matches_fn, matchdf.shape)
    # apply model to get valid matches
    validmatch_df = select_valid_matches(matchdf, valid_match_cnn_model)
    # save sample to inspect results
    validmatch_df.sample(110).to_csv(out_folder+'tmp/valid_matches_sample_{}.tsv'.format(session_id),sep='\t')
    

> session_id 20210304
	 ../../tmp/matches_dump_df_20210304.pik (1159487, 40)
select_valid_matches (1159487, 30)
(1159487, 30)
False    1057757
True      101730
Name: valid_match, dtype: int64


In [None]:
### extract stable results
681/9319

End of notebook