# Museums in the Pandemic - Social media analytics

**Authors**: Andrea Ballatore (KCL)

**Abstract**: Analysis of social media data

## Setup
This is to check that your environment is set up correctly (it should print 'env ok', ignore warnings).

In [1]:
# Test geospatial libraries
# check environment
import os
print("Conda env:", os.environ['CONDA_DEFAULT_ENV'])
if os.environ['CONDA_DEFAULT_ENV'] != 'mip_v1':
    raise Exception("Set the environment 'mip_v1' on Anaconda. Current environment: " + os.environ['CONDA_DEFAULT_ENV'])

# spatial libraries 
import pandas as pd
import pickle
import spacy
import itertools
import re
from termcolor import colored
import matplotlib.dates as mdates
import sys
from matplotlib.colors import ListedColormap
import numpy as np
import calplot
from numpy import arange
#import tensorflow as tf
from bs4 import BeautifulSoup
from bs4.element import Comment
#import torch
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

# import from `mip` project
print(os.getcwd())
fpath = os.path.abspath('../')
if not fpath in sys.path:
    sys.path.insert(0, fpath)

out_folder = '../../'

from museums import *
from utils import _is_number
from analytics.text_models import derive_new_attributes_matches, get_all_matches_from_db, get_indicator_annotations
from museums import get_museums_w_web_urls, get_twitter_facebook_links_v2, get_extra_museum_attributes

cur_folder = out_folder + 'data/analysis/social_media_analytics/'
if not os.path.exists(cur_folder):
    os.makedirs(cur_folder)
    
print('env ok')

Conda env: mip_v1
/Users/andreaballatore/Dropbox/DRBX_Docs/Work/Projects/github_projects/museums-in-the-pandemic/mip/notebooks_py
env ok


## Connect to DB

It needs the DCS VPN active to work.

In [15]:
# open connection to DB
from db.db import connect_to_postgresql_db

db_conn = connect_to_postgresql_db()
print("DB connected")

DB connected


## Load museum info


In [3]:
# load museums
df = get_museums_w_web_urls(out_folder)
print("museums url N:",len(df))
attr_df = load_input_museums_wattributes(out_folder)
df = pd.merge(df, attr_df, on='muse_id', how='left')
print("museum df with attributes: len", len(df))
mus_df = df.rename(columns={'muse_id':'museum_id','musname_x':'musname'})
del df
print(len(mus_df), mus_df.columns)

links_df = get_twitter_facebook_links_v2(out_folder)[['museum_id', 'twitter_id', 'facebook_pages']]
mus_df = mus_df.merge(links_df, on='museum_id', how='left')
mus_df = get_extra_museum_attributes(mus_df)
del links_df
#print(len(mus_df), mus_df.columns)
mus_df.sample(50)

museums urls: ../../data/museums/museum_websites_urls-v3.tsv
nationaltrust.org.uk       179
english-heritage.org.uk     52
no_resource.                33
visitscotland.com           24
nts.org.uk                  21
                          ... 
glynnvivian.co.uk            1
glynde.co.uk                 1
gwsr.com                     1
gloucesterquays.co.uk        1
smithsonfarm.co.uk           1
Name: domain, Length: 2441, dtype: int64
get_museums_w_web_urls Museums=3344 URLs=3344
museums url N: 3344
Index(['musname', 'muse_id', 'town', 'postcode', 'accreditation', 'governance',
       'size', 'subject_matter', 'closing_date', 'provenance',
       'deprivation_index', 'geodemographic_group', 'geodemographic_subgroup',
       'admin_area'],
      dtype='object')
loaded museums w attributes (open): 3341 data/museums/museums_wattributes-2020-02-23.tsv
museum df with attributes: len 3344
3344 Index(['museum_id', 'musname', 'town_x', 'url', 'url_source', 'Unnamed: 5',
       'domain', 'm

  df['region'] = df['region'].str.replace('\(English Region\)','').str.strip()


Unnamed: 0,museum_id,musname,town_x,url,url_source,Unnamed: 5,domain,musname_y,town_y,postcode,...,deprivation_index,geodemographic_group,geodemographic_subgroup,admin_area,twitter_id,facebook_pages,governance_simpl,subject_matter_simpl,country,region
1756,mm.domus.WA004,Monmouth Museum,Monmouth,https://www.visitmonmouthshire.com/Monmouth-Ne...,pred,,visitmonmouthshire.com,Monmouth Museum,Monmouth,NP25 3XA,...,9.0,English and Welsh Countryside,Sparse English and Welsh Countryside,/Wales/Monmouthshire (Welsh UA),"['twitter.com/visit_mon', 'twitter.com/monmout...",['www.facebook.com/visitmonmouthshire'],government,Personality,Wales,Wales
3233,mm.domus.YH092,Withernsea Lighthouse Museum,Withernsea,https://www.visiteastyorkshire.co.uk/things-to...,pred,,visiteastyorkshire.co.uk,Withernsea Lighthouse Museum,Withernsea,HU19 2DY,...,5.0,English and Welsh Countryside,Sparse English and Welsh Countryside,/England/Yorkshire and The Humber (English Reg...,"['twitter.com/visiteastyorks', 'twitter.com/ws...","['www.facebook.com/withernsealighthouse', 'www...",independent,Sea and seafaring,England,Yorkshire and The Humber
1786,mm.domus.EM107,Mrs Smiths Cottage,Navenby,https://www.mrssmithscottage.com/,pred,,mrssmithscottage.com,Mrs Smiths Cottage,Navenby,LN5 0EP,...,9.0,English and Welsh Countryside,Sparse English and Welsh Countryside,/England/East Midlands (English Region)/Lincol...,twitter.com/mrssmithcottage,['www.facebook.com/heartoflincs'],government,Local Histories,England,East Midlands
1979,mm.domus.WA053,Newport Museum And Art Gallery,Newport,http://www.newport.gov.uk/heritage/en/Museum-A...,pred,,newport.gov.uk,Newport Museum And Art Gallery,Newport,NP20 1PA,...,1.0,Manufacturing Traits,Industrial and Multi-ethnic,/Wales/Newport (Welsh UA),twitter.com/newportmuseum,en-gb.facebook.com/pages/category/Nonprofit-Or...,government,Local Histories,Wales,Wales
2771,mm.domus.SE506,The Estorick Collection Of Modern Italian Art,London,https://www.estorickcollection.com/,pred,,estorickcollection.com,The Estorick Collection Of Modern Italian Art,London,N1 2AN,...,5.0,London Cosmopolitan,London Cosmopolitan,/England/London (English Region)/Islington (Lo...,twitter.com/estorick,"['www.facebook.com/estorickcollection', 'en-gb...",independent,Arts,England,London
1503,mm.domus.SC155,Largs Museum,Largs,http://www.largsonline.co.uk/museum.html,pred,,largsonline.co.uk,Largs Museum,Largs,KA30 8AW,...,2.0,Scottish Industrial Heritage,Scottish Industrial Legacy,/Scotland/North Ayrshire (Scottish Council Area),twitter.com/largsmuseum,no_resource,independent,Local Histories,Scotland,Scotland
2389,mm.domus.SW129,Salcombe Maritime Museum,Salcombe,http://www.salcombemuseum.org.uk/,pred,,salcombemuseum.org.uk,Salcombe Maritime Museum,Salcombe,TQ8 8DQ,...,9.0,English and Welsh Countryside,Older Farming Communities,/England/South West (English Region)/Devon (En...,no_resource,"['www.facebook.com/salcombemaritimemuseum', 'e...",independent,Sea and seafaring,England,South West
2817,mm.ace.1198,The Lightbox,Woking,https://www.thelightbox.org.uk/,pred,,thelightbox.org.uk,The Lightbox,Woking,GU21 4AA,...,9.0,Rural-Urban Fringe,Rural-Urban Fringe,/England/South East (English Region)/Surrey (E...,twitter.com/thelightbox,"['www.facebook.com/thelightboxwoking', 'en-gb....",independent,Local Histories,England,South East
1098,mm.domus.SE295,Gosport Discovery Centre,Gosport,https://www.hants.gov.uk/librariesandarchives/...,pred,,hants.gov.uk,Gosport Discovery Centre,Gosport,PO12 1NQ,...,2.0,Services Manufacturing and Mining Legacy,Affluent rural,/England/South East (English Region)/Hampshire...,twitter.com/gosportdc,"['www.facebook.com/gosportdiscoverycentre', 'e...",government,Local Histories,England,South East
1863,mm.domus.NW005,Museum Of The Manchester Regiment,Ashton-under-Lyne,https://www.tameside.gov.uk/LibrariesandLeisur...,pred,,tameside.gov.uk,Museum Of The Manchester Regiment,Ashton-under-Lyne,OL6 6DL,...,1.0,Manufacturing Traits,Urban Living,/England/North West (English Region)/Greater M...,no_resource,no_resource,independent,War and conflict,England,North West


## Temporal analysis of Twitter/Facebook

Temporal analysis based on message counts.

### Manual corrections of top decile

The largest museums tend to have less accurate accounts, so we fixed them manually. The rest is ok (see validation results below). `msg_count_by_museum_manual_validation-v2.xlsx` was annotated by Mark L and Jamie L.

Examples of corrections:
- mm.misc.139: drop twitter.com/atlasobscura, drop www.facebook.com/atlasobscura
- Twitter: mm.domus.YH123 update https://twitter.com/WFMuseums (drop 'twitter.com/mywakefield', 'twitter.com/pontecastle')
- Facebook: mm.domus.SC121 update http://www.facebook.com/guernseymuseums (drop ['www.facebook.com/grantownmuseum', 'en-gb.facebook.com/grantownmuseum/posts'])

In [10]:
corrections_df = pd.read_excel(out_folder+'data/annotations/msg_count_by_museum_manual_validation-v2.xlsx')
corrections_df

# drop_twitter	drop_facebook	correct_twitter	correct_facebook	drop_website	correct_website

corrections_df['b_valid_twitter'] = corrections_df.drop_twitter.isnull()
corrections_df['b_valid_facebook'] = corrections_df.drop_facebook.isnull()
corrections_df['b_new_twitter'] = ~corrections_df.correct_twitter.isnull()
corrections_df['b_new_facebook'] = ~corrections_df.correct_facebook.isnull()

corrections_df = corrections_df[['museum_id','drop_twitter','b_valid_twitter','drop_facebook','b_valid_facebook','correct_twitter','correct_facebook','b_new_twitter','b_new_facebook','correct_website']]
print(corrections_df.b_valid_twitter.value_counts())
print(corrections_df.b_new_twitter.value_counts())
print(corrections_df.b_valid_facebook.value_counts())
print(corrections_df.b_new_facebook.value_counts())

corrections_df['facebook_action'] = 'keep'
corrections_df.loc[(~corrections_df.b_valid_facebook & ~corrections_df.b_new_facebook) , 'facebook_action'] = 'drop'
corrections_df.loc[(corrections_df.b_new_facebook), 'facebook_action'] = 'update'
print(corrections_df.facebook_action.value_counts())

corrections_df['twitter_action'] = 'keep'
corrections_df.loc[(~corrections_df.b_valid_twitter & ~corrections_df.b_new_twitter) , 'twitter_action'] = 'drop'
corrections_df.loc[(corrections_df.b_new_twitter), 'twitter_action'] = 'update'
print(corrections_df.twitter_action.value_counts())

corrections_df[['museum_id','twitter_action','facebook_action','correct_twitter','correct_facebook']].to_csv(
    out_folder+'data/museums/social_media_url_corrections.tsv', sep = '\t', index=False)


True     3201
False     143
Name: b_valid_twitter, dtype: int64
False    3313
True       31
Name: b_new_twitter, dtype: int64
True     3206
False     138
Name: b_valid_facebook, dtype: int64
False    3304
True       40
Name: b_new_facebook, dtype: int64
keep      3169
drop       135
update      40
Name: facebook_action, dtype: int64
keep      3172
drop       141
update      31
Name: twitter_action, dtype: int64


In [11]:
# apply corrections
corrections_df = pd.read_csv(out_folder+'data/museums/social_media_url_corrections.tsv', sep = '\t')
print(corrections_df.columns, len(corrections_df))
assert corrections_df.museum_id.is_unique

mus_df = get_twitter_facebook_links(out_folder)[['museum_id', 'twitter_id', 'facebook_pages']]
print(mus_df.columns)
corrected_mus_df = mus_df.copy().merge(corrections_df, on='museum_id')
print(corrected_mus_df.columns)

# fix twitter
ids = corrections_df.loc[corrections_df.twitter_action == 'drop', 'museum_id']
print(len(ids))
corrected_mus_df.loc[corrected_mus_df.museum_id.isin(ids), 'twitter_id_old'] = corrected_mus_df['twitter_id']
corrected_mus_df.loc[corrected_mus_df.museum_id.isin(ids), 'twitter_id'] = 'no_resource'
ids = corrections_df.loc[corrections_df.twitter_action == 'update', 'museum_id']
corrected_mus_df.loc[corrected_mus_df.twitter_action=='update', 'twitter_id_old'] = corrected_mus_df['twitter_id']
corrected_mus_df.loc[corrected_mus_df.twitter_action=='update', 'twitter_id'] = corrected_mus_df['correct_twitter']

# fix facebook
ids = corrections_df.loc[corrections_df.facebook_action == 'drop', 'museum_id']
print(len(ids))
corrected_mus_df.loc[corrected_mus_df.museum_id.isin(ids), 'facebook_pages_old'] = corrected_mus_df['facebook_pages']
corrected_mus_df.loc[corrected_mus_df.museum_id.isin(ids), 'facebook_pages'] = 'no_resource'
ids = corrections_df.loc[corrections_df.facebook_action == 'update', 'museum_id']
corrected_mus_df.loc[corrected_mus_df.facebook_action=='update', 'facebook_pages_old'] = corrected_mus_df['facebook_pages']
corrected_mus_df.loc[corrected_mus_df.facebook_action=='update', 'facebook_pages'] = corrected_mus_df['correct_facebook']

print('> twitter')
print(mus_df.twitter_id.value_counts())
print(corrected_mus_df.twitter_id.value_counts())
print('> facebook')
print(mus_df.facebook_pages.value_counts())
print(corrected_mus_df.facebook_pages.value_counts())

corrected_mus_df.to_csv(out_folder+'data/museums/social_media_urls_corrected.tsv', sep='\t', index=False)


Index(['museum_id', 'twitter_action', 'facebook_action', 'correct_twitter',
       'correct_facebook'],
      dtype='object') 3344
loaded museums: 3344 ../../data/museums/museum_names_and_postcodes-2020-01-26.tsv
get_twitter_facebook_links N = 3344
Index(['museum_id', 'twitter_id', 'facebook_pages'], dtype='object')
Index(['museum_id', 'twitter_id', 'facebook_pages', 'twitter_action',
       'facebook_action', 'correct_twitter', 'correct_facebook'],
      dtype='object')
141
135
> twitter
no_resource                    993
twitter.com/lancastermuseum      4
twitter.com/riponmuseums         3
twitter.com/chelmsmuseum         3
twitter.com/museumoflondon       3
                              ... 
twitter.com/gowerheritage        1
twitter.com/gosportdc            1
twitter.com/gosfordhouse         1
twitter.com/grussellmuseum       1
twitter.com/zetlandmuseum        1
Name: twitter_id, Length: 2270, dtype: int64
no_resource                       1118
twitter.com/lancastermuseum          

### Download Twitter/Facebook data from DB (slow)

Extracting only message counts, not content.

In [13]:
# file paths
tw_fn = out_folder+'tmp/tweets.pik'
fb_fn = out_folder+'tmp/facebook_posts.pik'

In [16]:
if True:
    max_time = '2021-10-01'
    
    sql = "select muse_id as museum_id, account, tw_ts as ts from twitter.tweets_dump td where tw_ts <= date('{}');".format(max_time) # limit 10000
    df = pd.read_sql(sql, db_conn)
    df['ts'] = pd.to_datetime(df['ts'])
    print(df.columns)
    df = df.set_index('ts')
    df.to_pickle(tw_fn)
    print(len(df))
    tw_fn

    sql = "select museum_id, post_ts as ts from facebook.facebook_posts_dump where post_ts <= date('{}');".format(max_time) # limit 10000
    df = pd.read_sql(sql, db_conn)
    df['ts'] = pd.to_datetime(df['ts'])
    print(df.columns)
    df = df.set_index('ts')
    df.to_pickle(fb_fn)
    print(len(df))
    fb_fn

Index(['museum_id', 'account', 'ts'], dtype='object')
5481882
Index(['museum_id', 'ts'], dtype='object')
1187529


### Activity by museum

In [17]:
def messages_by_museum(df, label):
    print('messages_by_museum', label, len(df))
    mdf = df.groupby('museum_id').size()
    field = 'msg_count_'+label
    mdf = mdf.reset_index(name=field)
    mdf[field+'_decile'] = pd.qcut(-mdf[field], 10, labels=False)+1
    mdf[field+'_z'] = round((mdf[field] - mdf[field].mean())/mdf[field].std(),3)
    print(mdf.columns)
    return mdf
    
# generate tweet plots
df = pd.read_pickle(tw_fn)
act_df = messages_by_museum(df, 'twitter')

df = pd.read_pickle(fb_fn)
act_df2 = messages_by_museum(df, 'facebook')
act_df = act_df.merge(act_df2, on='museum_id', how='outer')

# merge with all museums
act_df = act_df.merge(mus_df, on='museum_id', how='outer')
act_df['msg_count_facebook'] = act_df['msg_count_facebook'].fillna(0)
act_df['msg_count_twitter'] = act_df['msg_count_twitter'].fillna(0)

fout = cur_folder + 'msg_count_by_museum.xlsx'
act_df.to_excel(fout, index=False)
fout

messages_by_museum twitter 5481882
Index(['museum_id', 'msg_count_twitter', 'msg_count_twitter_decile',
       'msg_count_twitter_z'],
      dtype='object')
messages_by_museum facebook 1187529
Index(['museum_id', 'msg_count_facebook', 'msg_count_facebook_decile',
       'msg_count_facebook_z'],
      dtype='object')


'../../data/analysis/social_media_analytics/msg_count_by_museum.xlsx'

#### Sample for Tw/Fb validation

Validation carried out on 2021 Nov 24.
Stratified by decile to validate the low deciles.

In [15]:
# msg_count_twitter_decile | msg_count_facebook_decile
dfs = []
for dec_var in ['msg_count_twitter_decile','msg_count_facebook_decile']:
    for dec, subdf in act_df.groupby(dec_var):
        if dec < 3: continue
        print(dec, len(subdf))
        smpl_df = subdf.sample(10, random_state=10)
        smpl_df['sample'] = "{}_sample_{}".format(dec_var,dec)
        dfs.append(smpl_df)
decile_sample_df = pd.concat(dfs)
print(decile_sample_df.columns)
del dfs, subdf

print("N =",len(decile_sample_df))
decile_sample_df = decile_sample_df[['museum_id','musname','town_x','url','twitter_id','facebook_pages',
                  'msg_count_twitter','msg_count_facebook','sample']]
decile_sample_df['correct'] = ''
decile_sample_df.to_excel(out_folder+'tmp/museum_social_media_sample_by_decile.xlsx',index=False)

3.0 222
4.0 222
5.0 222
6.0 221
7.0 225
8.0 221
9.0 226
10.0 216
3.0 121
4.0 119
5.0 121
6.0 122
7.0 120
8.0 120
9.0 121
10.0 121
Index(['museum_id', 'msg_count_twitter', 'msg_count_twitter_decile',
       'msg_count_twitter_z', 'msg_count_facebook',
       'msg_count_facebook_decile', 'msg_count_facebook_z', 'musname',
       'town_x', 'url', 'url_source', 'Unnamed: 5', 'domain', 'musname_y',
       'town_y', 'postcode', 'accreditation', 'governance', 'size',
       'subject_matter', 'closing_date', 'provenance', 'deprivation_index',
       'geodemographic_group', 'geodemographic_subgroup', 'admin_area',
       'twitter_id', 'facebook_pages', 'governance_simpl',
       'subject_matter_simpl', 'country', 'region', 'sample'],
      dtype='object')
N = 160


#### Validation results
Annotated manually by Mark L and Jamie L.

In [16]:
sample_df = pd.read_excel(out_folder+'data/annotations/museum_social_media_sample_by_decile-v2.xlsx')
print(sample_df.correct.value_counts())
sample_df['b_valid'] = ~sample_df.correct.isin(['N','M'])
sample_df.sample(10)

N    13
M     6
Name: correct, dtype: int64


Unnamed: 0,museum_id,musname,town_x,url,twitter_id,facebook_pages,msg_count_twitter,msg_count_facebook,sample,correct,b_valid
144,mm.domus.SW219,Montacute House,Montacute,https://www.nationaltrust.org.uk/montacute-house,['twitter.com/montacutent'],"['www.facebook.com/montacutent', 'en-gb.facebo...",0,222,msg_count_facebook_decile_sample_9.0,,True
43,mm.hha.144,Hoghton Tower,Preston,https://www.hoghtontower.co.uk/,twitter.com/hoghtontower,['www.facebook.com/pages/hoghton-tower/1366739...,354,629,msg_count_twitter_decile_sample_7.0,,True
90,mm.domus.SE504,St Neots Museum,Huntingdon,https://www.stneotsmuseum.org.uk/,twitter.com/stneotsmuseum,"['facebook.com/stneotsmuseum', 'en-gb.facebook...",2886,936,msg_count_facebook_decile_sample_4.0,,True
41,mm.hha.036,Fursdon,Cadbury,https://www.fursdon.co.uk/,twitter.com/fursdondevon,"['www.facebook.com/fursdondevon', 'en-gb.faceb...",387,0,msg_count_twitter_decile_sample_7.0,,True
152,mm.domus.SE390,"Christ Church Picture Gallery, University Of O...",Oxford,https://www.chch.ox.ac.uk/gallery,"['twitter.com/chch_oxford', 'twitter.com/chchg...",['www.facebook.com/chchoxford'],1698,33,msg_count_facebook_decile_sample_10.0,N,False
21,mm.domus.NW188,British Commercial Vehicle Museum,Preston,https://www.britishcommercialvehiclemuseum.com/,twitter.com/bcvmleyland,['www.facebook.com/bcvmt'],762,0,msg_count_twitter_decile_sample_5.0,,True
54,mm.domus.SC201,Urras Eachdraidh Sgire Bhearnaraidh,Isle of Lewis,https://www.totalgiving.co.uk/charity-director...,['twitter.com/total_giving'],['facebook.com/totalgiving'],149,0,msg_count_twitter_decile_sample_8.0,N,False
17,mm.domus.SE538,Blickling Hall,Norwich,https://www.nationaltrust.org.uk/features/the-...,twitter.com/blicklingnt,en-gb.facebook.com/BlicklingEstateNT,1585,652,msg_count_twitter_decile_sample_4.0,,True
76,mm.domus.EM098,The Princess Royal Class Locomotive Trust,Ripley,http://www.prclt.co.uk/,twitter.com/prclt6233,"['www.facebook.com/groups/105180429515813', 'w...",14,1084,msg_count_twitter_decile_sample_10.0,,True
113,mm.domus.SW036,Dean Forest Railway Museum,Lydney,https://deanforestrailway.co.uk/museum/,twitter.com/deanforrailway,"['www.facebook.com/deanforestrailway', 'en-gb....",334,587,msg_count_facebook_decile_sample_6.0,,True


In [26]:
decile_accuracy_dfs = []
for smpl, subdf in sample_df.groupby('sample'):
    #print(smpl)
    accuracy = 1-(len(subdf[~subdf.b_valid])/len(subdf))
    accuracy_m = len(subdf[subdf.correct!='N'])/len(subdf)
    decile_accuracy_dfs.append({'sample':smpl, 'accuracy':accuracy, 'accuracy_m':accuracy_m})
    
decile_accuracy_df = pd.DataFrame(decile_accuracy_dfs)
display(decile_accuracy_df)
decile_accuracy_df.to_excel(out_folder+'data/annotations/museum_social_media_sample_by_decile-results-v2.xlsx', index=False)

Unnamed: 0,sample,accuracy,accuracy_m
0,msg_count_facebook_decile_sample_10.0,0.8,0.9
1,msg_count_facebook_decile_sample_3.0,0.9,0.9
2,msg_count_facebook_decile_sample_4.0,0.8,1.0
3,msg_count_facebook_decile_sample_5.0,0.9,0.9
4,msg_count_facebook_decile_sample_6.0,0.7,0.8
5,msg_count_facebook_decile_sample_7.0,0.9,1.0
6,msg_count_facebook_decile_sample_8.0,0.8,0.8
7,msg_count_facebook_decile_sample_9.0,0.9,0.9
8,msg_count_twitter_decile_sample_10.0,1.0,1.0
9,msg_count_twitter_decile_sample_3.0,0.8,0.8


#### Total stats

In [18]:
# activity stats
for plat in ['twitter','facebook']:
    df = act_df.copy()
    df['msg_count_'+plat] = df['msg_count_'+plat] + 1
    ax = sns.histplot(data=df, x="msg_count_"+plat, kde=True, log_scale=True)
    ax.set_title('Museums on '+plat)
    ax.set_xlabel('Number of messages (total)')
    ax.set_ylabel('N of museums')
    fout = cur_folder+'museum_activity_hist_{}.pdf'.format(plat)
    plt.savefig(fout)
    plt.clf()
    print(fout)
    del df

../../data/analysis/social_media_analytics/museum_activity_hist_twitter.pdf
../../data/analysis/social_media_analytics/museum_activity_hist_facebook.pdf


<Figure size 432x288 with 0 Axes>

#### Calc and viz group stats

In [25]:
# governance / size stats
# attr

for x in [cur_folder+'pivot_tables/']:
    print(x)
    if not os.path.exists(x):
        os.makedirs(x)

def q25(x): return x.quantile(0.25)

def q75(x): return x.quantile(0.75)

def active_mus_n(x): return len(x[x>0])

def active_mus_pc(x):
    # dummy value
    return -1

def active_mus_pc_z(x):
    # dummy value
    return -1

def msg_per_mus(x):
    # dummy value
    return -1

def make_multilevel_df_flat(df):
    vdf = df.reset_index(level=[0,1])
    assert len(vdf.columns)==3
    # from long form to narrow form
    mdf = vdf.pivot(index=vdf.columns[0], columns=vdf.columns[1], values=vdf.columns[2])
    return mdf

def plot_bivar_heatmap(df, label, var, out_fold):
    print('plot_bivar_heatmap:', label, var)
    
    df = df.copy()    
    df.columns = ["_".join(a) for a in df.columns.to_flat_index()]
    cmap = "YlGnBu"
    #cmap = "YlOrBr"
    if 'pc_z' in var:
        cmap = "vlag_r"
    if '_twitter_count' in var or '_facebook_count' in var:
        cmap = "OrRd"
    assert var in df.columns, "{} not in {}".format(var, df.columns)
    df = df[var]
    df = make_multilevel_df_flat(df)
    
    # tune font sizes
    font_scale = .8
    font_sz = 8
    n_cells = len(df.columns) * len(df)
    if n_cells > 25:
        # small plots
        font_scale = .5
        font_sz = 3
    if n_cells > 50:
        # small plots
        font_scale = .3
        font_sz = 2
    sns.set(font_scale=font_scale)
    
    # plot missing values
    sns.heatmap(
        np.where(df.isna(), 0, np.nan),
        cbar=False,
        annot=np.full_like(df, "NA", dtype=object),
        fmt="", 
        annot_kws={"size": font_sz, "va": "center_baseline", "color": "black"},
        cmap=ListedColormap(['whitesmoke']),
        linewidth=0)
    
    # plot heatmap
    ax = sns.heatmap(df, annot=True, linewidth=2, square=True, annot_kws={"size": font_sz},
                     cmap=cmap, fmt='g', linecolor='white', cbar_kws={"shrink": .5})
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    
    # fix for count name confusion
    var = var.replace('msg_count_twitter_count','museums_n')
    var = var.replace('msg_count_facebook_count','museums_n')
    
    ax.set_title("[{}] by [{}]".format(var, label))
    plt.tight_layout()
    fn = out_fold + 'heatmap_bivar__{}-{}.pdf'.format(label,var)
    plt.savefig(fn)
    plt.clf()
    sns.set(font_scale=1)
    print(fn)

def activity_stats_mus_groups(df, vars):
    print('\nactivity_stats_mus_groups', vars)
    #print(df.columns)
    if len(vars) == 2:
        if vars[0] in vars[1] or vars[1] in vars[0]: 
            return None
    stats_cols = ['sum','mean','std','min',q25,'median',q75,'max','count',
                  msg_per_mus,active_mus_n,active_mus_pc,active_mus_pc_z]
    stats_df = df.groupby(vars).agg({
        'msg_count_twitter': stats_cols,
        'msg_count_facebook': stats_cols
    }).round(1)
    # calculate derived fields
    for c in ['msg_count_twitter','msg_count_facebook']:
        stats_df[(c,'active_mus_pc')] = round(stats_df[(c,'active_mus_n')]/stats_df[(c,'count')]*100,1)
        stats_df[(c,'msg_per_mus')] = round(stats_df[(c,'sum')]/stats_df[(c,'active_mus_n')],1)
        x = stats_df[(c,'active_mus_pc')]
        stats_df[(c,'active_mus_pc_z')] = round((x - x.mean())/x.std(), 1)
        stats_df[(c,'active_mus_pc_z')]
    return stats_df

../../data/analysis/social_media_analytics/pivot_tables/


In [26]:
# generate plots
var_combinations = ['subject_matter_simpl','governance','governance_simpl','size','subject_matter',
        'accreditation','region','country']
var_combinations2 = []
for x1 in var_combinations:
    for x2 in var_combinations:
        if x1 != x2 and (not x2 in x1 and not x1 in x2):
            var_combinations2.append([min(x1,x2),max(x1,x2)])
for v in var_combinations:
    var_combinations2.append([v])
    
del var_combinations
#var_combinations = [['governance'],['governance','region'],['size','governance_simpl',['subject_matter_simpl']]] # DEBUG

# generate all combinations
for attrs in var_combinations2:
    # calc stats
    attrs = list(attrs)
    df = activity_stats_mus_groups(act_df, attrs)
    if df is None: continue
    attrs_str = '-'.join(attrs)
    fcomb_fold = cur_folder+'pivot_tables/'+attrs_str+'/'
    print(fcomb_fold)
    if not os.path.exists(fcomb_fold):
        os.makedirs(fcomb_fold)
    
    if len(attrs) == 2:
        # plot tables with heatmaps
        for plat in ['msg_count_facebook','msg_count_twitter']:
            for var in ['count','median','mean','q75','active_mus_pc','active_mus_pc_z','msg_per_mus']:
                target_var = plat + '_' + var
                plot_bivar_heatmap(df, attrs_str, target_var, fcomb_fold)
                
    fout = fcomb_fold+'museum_activity_groups__var{}-{}.xlsx'.format(len(attrs), attrs_str)
    df.to_excel(fout,index=True)
    
    # plot
    if False:
        for att in attrs:
            ax = sns.boxplot(x="msg_count_twitter", y=att, # hue="governance",
                         data=act_df, palette="Set3")
            ax.set_xscale("log")
            plt.show()
            plt.clf()


activity_stats_mus_groups ['governance', 'subject_matter_simpl']
../../data/analysis/social_media_analytics/pivot_tables/governance-subject_matter_simpl/
plot_bivar_heatmap: governance-subject_matter_simpl msg_count_facebook_count
../../data/analysis/social_media_analytics/pivot_tables/governance-subject_matter_simpl/heatmap_bivar__governance-subject_matter_simpl-museums_n.pdf
plot_bivar_heatmap: governance-subject_matter_simpl msg_count_facebook_median
../../data/analysis/social_media_analytics/pivot_tables/governance-subject_matter_simpl/heatmap_bivar__governance-subject_matter_simpl-msg_count_facebook_median.pdf
plot_bivar_heatmap: governance-subject_matter_simpl msg_count_facebook_mean
../../data/analysis/social_media_analytics/pivot_tables/governance-subject_matter_simpl/heatmap_bivar__governance-subject_matter_simpl-msg_count_facebook_mean.pdf
plot_bivar_heatmap: governance-subject_matter_simpl msg_count_facebook_q75
../../data/analysis/social_media_analytics/pivot_tables/govern

  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-museums_n.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_median.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_mean


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_mean.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_q75.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_active_mus_pc_z


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_msg_per_mus
../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_msg_per_mus.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_count


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-museums_n.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_median.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_mean


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_mean.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_q75.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_active_mus_pc_z


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_msg_per_mus


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_msg_per_mus.pdf

activity_stats_mus_groups ['region', 'subject_matter_simpl']
../../data/analysis/social_media_analytics/pivot_tables/region-subject_matter_simpl/
plot_bivar_heatmap: region-subject_matter_simpl msg_count_facebook_count
../../data/analysis/social_media_analytics/pivot_tables/region-subject_matter_simpl/heatmap_bivar__region-subject_matter_simpl-museums_n.pdf
plot_bivar_heatmap: region-subject_matter_simpl msg_count_facebook_median
../../data/analysis/social_media_analytics/pivot_tables/region-subject_matter_simpl/heatmap_bivar__region-subject_matter_simpl-msg_count_facebook_median.pdf
plot_bivar_heatmap: region-subject_matter_simpl msg_count_facebook_mean
../../data/analysis/social_media_analytics/pivot_tables/region-subject_matter_simpl/heatmap_bivar__region-subject_matter_simpl-msg_count_facebook_mean.pdf
plot_b

  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-museums_n.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_median.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_mean


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_mean.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_q75.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_active_mus_pc_z


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_msg_per_mus


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_msg_per_mus.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_count


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-museums_n.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_median.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_mean


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_mean.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_q75.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_active_mus_pc_z


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_msg_per_mus


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_msg_per_mus.pdf

activity_stats_mus_groups ['region', 'size']
../../data/analysis/social_media_analytics/pivot_tables/region-size/
plot_bivar_heatmap: region-size msg_count_facebook_count
../../data/analysis/social_media_analytics/pivot_tables/region-size/heatmap_bivar__region-size-museums_n.pdf
plot_bivar_heatmap: region-size msg_count_facebook_median
../../data/analysis/social_media_analytics/pivot_tables/region-size/heatmap_bivar__region-size-msg_count_facebook_median.pdf
plot_bivar_heatmap: region-size msg_count_facebook_mean
../../data/analysis/social_media_analytics/pivot_tables/region-size/heatmap_bivar__region-size-msg_count_facebook_mean.pdf
plot_bivar_heatmap: region-size msg_count_facebook_q75
../../data/analysis/social_media_analytics/pivot_tables/region-size/heatmap_bivar__region-size-msg_count_facebook_q75.pdf
plot_bivar_heatmap: region-size msg_c

  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-museums_n.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_median.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_mean


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_mean.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_q75.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_active_mus_pc_z


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_msg_per_mus
../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_msg_per_mus.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_count


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-museums_n.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_median.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_mean


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_mean.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_q75.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_active_mus_pc_z


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_msg_per_mus


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_msg_per_mus.pdf

activity_stats_mus_groups ['accreditation', 'governance']
../../data/analysis/social_media_analytics/pivot_tables/accreditation-governance/
plot_bivar_heatmap: accreditation-governance msg_count_facebook_count
../../data/analysis/social_media_analytics/pivot_tables/accreditation-governance/heatmap_bivar__accreditation-governance-museums_n.pdf
plot_bivar_heatmap: accreditation-governance msg_count_facebook_median
../../data/analysis/social_media_analytics/pivot_tables/accreditation-governance/heatmap_bivar__accreditation-governance-msg_count_facebook_median.pdf
plot_bivar_heatmap: accreditation-governance msg_count_facebook_mean
../../data/analysis/social_media_analytics/pivot_tables/accreditation-governance/heatmap_bivar__accreditation-governance-msg_count_facebook_mean.pdf
plot_bivar_heatmap: accreditation-gover

  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-museums_n.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_median.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_mean


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_mean.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_q75.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_active_mus_pc_z


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_msg_per_mus


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_msg_per_mus.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_count
../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-museums_n.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)
  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)
  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_median.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_mean
../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_mean.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_q75.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_active_mus_pc_z


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_msg_per_mus


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_msg_per_mus.pdf

activity_stats_mus_groups ['accreditation', 'subject_matter']
../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter/
plot_bivar_heatmap: accreditation-subject_matter msg_count_facebook_count
../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter/heatmap_bivar__accreditation-subject_matter-museums_n.pdf
plot_bivar_heatmap: accreditation-subject_matter msg_count_facebook_median
../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter/heatmap_bivar__accreditation-subject_matter-msg_count_facebook_median.pdf
plot_bivar_heatmap: accreditation-subject_matter msg_count_facebook_mean
../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter/heatmap_bivar__accreditation-subject_matter-msg_count_facebook_mean.pdf
plot_bivar_heatmap: accredi

<Figure size 432x288 with 0 Axes>

### All tweets/posts over time

Summarise tweets using pandas aggregation.

In [21]:
def generate_time_plots(df, label, aggr_func):
    print('generate_time_plots', label)
    print('date range:', min(df.index), max(df.index))
    tdf = pd.DataFrame()
    for time_aggr in ['W','D','M','Q']:
        print(time_aggr)
        tmpdf = aggr_func(df, time_aggr)

        # plot tweets
        fig, ax = plt.subplots()
        tmpdf['count'].plot(linewidth=1, figsize=(20,7))
        plt.title('All museums - N={} - {} over time by {}'.format(len(df), label, time_aggr))
        plt.xlabel('Time')
        plt.grid(True, which='both')
        plt.ylabel('N  '+label)
        #ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))
        fig_fn = cur_folder+'{}_temporal-all_mus-{}.pdf'.format(label,time_aggr)
        plt.savefig(fig_fn)
        print(fig_fn)
        #plt.show()
        
        plt.clf(); plt.cla(); plt.close()

        # add data
        tmpdf = tmpdf.reset_index()
        tdf.append(tmpdf)

    return tdf

def count_messages(df, time_aggr):
    tmpdf = df.resample(time_aggr).size().to_frame('count')
    tmpdf['time_unit'] = time_aggr
    return tmpdf

# generate tweet plots
df = pd.read_pickle(tw_fn)
print(len(df))
print(df.sample(10))
generate_time_plots(df, 'tweets', count_messages)

# generate facebook plots
df = pd.read_pickle(fb_fn)
print(len(df))
print(df.sample(10))
generate_time_plots(df, 'facebook_posts', count_messages)

5481882
                                museum_id          account
ts                                                        
2019-11-27 22:56:28+00:00  mm.domus.WM146     rugbygallery
2020-12-18 12:53:16+00:00     mm.aim.0181   computermuseum
2020-06-09 19:37:16+00:00  mm.domus.SC301  trimontiumtrust
2021-08-19 15:16:54+00:00     mm.aim.0581      janet_hills
2019-02-07 09:44:07+00:00  mm.domus.WM038        rafmuseum
2019-07-14 13:14:08+00:00     mm.musa.268  policeserviceni
2019-07-29 14:27:37+00:00     mm.ace.1120    essexpoliceuk
2019-10-16 10:37:13+00:00     mm.musa.010      apsleyhouse
2019-01-17 12:00:42+00:00  mm.domus.NE004   beamish_museum
2019-07-14 16:43:42+00:00     mm.ace.1120    essexpoliceuk
generate_time_plots tweets
date range: 2019-01-01 00:00:00+00:00 2021-10-01 00:00:00+00:00
W
../../data/analysis/social_media_analytics/tweets_temporal-all_mus-W.pdf
D
../../data/analysis/social_media_analytics/tweets_temporal-all_mus-D.pdf
M
../../data/analysis/social_media_analytic

### Active museums over time

Count museums that are active on FB/TW in a give period

In [9]:
def count_museums(df, time_aggr):
    df = df[['museum_id']]
    tmpdf = df.resample(time_aggr).nunique()
    tmpdf.columns = ['count']
    tmpdf['time_unit'] = time_aggr
    return tmpdf

# generate tweet plots
df = pd.read_pickle(tw_fn)
print(len(df))
generate_time_plots(df, 'museums_on_tw', count_museums)

# generate facebook plots
df = pd.read_pickle(fb_fn)
print(len(df))
generate_time_plots(df, 'museums_on_fb', count_museums)

5499105
generate_time_plots museums_on_tw
date range: 2019-01-01 00:00:00+00:00 2021-10-05 00:00:00+00:00
W
../../data/analysis/social_media_analytics/museums_on_tw_temporal-all_mus-W.pdf
D
../../data/analysis/social_media_analytics/museums_on_tw_temporal-all_mus-D.pdf
M
../../data/analysis/social_media_analytics/museums_on_tw_temporal-all_mus-M.pdf
Q
../../data/analysis/social_media_analytics/museums_on_tw_temporal-all_mus-Q.pdf
1191395
generate_time_plots museums_on_fb
date range: 2019-01-01 00:00:00+00:00 2021-10-04 23:55:37+00:00
W
../../data/analysis/social_media_analytics/museums_on_fb_temporal-all_mus-W.pdf
D
../../data/analysis/social_media_analytics/museums_on_fb_temporal-all_mus-D.pdf
M
../../data/analysis/social_media_analytics/museums_on_fb_temporal-all_mus-M.pdf
Q
../../data/analysis/social_media_analytics/museums_on_fb_temporal-all_mus-Q.pdf


### Daily heatmaps

- Calplot package: https://www.analyticsvidhya.com/blog/2021/02/visualization-in-time-series-using-heatmaps-in-python/

In [14]:
!pip install calplot

Collecting calplot
  Downloading calplot-0.1.7.3-py3-none-any.whl (8.1 kB)
Installing collected packages: calplot
Successfully installed calplot-0.1.7.3


In [22]:
import calplot

def plot_daily_heatmap(df, label):
    # count tweets per day
    cmap = None
    cdf = df[['museum_id']].resample('D').count()
    ax = calplot.calplot(cdf['museum_id'], yearlabels=True)
    fig_fn = cur_folder+'daily_heatmap-all_mus_heatmap_{}-msg.pdf'.format(label)
    plt.savefig(fig_fn)
    plt.clf()
    del cdf
    # count active museums per day
    mdf = df[['museum_id']].resample('D').nunique()
    ax = calplot.calplot(mdf['museum_id'], yearlabels=True)
    fig_fn = cur_folder+'daily_heatmap-all_mus_heatmap_{}-active_mus.pdf'.format(label)
    #plt.tight_layout()
    plt.savefig(fig_fn)
    print(fig_fn)
    plt.clf()
    del mdf

df = pd.read_pickle(tw_fn)
plot_daily_heatmap(df, 'twitter')

df = pd.read_pickle(fb_fn)
plot_daily_heatmap(df, 'facebook')

  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(


../../data/analysis/social_media_analytics/daily_heatmap-all_mus_heatmap_twitter-active_mus.pdf


  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(


../../data/analysis/social_media_analytics/daily_heatmap-all_mus_heatmap_facebook-active_mus.pdf


<Figure size 900x367.2 with 0 Axes>

<Figure size 900x367.2 with 0 Axes>

<Figure size 900x367.2 with 0 Axes>

<Figure size 900x367.2 with 0 Axes>

End of notebook.