# Museums in the Pandemic - Social media analytics

**Authors**: Andrea Ballatore (KCL)

**Abstract**: Analysis of social media data

## Setup
This is to check that your environment is set up correctly (it should print 'env ok', ignore warnings).

In [156]:
# Test geospatial libraries
# check environment
import os
print("Conda env:", os.environ['CONDA_DEFAULT_ENV'])
if os.environ['CONDA_DEFAULT_ENV'] != 'mip_v1':
    raise Exception("Set the environment 'mip_v1' on Anaconda. Current environment: " + os.environ['CONDA_DEFAULT_ENV'])

# spatial libraries 
import pandas as pd
import random
import pickle
import spacy
import itertools
import re
from termcolor import colored
import sys
from matplotlib.colors import ListedColormap, TwoSlopeNorm
from matplotlib.ticker import MultipleLocator, FormatStrFormatter,AutoMinorLocator
from matplotlib.dates import DateFormatter, MonthLocator, YearLocator
import numpy as np
import calplot
from numpy import arange
#import tensorflow as tf
from bs4 import BeautifulSoup
from bs4.element import Comment
#import torch
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

# import from `mip` project
print(os.getcwd())
fpath = os.path.abspath('../')
if not fpath in sys.path:
    sys.path.insert(0, fpath)

out_folder = '../../'

from museums import *
from utils import _is_number, write_file
from analytics.text_models import derive_new_attributes_matches, get_all_matches_from_db, get_indicator_annotations
from museums import get_museums_w_web_urls, get_twitter_facebook_links_v2, get_extra_museum_attributes

cur_folder = out_folder + 'data/analysis/social_media_analytics/'
if not os.path.exists(cur_folder):
    os.makedirs(cur_folder)
    
print('env ok')

Conda env: mip_v1
/Users/andreaballatore/Dropbox/DRBX_Docs/Work/Projects/github_projects/museums-in-the-pandemic/mip/notebooks_py
env ok


## Connect to DB

It needs the DCS VPN active to work.

In [48]:
# open connection to DB
from db.db import connect_to_postgresql_db

db_conn = connect_to_postgresql_db()
print("DB connected")

DB connected


## Load museum info


In [37]:
# load museums
df = get_museums_w_web_urls(out_folder)
print("museums url N:",len(df))
attr_df = load_input_museums_wattributes(out_folder)
df = pd.merge(df, attr_df, on='muse_id', how='left')
print("museum df with attributes: len", len(df))
mus_df = df.rename(columns={'muse_id':'museum_id','musname_x':'musname'})
del df
print(len(mus_df), mus_df.columns)

links_df = get_twitter_facebook_links_v2(out_folder)[['museum_id', 'twitter_id', 'facebook_pages']]
mus_df = mus_df.merge(links_df, on='museum_id', how='left')
mus_df = get_extra_museum_attributes(mus_df)
del links_df
#print(len(mus_df), mus_df.columns)
mus_df.sample(50)

museums urls: ../../data/museums/museum_websites_urls-v3.tsv
nationaltrust.org.uk       179
english-heritage.org.uk     52
no_resource.                33
visitscotland.com           24
nts.org.uk                  21
                          ... 
glynnvivian.co.uk            1
glynde.co.uk                 1
gwsr.com                     1
gloucesterquays.co.uk        1
smithsonfarm.co.uk           1
Name: domain, Length: 2441, dtype: int64
get_museums_w_web_urls Museums=3344 URLs=3344
museums url N: 3344
Index(['musname', 'muse_id', 'town', 'postcode', 'accreditation', 'governance',
       'size', 'subject_matter', 'closing_date', 'provenance',
       'deprivation_index', 'geodemographic_group', 'geodemographic_subgroup',
       'admin_area'],
      dtype='object')
loaded museums w attributes (open): 3341 data/museums/museums_wattributes-2020-02-23.tsv
museum df with attributes: len 3344
3344 Index(['museum_id', 'musname', 'town_x', 'url', 'url_source', 'Unnamed: 5',
       'domain', 'm

  df['region'] = df['region'].str.replace(


Unnamed: 0,museum_id,musname,town_x,url,url_source,Unnamed: 5,domain,musname_y,town_y,postcode,...,deprivation_index,geodemographic_group,geodemographic_subgroup,admin_area,twitter_id,facebook_pages,governance_simpl,subject_matter_simpl,country,region
1096,mm.hha.075,Gorhambury,St. Albans,https://www.gorhamburyestate.co.uk/,pred,,gorhamburyestate.co.uk,Gorhambury,St. Albans,AL3 6AH,...,6.0,Rural-Urban Fringe,Rural-Urban Fringe,/England/East of England (English Region)/Hert...,no_resource,no_resource,independent,Buildings,England,East of England
1897,mm.aim.0677,National Badminton Museum,Milton Keynes,https://www.nationalbadmintonmuseum.com/,manual,,nationalbadmintonmuseum.com,National Badminton Museum,Milton Keynes,MK8 9LA,...,8.0,Suburban Traits,Expanding Areas,/England/South East (English Region)/Milton Ke...,no_resource,no_resource,independent,Leisure and sport,England,South East
1002,mm.ace.1157,Fordingbridge Museum,Fordingbridge,http://www.fordingbridgemuseum.co.uk/,pred,,fordingbridgemuseum.co.uk,Fordingbridge Museum,Fordingbridge,SP6 1AB,...,6.0,Remoter Coastal Living,Ageing Coastal Living,/England/South East (English Region)/Hampshire...,no_resource,en-gb.facebook.com/pages/category/Community-Mu...,independent,Local Histories,England,South East
3316,mm.aim.0619,Military Museum Scotland,Wilkieston,no_resource,manual,,no_resource.,Military Museum Scotland,Wilkieston,EH27 8EJ,...,5.0,Scottish Industrial Heritage,Scottish Industrial Legacy,/Scotland/West Lothian (Scottish Council Area),no_resource,en-gb.facebook.com/Military-Museum-Scotland-SC...,independent,War and conflict,Scotland,Scotland
1032,mm.domus.SC045,Fyvie Castle,Turriff,https://www.nts.org.uk/visit/places/fyvie-castle,pred,,nts.org.uk,Fyvie Castle,Turriff,AB53 8JS,...,5.0,Country Living,Country Living,/Scotland/Aberdeenshire (Scottish Council Area),no_resource,"['www.facebook.com/fyviecastle', 'en-gb.facebo...",independent,Buildings,Scotland,Scotland
506,mm.ace.1106,Castle Donington Museum,Derby,"https://www.cdpc.org.uk/findmynearest,876189.html",pred,,cdpc.org.uk,Castle Donington Museum,Derby,DE74 2JA,...,6.0,Country Living,Country Living,/England/East Midlands (English Region)/Leices...,"['twitter.com/castledonpc', 'twitter.com/cas_d...","['en-gb.facebook.com/castledoningtonpc', 'en-g...",independent,Local Histories,England,East Midlands
1636,mm.domus.SE012,Maldon District Museum,Maldon,https://e-voice.org.uk/maldonmuseuminthepark/,pred,,e-voice.org.uk,Maldon District Museum,Maldon,CM9 5HX,...,3.0,Country Living,Country Living,/England/East of England (English Region)/Esse...,twitter.com/maldonmuseum,en-gb.facebook.com/maldonmuseum,independent,Local Histories,England,East of England
612,mm.ace.1099,Clevedon Court,Clevedon,https://www.nationaltrust.org.uk/clevedon-court,pred,,nationaltrust.org.uk,Clevedon Court,Clevedon,BS21 6QU,...,9.0,English and Welsh Countryside,Sparse English and Welsh Countryside,/England/South West (English Region)/North Som...,twitter.com/clevedoncourtNH,en-gb.facebook.com/pages/category/Nursing-Home...,independent,Buildings,England,South West
3168,mm.mgs.358,Westside Croft,Dunnet,http://www.caithness.org/community/museums/mar...,manual,,caithness.org,Westside Croft,Dunnet,KW14 8YD,...,6.0,Scottish Countryside,Scottish Countryside,/Scotland/Highland (Scottish Council Area),no_resource,no_resource,independent,Buildings,Scotland,Scotland
1594,mm.domus.SE331,Long Shop Museum,Leiston,https://www.facebook.com/Longshopmuseum/,manual,,facebook.com,Long Shop Museum,Leiston,IP16 4ES,...,5.0,English and Welsh Countryside,Sparse English and Welsh Countryside,/England/East of England (English Region)/Suff...,twitter.com/longshopmuseum,en-gb.facebook.com/Longshopmuseum,independent,Industry and manufacture,England,East of England


## Temporal analysis of Twitter/Facebook

Temporal analysis based on message counts.

### Manual corrections of top decile

The largest museums tend to have less accurate accounts, so we fixed them manually. The rest is ok (see validation results below). `msg_count_by_museum_manual_validation-v2.xlsx` was annotated by Mark L and Jamie L.

Examples of corrections:
- mm.misc.139: drop twitter.com/atlasobscura, drop www.facebook.com/atlasobscura
- Twitter: mm.domus.YH123 update https://twitter.com/WFMuseums (drop 'twitter.com/mywakefield', 'twitter.com/pontecastle')
- Facebook: mm.domus.SC121 update http://www.facebook.com/guernseymuseums (drop ['www.facebook.com/grantownmuseum', 'en-gb.facebook.com/grantownmuseum/posts'])

In [10]:
corrections_df = pd.read_excel(out_folder+'data/annotations/msg_count_by_museum_manual_validation-v2.xlsx')
corrections_df

# drop_twitter	drop_facebook	correct_twitter	correct_facebook	drop_website	correct_website

corrections_df['b_valid_twitter'] = corrections_df.drop_twitter.isnull()
corrections_df['b_valid_facebook'] = corrections_df.drop_facebook.isnull()
corrections_df['b_new_twitter'] = ~corrections_df.correct_twitter.isnull()
corrections_df['b_new_facebook'] = ~corrections_df.correct_facebook.isnull()

corrections_df = corrections_df[['museum_id','drop_twitter','b_valid_twitter','drop_facebook','b_valid_facebook','correct_twitter','correct_facebook','b_new_twitter','b_new_facebook','correct_website']]
print(corrections_df.b_valid_twitter.value_counts())
print(corrections_df.b_new_twitter.value_counts())
print(corrections_df.b_valid_facebook.value_counts())
print(corrections_df.b_new_facebook.value_counts())

corrections_df['facebook_action'] = 'keep'
corrections_df.loc[(~corrections_df.b_valid_facebook & ~corrections_df.b_new_facebook) , 'facebook_action'] = 'drop'
corrections_df.loc[(corrections_df.b_new_facebook), 'facebook_action'] = 'update'
print(corrections_df.facebook_action.value_counts())

corrections_df['twitter_action'] = 'keep'
corrections_df.loc[(~corrections_df.b_valid_twitter & ~corrections_df.b_new_twitter) , 'twitter_action'] = 'drop'
corrections_df.loc[(corrections_df.b_new_twitter), 'twitter_action'] = 'update'
print(corrections_df.twitter_action.value_counts())

corrections_df[['museum_id','twitter_action','facebook_action','correct_twitter','correct_facebook']].to_csv(
    out_folder+'data/museums/social_media_url_corrections.tsv', sep = '\t', index=False)


True     3201
False     143
Name: b_valid_twitter, dtype: int64
False    3313
True       31
Name: b_new_twitter, dtype: int64
True     3206
False     138
Name: b_valid_facebook, dtype: int64
False    3304
True       40
Name: b_new_facebook, dtype: int64
keep      3169
drop       135
update      40
Name: facebook_action, dtype: int64
keep      3172
drop       141
update      31
Name: twitter_action, dtype: int64


In [11]:
# apply corrections
corrections_df = pd.read_csv(out_folder+'data/museums/social_media_url_corrections.tsv', sep = '\t')
print(corrections_df.columns, len(corrections_df))
assert corrections_df.museum_id.is_unique

mus_df = get_twitter_facebook_links(out_folder)[['museum_id', 'twitter_id', 'facebook_pages']]
print(mus_df.columns)
corrected_mus_df = mus_df.copy().merge(corrections_df, on='museum_id')
print(corrected_mus_df.columns)

# fix twitter
ids = corrections_df.loc[corrections_df.twitter_action == 'drop', 'museum_id']
print(len(ids))
corrected_mus_df.loc[corrected_mus_df.museum_id.isin(ids), 'twitter_id_old'] = corrected_mus_df['twitter_id']
corrected_mus_df.loc[corrected_mus_df.museum_id.isin(ids), 'twitter_id'] = 'no_resource'
ids = corrections_df.loc[corrections_df.twitter_action == 'update', 'museum_id']
corrected_mus_df.loc[corrected_mus_df.twitter_action=='update', 'twitter_id_old'] = corrected_mus_df['twitter_id']
corrected_mus_df.loc[corrected_mus_df.twitter_action=='update', 'twitter_id'] = corrected_mus_df['correct_twitter']

# fix facebook
ids = corrections_df.loc[corrections_df.facebook_action == 'drop', 'museum_id']
print(len(ids))
corrected_mus_df.loc[corrected_mus_df.museum_id.isin(ids), 'facebook_pages_old'] = corrected_mus_df['facebook_pages']
corrected_mus_df.loc[corrected_mus_df.museum_id.isin(ids), 'facebook_pages'] = 'no_resource'
ids = corrections_df.loc[corrections_df.facebook_action == 'update', 'museum_id']
corrected_mus_df.loc[corrected_mus_df.facebook_action=='update', 'facebook_pages_old'] = corrected_mus_df['facebook_pages']
corrected_mus_df.loc[corrected_mus_df.facebook_action=='update', 'facebook_pages'] = corrected_mus_df['correct_facebook']

print('> twitter')
print(mus_df.twitter_id.value_counts())
print(corrected_mus_df.twitter_id.value_counts())
print('> facebook')
print(mus_df.facebook_pages.value_counts())
print(corrected_mus_df.facebook_pages.value_counts())

corrected_mus_df.to_csv(out_folder+'data/museums/social_media_urls_corrected.tsv', sep='\t', index=False)


Index(['museum_id', 'twitter_action', 'facebook_action', 'correct_twitter',
       'correct_facebook'],
      dtype='object') 3344
loaded museums: 3344 ../../data/museums/museum_names_and_postcodes-2020-01-26.tsv
get_twitter_facebook_links N = 3344
Index(['museum_id', 'twitter_id', 'facebook_pages'], dtype='object')
Index(['museum_id', 'twitter_id', 'facebook_pages', 'twitter_action',
       'facebook_action', 'correct_twitter', 'correct_facebook'],
      dtype='object')
141
135
> twitter
no_resource                    993
twitter.com/lancastermuseum      4
twitter.com/riponmuseums         3
twitter.com/chelmsmuseum         3
twitter.com/museumoflondon       3
                              ... 
twitter.com/gowerheritage        1
twitter.com/gosportdc            1
twitter.com/gosfordhouse         1
twitter.com/grussellmuseum       1
twitter.com/zetlandmuseum        1
Name: twitter_id, Length: 2270, dtype: int64
no_resource                       1118
twitter.com/lancastermuseum          

### Download Twitter/Facebook data from DB (slow)

Extracting only message counts, not content.

In [34]:
# file paths
tw_fn = out_folder+'tmp/tweets.pik'
twacc_fn = out_folder+'tmp/tweets_accounts.pik'

fb_fn = out_folder+'tmp/facebook_posts.pik'
fbacc_fn = out_folder+'tmp/facebook_accounts.pik'

In [49]:
max_time = '2021-10-01'

if False:
    # extract all tweets
    fields = ['tweet_text', 
              "tweet_data_json -> 'public_metrics' -> 'retweet_count' as retweet_count",
              "tweet_data_json -> 'public_metrics' -> 'reply_count' as reply_count",
              "tweet_data_json -> 'public_metrics' -> 'like_count' as like_count",
              "tweet_data_json -> 'public_metrics' -> 'quote_count' as quote_count"]
    sql = "select muse_id as museum_id, museum_account, author_account, is_reply, tw_ts as ts, {} from twitter.tweets_dump td where tw_ts <= date('{}');".format(', '.join(fields), max_time) #  limit 1000
    df = pd.read_sql(sql, db_conn)
    df['from_museum'] = df.museum_account.str.lower() == df.author_account.str.lower()
    df['ts'] = pd.to_datetime(df['ts'])
    print(df.columns)
    df = df.set_index('ts')
    display(df.sample(50))
    df.to_pickle(tw_fn)
    print(len(df))
    tw_fn
    
if False:
    # extract all Twitter account info
    sql = """select muse_id as museum_id, museum_account, author_account,
        tweet_data_json -> 'author_info' ->> 'id' as account_id, 
        tweet_data_json -> 'author_info' ->> 'name' as name, 
        tweet_data_json -> 'author_info' ->> 'username' as username, 
        tweet_data_json -> 'author_info' ->> 'created_at' as created_at, 
        tweet_data_json -> 'author_info' ->> 'verified' as verified, 
        tweet_data_json -> 'author_info' ->> 'location' as location, 
        tweet_data_json -> 'author_info' -> 'public_metrics' ->> 'followers_count' as followers_count,
        tweet_data_json -> 'author_info' -> 'public_metrics' ->> 'following_count' as following_count,
        tweet_data_json -> 'author_info' -> 'public_metrics' ->> 'tweet_count' as tweet_count,
        tweet_data_json -> 'author_info' -> 'public_metrics' ->> 'listed_count' as listed_count
        from twitter.tweets_dump td where tw_ts <= date('{}')""".format(max_time) # limit 1000
    sql = "select distinct * from ({}) as res;".format(sql)
    df = pd.read_sql(sql, db_conn)
    df['from_museum'] = df.museum_account.str.lower() == df.author_account.str.lower()
    display(df.sample(100))
    print(df.from_museum.describe())
    print(twacc_fn, 'N =',len(df))
    df.to_pickle(twacc_fn)
    
if True:
    # extract all FB messages
    fields = ["facebook_data_json -> 'statistics_actual_{}' as {}".format(f,f) for f in ['likeCount','shareCount','commentCount','loveCount','wowCount','hahaCount','sadCount','angryCount','thankfulCount','careCount']]
    sql = "select museum_id, page_name, user_id, post_text, facebook_data_json -> 'type' as post_type, facebook_data_json -> 'score' as score, post_ts as ts, {} from facebook.facebook_posts_dump where post_ts <= date('{}');".format(", ".join(fields), max_time) # limit 10000
    df = pd.read_sql(sql, db_conn)
    df['ts'] = pd.to_datetime(df['ts'])
    print(df.columns)
    display(df.sample(100))
    df = df.set_index('ts')
    df.to_pickle(fb_fn)
    print(len(df))

if False:
    # extract all Facebook accounts info
    fields = ["facebook_data_json ->> 'account_{}' as {}".format(f,f) for f in ['id','name','handle','profileImage','subscriberCount','url','accountType','pageDescription','pageCreatedDate','verified']]
    sql = """select museum_id, {}
        from facebook.facebook_posts_dump where post_ts <= date('{}')""".format(", ".join(fields), max_time) # limit 1000
    sql = "select distinct * from ({}) as res;".format(sql)
    df = pd.read_sql(sql, db_conn)
    #df['from_museum'] = df.museum_account.str.lower() == df.author_account.str.lower()
    display(df.sample(100))
    #print(df.from_museum.describe())
    print(fbacc_fn, 'N =',len(df))
    df.to_pickle(fbacc_fn)
    


Index(['museum_id', 'page_name', 'user_id', 'post_text', 'post_type', 'score',
       'ts', 'likecount', 'sharecount', 'commentcount', 'lovecount',
       'wowcount', 'hahacount', 'sadcount', 'angrycount', 'thankfulcount',
       'carecount'],
      dtype='object')


Unnamed: 0,museum_id,page_name,user_id,post_text,post_type,score,ts,likecount,sharecount,commentcount,lovecount,wowcount,hahacount,sadcount,angrycount,thankfulcount,carecount
880953,mm.mgs.372,NHSTayside,2067264,Quit Your Way information stand There will be ...,photo,-6.125000,2020-02-03 15:03:54+00:00,4,4,0,0,0,0,0,0,0,0
1182554,mm.domus.EM073,1620sHouse,10316492,We sadly cannot go ahead with our planned Gunp...,native_video,2.222222,2020-11-04 10:16:10+00:00,10,35,5,9,0,1,0,0,0,0
1130681,mm.New.12,ArchDaily,36040,"SOM, Studio Gang, the Museum Of Contemporary A...",link,-10.960000,2021-07-28 15:30:28+00:00,25,0,0,0,0,0,0,0,0,0
384803,mm.musa.162,hartlandquay,10428666,Tired of turkey? It’s a beautiful day to visit...,photo,-1.766667,2020-12-27 12:11:02+00:00,20,2,1,7,0,0,0,0,0,0
619225,mm.musa.252,NationalSpaceCentre,2163145,We've been learning about #PlasticFreeDay at t...,link,-24.000000,2019-05-08 16:30:00+00:00,8,2,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714236,mm.domus.NE008,prestonparkmuseum,2194173,It's our last Get Crafty of the summer holiday...,photo,-3.500000,2019-08-29 09:12:57+00:00,8,2,0,2,0,0,0,0,0,0
304663,mm.musa.138,FlambardsUK,3098271,Celebrate Father’s day in style with a special...,link,3.379310,2019-05-15 08:27:14+00:00,56,0,40,2,0,0,0,0,0,0
52555,mm.domus.EM015,bassetlawmuseum,8672970,Why knit alone? Come along to our Yarn Bomb Kn...,photo,-1.055556,2020-01-23 18:00:10+00:00,13,16,6,1,0,0,0,0,0,0
457893,mm.domus.SE042,InKingstonUK,8482850,"Hawkers Bar and Brasserie are set for summer, ...",photo,-3.153846,2019-07-10 14:10:00+00:00,8,2,3,0,0,0,0,0,0,0


1187529


### Get account engagement stats 

In [51]:
# TODO: add TW account info

# TODO: add FB account info

### Sample 300 messages for content description

March 2022

Sample of 300 tweets and 300 fb.

In [50]:
def generate_sample_300(fn,platform):
    df = pd.read_pickle(fn)
    if platform == 'twitter':
        df = df[df.from_museum] # only msgs from museums
    print(df.columns)
    #print(df.sample(10))
    df['year'] = df.index.year
    df['ts'] = df.index
    df = df.merge(mus_df, on='museum_id', how='outer')
    print(df.columns)
    dfs = []

    for dec, subdf in df.groupby(['governance_simpl','size','year']):
        if platform == 'twitter':
            smpl_df = subdf.sample(min(6,len(subdf)), random_state=10)
        else: 
            smpl_df = subdf.sample(min(7,len(subdf)), random_state=10)
        dfs.append(smpl_df)
    sample_df = pd.concat(dfs)
    print(sample_df.columns)
    del dfs, subdf

    print("N =",len(sample_df))
    sample_df = sample_df.sample(300)
    if platform == 'twitter':
        sample_df = sample_df[[
            'museum_id', 'musname', 'museum_account', 'author_account', 'from_museum', 'is_reply',
            'retweet_count', 'reply_count', 'like_count',
            'quote_count',  'year', 'tweet_text']]
    else: 
        sample_df = sample_df[[
           'museum_id', 'musname', 'page_name', 'user_id', 'score', 'likecount',
           'sharecount', 'commentcount', 'lovecount', 'wowcount', 'hahacount',
           'sadcount', 'angrycount', 'thankfulcount', 'carecount', 'year', 'post_type', 'post_text']]
    sample_df['annot_category'] = ''
    sample_df['annot_phrase'] = ''
    sample_df.to_excel(out_folder+'tmp/museum_social_media_sample_300-{}.xlsx'.format(platform),index=False)
    
generate_sample_300(tw_fn,'twitter')
generate_sample_300(fb_fn,'facebook')

Index(['museum_id', 'museum_account', 'author_account', 'is_reply',
       'tweet_text', 'retweet_count', 'reply_count', 'like_count',
       'quote_count', 'from_museum'],
      dtype='object')
Index(['museum_id', 'museum_account', 'author_account', 'is_reply',
       'tweet_text', 'retweet_count', 'reply_count', 'like_count',
       'quote_count', 'from_museum', 'year', 'ts', 'musname', 'town_x', 'url',
       'url_source', 'Unnamed: 5', 'domain', 'musname_y', 'town_y', 'postcode',
       'accreditation', 'governance', 'size', 'subject_matter', 'closing_date',
       'provenance', 'deprivation_index', 'geodemographic_group',
       'geodemographic_subgroup', 'admin_area', 'twitter_id', 'facebook_pages',
       'governance_simpl', 'subject_matter_simpl', 'country', 'region'],
      dtype='object')
Index(['museum_id', 'museum_account', 'author_account', 'is_reply',
       'tweet_text', 'retweet_count', 'reply_count', 'like_count',
       'quote_count', 'from_museum', 'year', 'ts', 'mus

### Activity by museum

#### Messages by museum

In [6]:
def messages_by_museum(df, label):
    print('messages_by_museum', label, len(df))
    mdf = df.groupby('museum_id').size()
    field = 'msg_count_'+label
    mdf = mdf.reset_index(name=field)
    mdf[field+'_decile'] = pd.qcut(-mdf[field], 10, labels=False)+1
    mdf[field+'_z'] = round((mdf[field] - mdf[field].mean())/mdf[field].std(),3)
    print(mdf.columns)
    return mdf
    
# generate tweet plots
df = pd.read_pickle(tw_fn)
# keep only tweets from museums

act_df = messages_by_museum(df[df.from_museum], 'twitter')
act_df2 = messages_by_museum(df[~df.from_museum], 'twitter_engage')
act_df = act_df.merge(act_df2, on='museum_id', how='outer')
act_df['msg_count_twitter_engage_ratio'] = round(act_df['msg_count_twitter_engage']/act_df['msg_count_twitter'],3)
field = 'msg_count_twitter_engage_ratio'
act_df[field+'_z'] = round((act_df[field] - act_df[field].mean())/act_df[field].std(),3)

df = pd.read_pickle(fb_fn)
act_df3 = messages_by_museum(df, 'facebook')
act_df = act_df.merge(act_df3, on='museum_id', how='outer')

# merge with all museums
act_df = act_df.merge(mus_df, on='museum_id', how='outer')
act_df['msg_count_facebook'] = act_df['msg_count_facebook'].fillna(0)
act_df['msg_count_twitter'] = act_df['msg_count_twitter'].fillna(0)
act_df['msg_count_twitter_engage'] = act_df['msg_count_twitter_engage'].fillna(0)

fout = cur_folder + 'msg_count_by_museum.xlsx'
act_df.to_excel(fout, index=False)

with open(cur_folder + 'msg_count_by_museum_info.txt', "w") as text_file:
    text_file.write("""
msg_count_twitter: number of tweets by the museum
msg_count_twitter_engage: number of tweets to the museum by other users
msg_count_facebook: number of FB messages by the museum

<variable>_z: z score (0 = average, positive = above average, negative = below average)
<variable>_decile: decile (1 = top, 10 = bottom)
    """)

fout

messages_by_museum twitter 2415636
Index(['museum_id', 'msg_count_twitter', 'msg_count_twitter_decile',
       'msg_count_twitter_z'],
      dtype='object')
messages_by_museum twitter_engage 935484
Index(['museum_id', 'msg_count_twitter_engage',
       'msg_count_twitter_engage_decile', 'msg_count_twitter_engage_z'],
      dtype='object')
messages_by_museum facebook 1187529
Index(['museum_id', 'msg_count_facebook', 'msg_count_facebook_decile',
       'msg_count_facebook_z'],
      dtype='object')


'../../data/analysis/social_media_analytics/msg_count_by_museum.xlsx'

#### Sample for Tw/Fb validation

Validation carried out on 2021 Nov 24.
Stratified by decile to validate the low deciles.

In [15]:
# msg_count_twitter_decile | msg_count_facebook_decile
dfs = []
for dec_var in ['msg_count_twitter_decile','msg_count_facebook_decile']:
    for dec, subdf in act_df.groupby(dec_var):
        if dec < 3: continue
        print(dec, len(subdf))
        smpl_df = subdf.sample(10, random_state=10)
        smpl_df['sample'] = "{}_sample_{}".format(dec_var,dec)
        dfs.append(smpl_df)
decile_sample_df = pd.concat(dfs)
print(decile_sample_df.columns)
del dfs, subdf

print("N =",len(decile_sample_df))
decile_sample_df = decile_sample_df[['museum_id','musname','town_x','url','twitter_id','facebook_pages',
                  'msg_count_twitter','msg_count_facebook','sample']]
decile_sample_df['correct'] = ''
decile_sample_df.to_excel(out_folder+'tmp/museum_social_media_sample_by_decile.xlsx',index=False)

3.0 222
4.0 222
5.0 222
6.0 221
7.0 225
8.0 221
9.0 226
10.0 216
3.0 121
4.0 119
5.0 121
6.0 122
7.0 120
8.0 120
9.0 121
10.0 121
Index(['museum_id', 'msg_count_twitter', 'msg_count_twitter_decile',
       'msg_count_twitter_z', 'msg_count_facebook',
       'msg_count_facebook_decile', 'msg_count_facebook_z', 'musname',
       'town_x', 'url', 'url_source', 'Unnamed: 5', 'domain', 'musname_y',
       'town_y', 'postcode', 'accreditation', 'governance', 'size',
       'subject_matter', 'closing_date', 'provenance', 'deprivation_index',
       'geodemographic_group', 'geodemographic_subgroup', 'admin_area',
       'twitter_id', 'facebook_pages', 'governance_simpl',
       'subject_matter_simpl', 'country', 'region', 'sample'],
      dtype='object')
N = 160


#### Validation results
Annotated manually by Mark L and Jamie L.

In [16]:
sample_df = pd.read_excel(out_folder+'data/annotations/museum_social_media_sample_by_decile-v2.xlsx')
print(sample_df.correct.value_counts())
sample_df['b_valid'] = ~sample_df.correct.isin(['N','M'])
sample_df.sample(10)

N    13
M     6
Name: correct, dtype: int64


Unnamed: 0,museum_id,musname,town_x,url,twitter_id,facebook_pages,msg_count_twitter,msg_count_facebook,sample,correct,b_valid
144,mm.domus.SW219,Montacute House,Montacute,https://www.nationaltrust.org.uk/montacute-house,['twitter.com/montacutent'],"['www.facebook.com/montacutent', 'en-gb.facebo...",0,222,msg_count_facebook_decile_sample_9.0,,True
43,mm.hha.144,Hoghton Tower,Preston,https://www.hoghtontower.co.uk/,twitter.com/hoghtontower,['www.facebook.com/pages/hoghton-tower/1366739...,354,629,msg_count_twitter_decile_sample_7.0,,True
90,mm.domus.SE504,St Neots Museum,Huntingdon,https://www.stneotsmuseum.org.uk/,twitter.com/stneotsmuseum,"['facebook.com/stneotsmuseum', 'en-gb.facebook...",2886,936,msg_count_facebook_decile_sample_4.0,,True
41,mm.hha.036,Fursdon,Cadbury,https://www.fursdon.co.uk/,twitter.com/fursdondevon,"['www.facebook.com/fursdondevon', 'en-gb.faceb...",387,0,msg_count_twitter_decile_sample_7.0,,True
152,mm.domus.SE390,"Christ Church Picture Gallery, University Of O...",Oxford,https://www.chch.ox.ac.uk/gallery,"['twitter.com/chch_oxford', 'twitter.com/chchg...",['www.facebook.com/chchoxford'],1698,33,msg_count_facebook_decile_sample_10.0,N,False
21,mm.domus.NW188,British Commercial Vehicle Museum,Preston,https://www.britishcommercialvehiclemuseum.com/,twitter.com/bcvmleyland,['www.facebook.com/bcvmt'],762,0,msg_count_twitter_decile_sample_5.0,,True
54,mm.domus.SC201,Urras Eachdraidh Sgire Bhearnaraidh,Isle of Lewis,https://www.totalgiving.co.uk/charity-director...,['twitter.com/total_giving'],['facebook.com/totalgiving'],149,0,msg_count_twitter_decile_sample_8.0,N,False
17,mm.domus.SE538,Blickling Hall,Norwich,https://www.nationaltrust.org.uk/features/the-...,twitter.com/blicklingnt,en-gb.facebook.com/BlicklingEstateNT,1585,652,msg_count_twitter_decile_sample_4.0,,True
76,mm.domus.EM098,The Princess Royal Class Locomotive Trust,Ripley,http://www.prclt.co.uk/,twitter.com/prclt6233,"['www.facebook.com/groups/105180429515813', 'w...",14,1084,msg_count_twitter_decile_sample_10.0,,True
113,mm.domus.SW036,Dean Forest Railway Museum,Lydney,https://deanforestrailway.co.uk/museum/,twitter.com/deanforrailway,"['www.facebook.com/deanforestrailway', 'en-gb....",334,587,msg_count_facebook_decile_sample_6.0,,True


In [26]:
decile_accuracy_dfs = []
for smpl, subdf in sample_df.groupby('sample'):
    #print(smpl)
    accuracy = 1-(len(subdf[~subdf.b_valid])/len(subdf))
    accuracy_m = len(subdf[subdf.correct!='N'])/len(subdf)
    decile_accuracy_dfs.append({'sample':smpl, 'accuracy':accuracy, 'accuracy_m':accuracy_m})
    
decile_accuracy_df = pd.DataFrame(decile_accuracy_dfs)
display(decile_accuracy_df)
decile_accuracy_df.to_excel(out_folder+'data/annotations/museum_social_media_sample_by_decile-results-v2.xlsx', index=False)

NameError: name 'sample_df' is not defined

#### Pivot tables

#### Total stats

In [42]:
# histograms of activity stats 
for plat in ['twitter','twitter_engage','facebook']:
    df = act_df.copy()
    df['msg_count_'+plat] = df['msg_count_'+plat] + 1
    ax = sns.histplot(data=df, x="msg_count_"+plat, kde=True, log_scale=True)
    ax.set_title('Museums on '+plat)
    ax.set_xlabel('Number of messages (total)')
    ax.set_ylabel('N of museums')
    fout = cur_folder+'museum_activity_hist_{}.pdf'.format(plat)
    plt.savefig(fout)
    plt.clf()
    print(fout)
    del df

../../data/analysis/social_media_analytics/museum_activity_hist_twitter.pdf
../../data/analysis/social_media_analytics/museum_activity_hist_twitter_engage.pdf
../../data/analysis/social_media_analytics/museum_activity_hist_facebook.pdf


<Figure size 432x288 with 0 Axes>

#### Calc and viz group stats

In [46]:
# governance / size stats
# attr

for x in [cur_folder+'pivot_tables/']:
    print(x)
    if not os.path.exists(x):
        os.makedirs(x)

def q25(x): return x.quantile(0.25)

def q75(x): return x.quantile(0.75)

def active_mus_n(x): return len(x[x>0])

def active_mus_pc(x):
    # dummy value
    return -1

def active_mus_pc_z(x):
    # dummy value
    return -1

def msg_per_mus(x):
    # dummy value
    return -1

def make_multilevel_df_flat(df):
    vdf = df.reset_index(level=[0,1])
    assert len(vdf.columns)==3
    # from long form to narrow form
    mdf = vdf.pivot(index=vdf.columns[0], columns=vdf.columns[1], values=vdf.columns[2])
    return mdf

def plot_bivar_heatmap(df, label, var, out_fold):
    print('plot_bivar_heatmap:', label, var)
    
    df = df.copy()    
    df.columns = ["_".join(a) for a in df.columns.to_flat_index()]
    cmap = "YlGnBu"
    #cmap = "YlOrBr"
    divnorm = None
    if 'pc_z' in var:
        # blue/red diverging palette centred on zero
        cmap = sns.diverging_palette(10, 240, n=9, as_cmap=True)
        divnorm = TwoSlopeNorm(vmin=df[var].min(), vcenter=0, vmax=df[var].max())
    if '_twitter_count' in var or '_facebook_count' in var:
        cmap = "OrRd"
    assert var in df.columns, "{} not in {}".format(var, df.columns)
    df = df[var]
    df = make_multilevel_df_flat(df)
    
    # tune font sizes
    font_scale = .8
    font_sz = 8
    n_cells = len(df.columns) * len(df)
    if n_cells > 25:
        # small plots
        font_scale = .5
        font_sz = 3
    if n_cells > 50:
        # small plots
        font_scale = .3
        font_sz = 2
    sns.set(font_scale=font_scale)
    
    # plot missing values
    if not 'pc_z' in var:
        sns.heatmap(
            np.where(df.isna(), 0, np.nan),
            cbar=False,
            annot=np.full_like(df, "NA", dtype=object),
            fmt="", 
            annot_kws={"size": font_sz, "va": "center_baseline", "color": "black"},
            cmap=ListedColormap(['whitesmoke']),
            linewidth=0)
    
    # plot heatmap
    ax = sns.heatmap(df, annot=True, linewidth=2, square=True, annot_kws={"size": font_sz}, norm=divnorm,
                     cmap=cmap, fmt='g', linecolor='white', cbar_kws={"shrink": .5})
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    
    # fix for count name confusion
    var = var.replace('msg_count_twitter_count','museums_n')
    var = var.replace('msg_count_facebook_count','museums_n')
    
    ax.set_title("[{}] by [{}]".format(var, label))
    plt.tight_layout()
    fn = out_fold + 'heatmap_bivar__{}-{}.pdf'.format(label,var)
    plt.savefig(fn)
    plt.clf()
    sns.set(font_scale=1)
    print(fn)

def activity_stats_mus_groups(df, vars):
    print('\nactivity_stats_mus_groups', vars)
    #print(df.columns)
    if len(vars) == 2:
        if vars[0] in vars[1] or vars[1] in vars[0]: 
            return None
    stats_cols = ['sum','mean','std','min',q25,'median',q75,'max','count',
                  msg_per_mus,active_mus_n,active_mus_pc,active_mus_pc_z]
    stats_df = df.groupby(vars).agg({
        'msg_count_twitter': stats_cols,
        'msg_count_twitter_engage': stats_cols,     
        'msg_count_facebook': stats_cols
    }).round(1)
    # calculate derived fields
    for c in ['msg_count_twitter','msg_count_twitter_engage','msg_count_facebook']:
        stats_df[(c,'active_mus_pc')] = round(stats_df[(c,'active_mus_n')]/stats_df[(c,'count')]*100,1)
        stats_df[(c,'msg_per_mus')] = round(stats_df[(c,'sum')]/stats_df[(c,'active_mus_n')],1)
        x = stats_df[(c,'active_mus_pc')]
        stats_df[(c,'active_mus_pc_z')] = round((x - x.mean())/x.std(), 1)
        stats_df[(c,'active_mus_pc_z')]
    return stats_df

../../data/analysis/social_media_analytics/pivot_tables/


In [None]:
doc_str = """Data dictionary:

* twitter: tweets by museums
* twitter_engage: tweets to museums by other users
* facebook: posts by museums

* msg_count_<platform>_count: number of messages on the platform.
* msg_count_<platform>_msg_per_mus: average number of messages per museum on a platform.
* msg_count_<platform>_active_mus_pc: percentage of museums in a group that are active on the platform.
* msg_count_<platform>_active_mus_pc_z: 0 = average, positive = above average, negative = below average.
* msg_count_<platform>_median: median number of messages in a group.
* msg_count_<platform>_mean: mean number of messages in a group.
"""

# generate plots
var_combinations = ['subject_matter_simpl','governance','governance_simpl','size','subject_matter',
        'accreditation','region','country']
var_combinations2 = []
for x1 in var_combinations:
    for x2 in var_combinations:
        if x1 != x2 and (not x2 in x1 and not x1 in x2):
            var_combinations2.append([min(x1,x2),max(x1,x2)])
for v in var_combinations:
    var_combinations2.append([v])
    
del var_combinations
#var_combinations2 = [['governance'],['governance','region']]#,['size','governance_simpl',['subject_matter_simpl']]] # DEBUG

# generate all combinations
for attrs in var_combinations2:
    # calc stats
    attrs = list(attrs)
    df = activity_stats_mus_groups(act_df, attrs)
    if df is None: continue
    # set up output folder
    attrs_str = '-'.join(attrs)
    fcomb_fold = cur_folder+'pivot_tables/'+attrs_str+'/'
    print(fcomb_fold)
    if not os.path.exists(fcomb_fold):
        os.makedirs(fcomb_fold)
    write_file(doc_str, fcomb_fold+'data_dictionary-pivot_tables.txt')
    
    
    # plot tables with heatmaps
    for plat in ['msg_count_facebook','msg_count_twitter','msg_count_twitter_engage']:
        for var in ['count','median','mean','q75','active_mus_pc','active_mus_pc_z','msg_per_mus']:
            target_var = plat + '_' + var
            if len(attrs) == 2:
                plot_bivar_heatmap(df, attrs_str, target_var, fcomb_fold)

    fout = fcomb_fold+'museum_activity_groups__var{}-{}.xlsx'.format(len(attrs), attrs_str)
    df.to_excel(fout,index=True)
    
    # plot
    if False:
        for att in attrs:
            ax = sns.boxplot(x="msg_count_twitter", y=att, # hue="governance",
                         data=act_df, palette="Set3")
            ax.set_xscale("log")
            plt.show()
            plt.clf()


activity_stats_mus_groups ['governance', 'subject_matter_simpl']
../../data/analysis/social_media_analytics/pivot_tables/governance-subject_matter_simpl/
plot_bivar_heatmap: governance-subject_matter_simpl msg_count_facebook_count
../../data/analysis/social_media_analytics/pivot_tables/governance-subject_matter_simpl/heatmap_bivar__governance-subject_matter_simpl-museums_n.pdf
plot_bivar_heatmap: governance-subject_matter_simpl msg_count_facebook_median
../../data/analysis/social_media_analytics/pivot_tables/governance-subject_matter_simpl/heatmap_bivar__governance-subject_matter_simpl-msg_count_facebook_median.pdf
plot_bivar_heatmap: governance-subject_matter_simpl msg_count_facebook_mean
../../data/analysis/social_media_analytics/pivot_tables/governance-subject_matter_simpl/heatmap_bivar__governance-subject_matter_simpl-msg_count_facebook_mean.pdf
plot_bivar_heatmap: governance-subject_matter_simpl msg_count_facebook_q75
../../data/analysis/social_media_analytics/pivot_tables/govern

  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-museums_n.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_median.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_mean


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_mean.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_q75.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_active_mus_pc_z
../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_facebook_msg_per_mus
../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_facebook_msg_per_mus.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_count


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-museums_n.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_median.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_mean


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_mean.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_q75.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_active_mus_pc_z
../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_msg_per_mus


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_msg_per_mus.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_engage_count


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_engage_count.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_engage_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_engage_median.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_engage_mean


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_engage_mean.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_engage_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_engage_q75.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_engage_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_engage_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_engage_active_mus_pc_z
../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_engage_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-subject_matter_simpl msg_count_twitter_engage_msg_per_mus


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-subject_matter_simpl/heatmap_bivar__accreditation-subject_matter_simpl-msg_count_twitter_engage_msg_per_mus.pdf

activity_stats_mus_groups ['region', 'subject_matter_simpl']
../../data/analysis/social_media_analytics/pivot_tables/region-subject_matter_simpl/
plot_bivar_heatmap: region-subject_matter_simpl msg_count_facebook_count
../../data/analysis/social_media_analytics/pivot_tables/region-subject_matter_simpl/heatmap_bivar__region-subject_matter_simpl-museums_n.pdf
plot_bivar_heatmap: region-subject_matter_simpl msg_count_facebook_median
../../data/analysis/social_media_analytics/pivot_tables/region-subject_matter_simpl/heatmap_bivar__region-subject_matter_simpl-msg_count_facebook_median.pdf
plot_bivar_heatmap: region-subject_matter_simpl msg_count_facebook_mean
../../data/analysis/social_media_analytics/pivot_tables/region-subject_matter_simpl/heatmap_bivar__region-subject_matter_simpl-msg_count_facebook_mean.pdf

  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-museums_n.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_median.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_mean


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_mean.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_q75.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_active_mus_pc_z
../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-size msg_count_facebook_msg_per_mus


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_facebook_msg_per_mus.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_count


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-museums_n.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_median.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_mean


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_mean.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_q75.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_active_mus_pc_z
../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_msg_per_mus
../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_msg_per_mus.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_engage_count


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_engage_count.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_engage_median


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_engage_median.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_engage_mean


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_engage_mean.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_engage_q75


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_engage_q75.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_engage_active_mus_pc


  vmin = np.nanmin(calc_data)
  vmax = np.nanmax(calc_data)


../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_engage_active_mus_pc.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_engage_active_mus_pc_z
../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_engage_active_mus_pc_z.pdf
plot_bivar_heatmap: accreditation-size msg_count_twitter_engage_msg_per_mus
../../data/analysis/social_media_analytics/pivot_tables/accreditation-size/heatmap_bivar__accreditation-size-msg_count_twitter_engage_msg_per_mus.pdf

activity_stats_mus_groups ['region', 'size']
../../data/analysis/social_media_analytics/pivot_tables/region-size/
plot_bivar_heatmap: region-size msg_count_facebook_count
../../data/analysis/social_media_analytics/pivot_tables/region-size/heatmap_bivar__region-size-museums_n.pdf
plot_bivar_heatmap: region-size msg_count_facebook_median
../../data/analysis/social_media_analytics/pivot_

### Tweets/FB posts over time

#### All messages over time

Summarise tweets using pandas aggregation.

In [143]:
doc_str = """Temporal trends:
'W','D','M','Q': week, day, month, quarter

* tweets: number of tweets by museum in a given time slot
* tweets_engage: number of tweets to the museum by other users in a given time slot
* facebook_posts: number of facebook posts by museum in a given time slot
* museums_on_tw: number of museums active on Twitter in a given time slot
* museums_on_fb: number of museums active on Facebook in a given time slot
"""
write_file(doc_str, cur_folder+'data_dictionary-temporal_trends.txt')

# last valid date in the data (based on data collection from APIs)
max_date = '2021-10-01'

def format_xaxis_month_year():
    """ Format X axis for temporal data with Year/Month grid lines  """
    ax = plt.gca()
    ax.xaxis.set_major_locator(YearLocator())
    ax.xaxis.set_major_formatter(DateFormatter('%Y'))
    ax.xaxis.set_minor_locator(MonthLocator())
    ax.xaxis.set_minor_formatter(DateFormatter('%b'))
    #plt.xticks(rotation=0, horizontalalignment='right') 
    plt.grid(b=True, which='major', color='gray', linestyle='-')
    plt.grid(b=True, which='minor', color='lightgray', linestyle='--')

def generate_time_plots(df, label, aggr_func):
    """ Time plots for social media message counts """
    print('generate_time_plots', label)
    print('date range:', min(df.index), max(df.index))
    tdf = pd.DataFrame()
    df = df[df.index < max_date]
    print('max date:', max(df.index))
    for time_aggr in ['W','D','M','Q']:
        print(time_aggr)
        tmpdf = aggr_func(df, time_aggr)
        
        # plot tweets
        plt.figure(figsize=(20,10))
        sns.lineplot(data=tmpdf, x=tmpdf.index, y="count")
        # start Y from 0 to avoid exaggerating change
        plt.ylim(0, max(tmpdf['count'])*1.1)
        plt.title('All museums - N={} - {} over time by {}'.format(len(df), label, time_aggr))
        plt.xlabel('Time')
        plt.ylabel('N  '+label)
        # set ticks for each month
        format_xaxis_month_year()
        # save figures
        fig_fn = cur_folder+'{}_temporal-all_mus-{}.pdf'.format(label,time_aggr)
        plt.savefig(fig_fn)
        print(fig_fn)
        df_fn = cur_folder+'{}_temporal-all_mus-{}.xlsx'.format(label,time_aggr)
        #plt.show()
        plt.clf(); plt.cla(); plt.close()

        # add data
        tmpdf = tmpdf.reset_index()
        tdf.append(tmpdf)
        
        tmpdf2 = tmpdf.copy()
        tmpdf2['ts'] = tmpdf2['ts'].dt.tz_localize(None)
        #display(tmpdf2)
        tmpdf2.to_excel(df_fn, index=False)
        del tmpdf2
    return tdf

def count_messages(df, time_aggr):
    tmpdf = df.resample(time_aggr).size().to_frame('count')
    tmpdf['time_unit'] = time_aggr
    return tmpdf

In [112]:
# generate tweet plots
df = pd.read_pickle(tw_fn)
print(len(df))
print(df.sample(10))
generate_time_plots(df[df.from_museum], 'tweets', count_messages)
generate_time_plots(df[~df.from_museum], 'tweets_engage', count_messages)

# generate facebook plots
df = pd.read_pickle(fb_fn)
print(len(df))
print(df.sample(10))
generate_time_plots(df, 'facebook_posts', count_messages)

3351120
                                museum_id   museum_account   author_account  \
ts                                                                            
2021-02-08 01:40:01+00:00  mm.domus.SE067           wabbey     MALLOFTHEUSA   
2020-02-21 19:03:57+00:00  mm.domus.SC264      nacheritage      NACHeritage   
2021-04-17 11:35:38+00:00     mm.ace.1152       blistshill       blistshill   
2020-12-25 08:00:00+00:00  mm.domus.SW070     museumandart     MuseumandArt   
2019-02-03 10:54:20+00:00  mm.domus.SE192    bexhillmuseum    bexhillmuseum   
2019-09-12 09:30:12+00:00  mm.domus.SE049      stjohnsgate      StJohnsGate   
2019-03-09 19:36:53+00:00     mm.musa.145             gwsr             GWSR   
2021-04-07 15:33:31+00:00     mm.wiki.422  museumfreederry  MuseumFreeDerry   
2019-03-18 08:01:17+00:00  mm.domus.EM084  northamptonshoe  NorthamptonShoe   
2021-09-15 10:43:00+00:00      mm.MDN.006  northlincmuseum  Northlincmuseum   

                           is_reply  \
ts  

In [188]:
#### Messages over time by attribute

Facebook and Twitter message stats per month/week by attribute (governance, size, etc).

In [None]:
doc_str = """Temporal trends by attribute:
'W','D','M','Q': week, day, month, quarter

Attributes: museum size, museum governance, museum governance simplified, region

* count: number of tweets/messages by museums in a given group and time slot
* count_mean: mean number of messages per museum in a given group and time slot
* n_museums: museums that have posted at least 1 message in a given group and time slot
"""
write_file(doc_str, cur_folder+'pivot_over_time/data_dictionary-pivot_over_time.txt')

def plot_social_activity_by_attribute_over_time(msg_df, platform, time_unit, metric, attr):
    print('>>>', platform, time_unit, metric, attr)
    plt.figure(figsize=(20,16))
    leg = []
    outdf = []
    linestyles = ['-','--','-.']
    # generate unified plots
    for v, subdf in msg_df.groupby(attr):
        try:
            tdf = subdf.resample(time_unit).size().to_frame('count')
        except:
            print('warning: too few data points, skipping this case!')
            continue
        # calc active museums
        active_df = subdf[['museum_id']].resample(time_unit).nunique().rename(columns={'museum_id':'n_museums'})
        tdf = tdf.join(active_df)
        # calc mean count
        tdf['mean_count'] = round(tdf['count'] / len(subdf),4)
        tdf['attribute'] = attr
        tdf['attr_val'] = v
        tdf['time_unit'] = time_unit
        outdf.append(tdf)
        bPlot = True
        if 'unknown' in v.lower(): bPlot = False
        if bPlot:
            sns.lineplot(data=tdf, x=tdf.index, y=metric, linestyle=random.choice(linestyles))
            leg.append(attr + '=' + v)

    plt.title("{} {} by museum {} - time unit: {}".format(platform.upper(), metric, attr.upper(), time_unit))
    plt.legend(leg, loc="upper right")
    format_xaxis_month_year()
    plt.xlabel('Time')
    plt.ylabel(metric)
    fig_fn = cur_folder+'pivot_over_time/pivot_over_time-{}_by_{}-{}-{}.pdf'.format(platform, attr, time_unit, metric)
    plt.savefig(fig_fn)
    print(fig_fn)
    outdf = pd.concat(outdf).drop_duplicates()
    outdf['ts'] = outdf.index.tz_localize(None)
    
    if metric == 'count':
        outdf.to_excel(cur_folder+'pivot_over_time/pivot_over_time-{}_by_{}-{}-data.xlsx'.format(platform, attr, time_unit), index=False)
    del outdf

    
for plat in ['facebook','twitter']:
    if plat == 'facebook':
        df = pd.read_pickle(fb_fn)
    else:
        df = pd.read_pickle(tw_fn)
    print(plat, 'N =', len(df))
    df['ts'] = df.index
    df = df.merge(mus_df, on='museum_id', how='outer')
    df = df.set_index('ts')
    
    for met in ['mean_count','count','n_museums']:
        for time_unit in ['M','W']:
            for attr in ['size','governance_simpl','governance','region']: #,'governance','governance_simpl']:
                plot_social_activity_by_attribute_over_time(df, plat, time_unit, met, attr)

#### Active museums over time

Count museums that are active on FB/TW in a give period

In [192]:
def count_museums(df, time_aggr):
    df = df[['museum_id']]
    tmpdf = df.resample(time_aggr).nunique()
    tmpdf.columns = ['count']
    tmpdf['time_unit'] = time_aggr
    return tmpdf

# generate tweet plots
df = pd.read_pickle(tw_fn)
print(len(df))
generate_time_plots(df[df.from_museum], 'museums_on_tw', count_museums)

# generate facebook plots
df = pd.read_pickle(fb_fn)
print(len(df))
generate_time_plots(df, 'museums_on_fb', count_museums)

3351120
generate_time_plots museums_on_tw
date range: 2019-01-01 00:00:00+00:00 2021-10-01 00:00:00+00:00
max date: 2021-09-30 23:18:18+00:00
W
../../data/analysis/social_media_analytics/museums_on_tw_temporal-all_mus-W.pdf
D
../../data/analysis/social_media_analytics/museums_on_tw_temporal-all_mus-D.pdf
M
../../data/analysis/social_media_analytics/museums_on_tw_temporal-all_mus-M.pdf
Q
../../data/analysis/social_media_analytics/museums_on_tw_temporal-all_mus-Q.pdf
1187529
generate_time_plots museums_on_fb
date range: 2019-01-01 00:00:00+00:00 2021-09-30 23:46:32+00:00
max date: 2021-09-30 23:46:32+00:00
W
../../data/analysis/social_media_analytics/museums_on_fb_temporal-all_mus-W.pdf
D
../../data/analysis/social_media_analytics/museums_on_fb_temporal-all_mus-D.pdf
M
../../data/analysis/social_media_analytics/museums_on_fb_temporal-all_mus-M.pdf
Q
../../data/analysis/social_media_analytics/museums_on_fb_temporal-all_mus-Q.pdf


### Daily heatmaps

- Calplot package: https://www.analyticsvidhya.com/blog/2021/02/visualization-in-time-series-using-heatmaps-in-python/

In [14]:
!pip install calplot

Collecting calplot
  Downloading calplot-0.1.7.3-py3-none-any.whl (8.1 kB)
Installing collected packages: calplot
Successfully installed calplot-0.1.7.3


In [11]:
import calplot

def plot_daily_heatmap(df, label):
    # count tweets per day
    cmap = None
    cdf = df[['museum_id']].resample('D').count()
    ax = calplot.calplot(cdf['museum_id'], yearlabels=True)
    fig_fn = cur_folder+'daily_heatmap-all_mus_heatmap_{}-msg.pdf'.format(label)
    plt.savefig(fig_fn)
    plt.clf()
    del cdf
    # count active museums per day
    mdf = df[['museum_id']].resample('D').nunique()
    ax = calplot.calplot(mdf['museum_id'], yearlabels=True)
    fig_fn = cur_folder+'daily_heatmap-all_mus_heatmap_{}-active_mus.pdf'.format(label)
    #plt.tight_layout()
    plt.savefig(fig_fn)
    print(fig_fn)
    plt.clf()
    del mdf

df = pd.read_pickle(tw_fn)
plot_daily_heatmap(df[df.from_museum], 'twitter')
plot_daily_heatmap(df[~df.from_museum], 'twitter_engage')

df = pd.read_pickle(fb_fn)
plot_daily_heatmap(df, 'facebook')

  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(


../../data/analysis/social_media_analytics/daily_heatmap-all_mus_heatmap_twitter-active_mus.pdf


  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(


../../data/analysis/social_media_analytics/daily_heatmap-all_mus_heatmap_twitter_engage-active_mus.pdf


  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(
  ax.set_xticks([by_day.loc[pd.Timestamp(


../../data/analysis/social_media_analytics/daily_heatmap-all_mus_heatmap_facebook-active_mus.pdf


<Figure size 900x367.2 with 0 Axes>

<Figure size 900x367.2 with 0 Axes>

<Figure size 900x367.2 with 0 Axes>

<Figure size 900x367.2 with 0 Axes>

<Figure size 900x367.2 with 0 Axes>

<Figure size 900x367.2 with 0 Axes>

End of notebook.