# Museums in the Pandemic - Social media analytics

**Authors**: Andrea Ballatore (KCL)

**Abstract**: Analysis of social media data

## Setup
This is to check that your environment is set up correctly (it should print 'env ok', ignore warnings).

In [1]:
# Test geospatial libraries
# check environment
import os
print("Conda env:", os.environ['CONDA_DEFAULT_ENV'])
if os.environ['CONDA_DEFAULT_ENV'] != 'mip_v1':
    raise Exception("Set the environment 'mip_v1' on Anaconda. Current environment: " + os.environ['CONDA_DEFAULT_ENV'])

# spatial libraries 
import pandas as pd
import pickle
import spacy
from termcolor import colored
import matplotlib.dates as mdates
import sys
import numpy as np
from numpy import arange
#import tensorflow as tf
from bs4 import BeautifulSoup
from bs4.element import Comment
#import torch
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
%matplotlib inline

# import from `mip` project
print(os.getcwd())
fpath = os.path.abspath('../')
if not fpath in sys.path:
    sys.path.insert(0, fpath)

out_folder = '../../'

from museums import *
from utils import _is_number
from analytics.text_models import derive_new_attributes_matches, get_all_matches_from_db, get_indicator_annotations
from museums import get_museums_w_web_urls

print('env ok')

Conda env: mip_v1
/Users/andreaballatore/Dropbox/DRBX_Docs/Work/Projects/github_projects/museums-in-the-pandemic/mip/notebooks
env ok


## Connect to DB

It needs the DCS VPN active to work.

In [13]:
# open connection to DB
from db.db import connect_to_postgresql_db

db_conn = connect_to_postgresql_db()
print("DB connected")

DB connected


## Temporal analysis of Twitter/Facebook

Temporal analysis based on message counts.

In [3]:
# load museums
df = get_museums_w_web_urls(out_folder)
print("museums url N:",len(df))
attr_df = load_input_museums_wattributes(out_folder)
df = pd.merge(df, attr_df, on='muse_id', how='left')
print("museum df with attributes: len", len(df))
mus_df = df
del df
len(mus_df)

museums urls: ../../data/museums/museum_websites_urls-v3.tsv
nationaltrust.org.uk       179
english-heritage.org.uk     52
no_resource.                33
visitscotland.com           24
nts.org.uk                  21
                          ... 
glynnvivian.co.uk            1
glynde.co.uk                 1
gwsr.com                     1
gloucesterquays.co.uk        1
smithsonfarm.co.uk           1
Name: domain, Length: 2441, dtype: int64
get_museums_w_web_urls Museums=3344 URLs=3344
museums url N: 3344
Index(['musname', 'muse_id', 'town', 'postcode', 'accreditation', 'governance',
       'size', 'subject_matter', 'closing_date', 'provenance',
       'deprivation_index', 'geodemographic_group', 'geodemographic_subgroup',
       'admin_area'],
      dtype='object')
loaded museums w attributes (open): 3341 data/museums/museums_wattributes-2020-02-23.tsv
museum df with attributes: len 3344


3344

### Download Twitter/Facebook from DB

In [14]:
sql = "select muse_id as museum_id, account, tw_ts as ts from twitter.tweets_dump td;" # limit 10000
df = pd.read_sql(sql, db_conn)
df['ts'] = pd.to_datetime(df['ts'])
print(df.columns)
df = df.set_index('ts')
tw_fn = out_folder+'tmp/tweets.pik'
df.to_pickle(tw_fn)
print(len(df))
tw_fn

Index(['museum_id', 'account', 'ts'], dtype='object')
5653012


'../../tmp/tweets.pik'

In [5]:
sql = "select museum_id, post_ts as ts from facebook.facebook_posts_dump;" # limit 10000
df = pd.read_sql(sql, db_conn)
fb_fn = out_folder+'tmp/facebook_posts.pik'
df['post_ts'] = pd.to_datetime(df['post_ts'])
print(df.columns)
df = df.set_index('post_ts')
df.to_pickle(fb_fn)
print(len(df))
fb_fn

Index(['museum_id', 'post_ts'], dtype='object')
1468515


'../../tmp/facebook_posts.pik'

### All tweets/posts over time

Summarise tweets using pandas aggregation.

In [57]:
def generate_time_plots(df, label, aggr_func):
    print('generate_time_plots', label)
    print('date range:', min(df.index), max(df.index))
    tdf = pd.DataFrame()
    for time_aggr in ['W','D','M','Q']:
        print(time_aggr)
        tmpdf = aggr_func(df, time_aggr)

        # plot tweets
        fig, ax = plt.subplots()
        tmpdf['count'].plot(linewidth=1, figsize=(20,7))
        plt.title('All museums - N={} - {} over time by {}'.format(len(df), label, time_aggr))
        plt.xlabel('Time')
        plt.grid(True, which='both')
        plt.ylabel('N  '+label)
        #ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))
        fig_fn = out_folder+'data/analysis/temporal_analysis/{}_temporal-all_mus-{}.pdf'.format(label,time_aggr)
        plt.savefig(fig_fn)
        print(fig_fn)
        #plt.show()
        
        plt.clf(); plt.cla(); plt.close()

        # add data
        tmpdf = tmpdf.reset_index()
        tdf.append(tmpdf)

    return tdf

def count_messages(df, time_aggr):
    tmpdf = df.resample(time_aggr).size().to_frame('count')
    tmpdf['time_unit'] = time_aggr
    return tmpdf

# generate tweet plots
df = pd.read_pickle(tw_fn)
print(len(df))
print(df.sample(10))
generate_time_plots(df, 'tweets', count_messages)

# generate facebook plots
df = pd.read_pickle(fb_fn)
print(len(df))
print(df.sample(10))
generate_time_plots(df, 'facebook_posts', count_messages)

5653012
                                museum_id         account
ts                                                       
2020-07-30 12:03:05+00:00  mm.domus.SE309  hornimanmuseum
2020-06-12 18:51:18+00:00  mm.domus.WM094   mjqbirmingham
2020-08-17 19:11:40+00:00     mm.aim.0245  croydonairport
2020-05-19 15:30:45+00:00  mm.domus.SE290     guildfordbc
2019-08-07 14:00:49+00:00  mm.domus.WM068   sarehole_mill
2020-02-04 12:22:06+00:00  mm.domus.SE079         kent_cc
2020-06-24 08:01:19+00:00  mm.domus.SC281  somersetmuseum
2021-02-09 11:53:21+00:00  mm.domus.EM034    derbymuseums
2021-08-21 09:18:47+00:00      mm.hha.190     cressellycc
2020-05-24 16:35:03+00:00  mm.domus.SE511       rafmuseum
generate_time_plots
date range: 2019-01-01 00:00:00+00:00 2021-10-15 12:45:39+00:00
W
../../data/analysis/temporal_analysis/tweets_temporal-all_mus-W.pdf
D
../../data/analysis/temporal_analysis/tweets_temporal-all_mus-D.pdf
M
../../data/analysis/temporal_analysis/tweets_temporal-all_mus-M.pdf
Q


### Active museums over time

Count museums that are active on FB/TW in a give period

In [1]:
def count_museums(df, time_aggr):
    df = df[['museum_id']]
    tmpdf = df.resample(time_aggr).nunique()
    tmpdf.columns = ['count']
    tmpdf['time_unit'] = time_aggr
    return tmpdf

# generate tweet plots
df = pd.read_pickle(tw_fn)
print(len(df))
generate_time_plots(df, 'museums_on_tw', count_museums)

# generate facebook plots
df = pd.read_pickle(fb_fn)
print(len(df))
generate_time_plots(df, 'museums_on_fb', count_museums)

TODO: heatmap https://www.analyticsvidhya.com/blog/2021/02/visualization-in-time-series-using-heatmaps-in-python/

SyntaxError: invalid syntax (3252168005.py, line 18)

### Activity by museum
TODO

In [None]:
TODO
for index, row in mus_df.sample(frac=1).iterrows():
    muse_id = row['muse_id']
    
    #df = pd.read_sql(sql, db_conn)

### Activity by museum groups
TODO

End of notebook.