In [1]:
import datetime
import pandas as pd
import numpy as np
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

import pycountry
import networkx as nx

from pylab import colorbar
from pylab import cm
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

from bigbang.analysis.listserv import ListservMailList
#from bigbang.bigbang_io import pandas_dataframe_to_mbox
from bigbang.visualisation import graphs
from bigbang.datasets import organizations

from tgpp.config.config import CONFIG
import tgpp.ingress.queries as Queries
from tgpp.ingress import organizations as ORGA

plt.style.use("./publication_jcap.mplstyle")

## Load data

In [8]:
# Load data on organisations in 3GPP

df_orgcat = ORGA.load_data()
df_orgcat = ORGA.expand_rows_with_multiple_entries(df_orgcat, column='email domain names')
df_orgcat = ORGA.assign_parent_nationality(df_orgcat)
df_orgcat = ORGA.remove_leading_and_trailing_whitespaces(df_orgcat)

# choose columns of interest
df_nation = df_orgcat[[
    "name",
    "nationality",
    "email domain names",
]]
# filter out all non-empty cells
df_nation = df_nation.dropna()

# choose columns of interest
df_seccat = df_orgcat[[
    "name",
    "category",
    "email domain names",
]]
# filter out all non-empty cells
df_seccat = df_seccat.dropna(subset=["category", "email domain names"])
df_seccat = ORGA.expand_rows_with_multiple_entries(df_seccat, column='category')

In [9]:
# Load Search-set (Sset)
mlist_name = "3GPP_TSG_SA_WG3_LI"

folder_search_set = '/Users/christovis/Documents/InternetGovernance/bigbang-archive/3GPP/'
sset_mlist = ListservMailList.from_mbox(
    name=mlist_name,
    #filepath=CONFIG.folder_search_set + f"{mlist_name}.mbox",
    filepath=folder_search_set + f"{mlist_name}.mbox",
)
init_sset_mlist_len = len(sset_mlist)
print(init_sset_mlist_len)

# only keep rows that have the header field 'from'
sset_mlist.df = sset_mlist.df.dropna(subset=['from'])

# only keep rows that have the header field 'date'
#mlist.df = mlist.df.dropna(subset=['date'])

sset_mlist.df['date'] = pd.to_datetime(sset_mlist.df['date'], format="%a, %d %b %Y %H:%M:%S %z", errors="coerce")
print(len(sset_mlist))

year_of_first_msg = np.min(sset_mlist.period_of_activity()).year
year_of_last_msg = np.max(sset_mlist.period_of_activity()).year

6419
6415


In [10]:
print(
    "S-set:",
    np.min(sset_mlist.period_of_activity()),
    np.max(sset_mlist.period_of_activity()),
)

S-set: 2000-09-06 06:45:22-07:00 2022-02-26 14:29:06+00:00


In [11]:
# Load Target-set

mlist_name = "3GPP_TSG_SA_WG3_LI"

# load target-set (Tset)
tset_df = pd.read_hdf(
    CONFIG.folder_target_set + f"{mlist_name}.h5",
    key="df",
    header=0,
    index_col=0,
)

non_query_columns = [col for col in tset_df.columns if col.startswith('msg-')]
tset_df = tset_df.loc[:, tset_df.columns.isin(non_query_columns)]
relable = {col: col.replace('msg-', '') for col in tset_df.columns}
tset_df = tset_df.rename(columns=relable, errors="raise")

tset_mlist = ListservMailList.from_pandas_dataframe(
    df=tset_df,
    name=mlist_name,
    filepath=CONFIG.folder_target_set + f"{mlist_name}.h5",
)
init_tset_mlist_len = len(tset_mlist)
print(init_tset_mlist_len)

# only keep rows that have the header field 'from'
tset_mlist.df = tset_mlist.df.dropna(subset=['from'])

# only keep rows that have the header field 'date'
#tset_mlist.df = tset_mlist.df.dropna(subset=['date'])

tset_mlist.df['date'] = pd.to_datetime(
    tset_mlist.df['date'],
    format="%a, %d %b %Y %H:%M:%S %z",
    errors="coerce",
)
print(len(tset_mlist))

year_of_first_msg = np.min(tset_mlist.period_of_activity()).year
year_of_last_msg = np.max(tset_mlist.period_of_activity()).year

3177
3177


In [12]:
print(
    "T-set:",
    np.min(sset_mlist.period_of_activity()),
    np.max(sset_mlist.period_of_activity()),
)

T-set: 2000-09-06 06:45:22-07:00 2022-02-26 14:29:06+00:00


In [13]:
# Select to analyse T-set or S-set
set_lable = 'tset'
mlist = tset_mlist

## Add Nationality, Category, Stakeholder to Email Sender

In [14]:
# Add stakeholdergroup to sender

mlist.df['from_category'] = 'Unkown'
for idx, row in mlist.df.iterrows():
    _, _, domain = ListservMailList.get_name_localpart_domain(row['from'])
    category = df_seccat[df_seccat['email domain names'] == domain]['category'].values
    if len(category) > 0:
        try:
            mlist.df.loc[idx, 'from_category'] = category[0].strip()
        except:
            print(idx, category)

In [15]:
# Add stakeholdergroup to sender

mlist.df['from_stakeholdergroup'] = 'Unkown'
for idx, row in mlist.df.iterrows():
    _, _, domain = ListservMailList.get_name_localpart_domain(row['from'])
    stakeholdergroup = df_orgcat[df_orgcat['email domain names'] == domain]['stakeholdergroup'].values
    if len(stakeholdergroup) > 0:
        try:
            mlist.df.loc[idx, 'from_stakeholdergroup'] = stakeholdergroup[0].strip()
        except:
            continue
        #print(idx, stakeholdergroup)

In [16]:
# Add nationality to sender in

mlist.df['from_nationality'] = 'Unkown'
for idx, row in mlist.df.iterrows():
    #print(row['from'])
    _, _, domain = ListservMailList.get_name_localpart_domain(row['from'])
    nationality = df_nation[df_nation['email domain names'] == domain]['nationality'].values
    if len(nationality) > 0:
        try:
            mlist.df.loc[idx, 'from_nationality'] = nationality[0].strip()
        except:
            print(idx, nationality, nationality[0])

In [17]:
mlist_1 = mlist.crop_by_year(yrs=[2000, 2020])
mlist_2 = mlist.crop_by_year(yrs=[2020, 2022])

## Domain Graph

In [19]:
localparts = mlist.get_messagescount(
    header_fields=['from'],
    per_address_field=['localpart'],
    per_year=False,
)['from']

In [22]:
indices = np.argsort(list(localparts.values()))
localparts = {list(localparts.keys())[idx]: list(localparts.values())[idx] for idx in indices}

In [25]:
localparts

{'3gpplist': 1,
 'dionisio.zumerle': 1,
 'peter.musgrove': 1,
 'nalfano': 1,
 'mark.lastdrager': 1,
 'sven.jonas': 1,
 'frank.fransen': 1,
 'erich.seitz': 1,
 'john.cundall': 1,
 'frank.korinek': 1,
 'arkady_linshitz': 1,
 'sarah.werner': 1,
 'pauljbaxter': 1,
 'schramp': 1,
 'ian.cooper': 1,
 'martin.kaessens': 1,
 'jerome.gouy': 1,
 'claudio1.fusco': 1,
 'ingjo': 1,
 'gary.jones': 1,
 'peter.howard': 1,
 'brian.marcus': 1,
 'jerry.shih': 1,
 'ramantha': 1,
 'jmenard': 1,
 'etsi': 1,
 'arkadyl': 1,
 'herbert.paulis': 1,
 'janos.varro': 1,
 'jean': 1,
 'tobias.schoenberg': 2,
 'rolf.schnitzler': 2,
 'rainer.landgraf': 2,
 'michael.clayton': 2,
 'robert.ropolyi': 2,
 'mwong': 2,
 'rhys.arkins': 2,
 'erwin.foerster': 2,
 'peter.vanderarend': 2,
 'sedge': 2,
 'arjunrc': 2,
 'itsuma.tanaka.ev': 3,
 'leopold.murhammer': 3,
 'dusty.hoffpauir': 3,
 'srengasa': 3,
 'muraliv': 3,
 'alexander.retzel': 4,
 'bernie_mckibben-p17982': 4,
 'michael.hammer': 4,
 'ben.j.mitchell': 5,
 'sts.standards': 