In [1]:
import datetime
import pandas as pd
import numpy as np
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

import pycountry
import networkx as nx

from pylab import colorbar
from pylab import cm
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

from bigbang.analysis.listserv import ListservMailList
#from bigbang.bigbang_io import pandas_dataframe_to_mbox
from bigbang.visualisation import graphs
from bigbang.datasets import organizations

from tgpp.config.config import CONFIG
import tgpp.nlp.utils as NLPutils
import tgpp.ingress.queries as Queries
from tgpp.ingress import organizations as ORGA

plt.style.use("./publication_jcap.mplstyle")

## Load data

In [2]:
category_abbr_dict = {
    'Advertising company': 'Adverts',
    'Chipmaker': 'Chipmaker',
    'Civil Society Organization': 'Civic Org',
    'Cloud Provider': 'Cloud Prov',
    'Consulting': 'Consulting',
    'Consumer hardware and software vendor': 'Consumer Vendor',
    'Content Distribution Network': 'Content Distr Net',
    'Content Provider': 'Content Prov',
    'Cybersecurity': 'Cybersec',
    'Database Provider': 'DB Prov',
    'Financial Institution': 'Financial Inst',
    'Hackerspace': 'Hackerspace',
    'Hardware Developer': 'Hardware Dev',
    'Hardware Provider': 'Hardware Prov',
    'IETF secretariat': 'IETF',
    'Infrastructure Company': 'Infra Company',
    'Internet Governance Body': 'Internet Gov',
    'Internet Registry': 'Internet Registry',
    'Law Enforcement Agency': 'LEA',
    'Networking equipment vendor': 'Net Vendor',
    'Networking Service Provider': 'Net Serv Prov',
    'Real time video': 'Real time video',
    'Regional Standards Body': 'Regi Standards',
    'Regulatory Body': 'Regu Body',
    'Research Institution': 'Research Inst',
    'Satellite Communications Provider': 'Sat Comm Prov',
    'Software Provider': 'Software Prov',
    'Software developer': 'Software Dev',
    'Space Agency': 'Space Agency',
    'Standards Body': 'Standards',
    'Technology research and development company': 'Tech Res and Dev',
    'Telecommunications Provider': 'Telecomm Prov',
    'Testing and Certification': 'Testing and Certification',
}

In [3]:
# Load data on organisations in 3GPP

df_orgcat = ORGA.load_data()
df_orgcat = ORGA.expand_rows_with_multiple_entries(df_orgcat, column='email domain names')
df_orgcat = ORGA.assign_parent_nationality(df_orgcat)
df_orgcat = ORGA.remove_leading_and_trailing_whitespaces(df_orgcat)

# choose columns of interest
df_nation = df_orgcat[[
    "name",
    "nationality",
    "email domain names",
]]
# filter out all non-empty cells
df_nation = df_nation.dropna()

# choose columns of interest
df_seccat = df_orgcat[[
    "name",
    "category",
    "email domain names",
]]
# filter out all non-empty cells
df_seccat = df_seccat.dropna(subset=["category", "email domain names"])
df_seccat = ORGA.expand_rows_with_multiple_entries(df_seccat, column='category')
# use abbriviation for the figures
df_seccat["category"] = df_seccat["category"].apply(lambda x: category_abbr_dict[x])

In [4]:
# Load Search-set (Sset)
#mlist_name = "3GPP_TSG_SA_WG3_LI"
mlist_name = "3GPP_TSG_SA_WG3"

sset_mlist = ListservMailList.from_mbox(
    name=mlist_name,
    filepath=CONFIG.folder_search_set + f"{mlist_name}.mbox",
)
init_sset_mlist_len = len(sset_mlist)
print(init_sset_mlist_len)

# only keep rows that have the header field 'from'
sset_mlist.df = sset_mlist.df.dropna(subset=['from'])

# only keep rows that have the header field 'date'
#mlist.df = mlist.df.dropna(subset=['date'])

sset_mlist.df['date'] = pd.to_datetime(sset_mlist.df['date'], format="%a, %d %b %Y %H:%M:%S %z", errors="coerce")
print(len(sset_mlist))

year_of_first_msg = np.min(sset_mlist.period_of_activity()).year
year_of_last_msg = np.max(sset_mlist.period_of_activity()).year

51445
51184


In [5]:
print(
    "S-set:",
    np.min(sset_mlist.period_of_activity()),
    np.max(sset_mlist.period_of_activity()),
)

S-set: 1999-01-08 07:47:55+01:00 2021-10-22 12:25:35+00:00


In [6]:
# Load Target-set

mlist_name = "3GPP_TSG_SA_WG3_LI"

# load target-set (Tset)
tset_df = pd.read_hdf(
    CONFIG.folder_target_set + f"{mlist_name}.h5",
    key="df",
    header=0,
    index_col=0,
)

non_query_columns = [col for col in tset_df.columns if col.startswith('msg-')]
tset_df = tset_df.loc[:, tset_df.columns.isin(non_query_columns)]
relable = {col: col.replace('msg-', '') for col in tset_df.columns}
tset_df = tset_df.rename(columns=relable, errors="raise")

tset_mlist = ListservMailList.from_pandas_dataframe(
    df=tset_df,
    name=mlist_name,
    filepath=CONFIG.folder_target_set + f"{mlist_name}.h5",
)
init_tset_mlist_len = len(tset_mlist)
print(init_tset_mlist_len)

# only keep rows that have the header field 'from'
tset_mlist.df = tset_mlist.df.dropna(subset=['from'])

# only keep rows that have the header field 'date'
#tset_mlist.df = tset_mlist.df.dropna(subset=['date'])

tset_mlist.df['date'] = pd.to_datetime(
    tset_mlist.df['date'],
    format="%a, %d %b %Y %H:%M:%S %z",
    errors="coerce",
)
print(len(tset_mlist))

year_of_first_msg = np.min(tset_mlist.period_of_activity()).year
year_of_last_msg = np.max(tset_mlist.period_of_activity()).year

3177
3177


In [7]:
print(
    "T-set:",
    np.min(sset_mlist.period_of_activity()),
    np.max(sset_mlist.period_of_activity()),
)

T-set: 1999-01-08 07:47:55+01:00 2021-10-22 12:25:35+00:00


In [8]:
# Select to analyse T-set or S-set
set_lable = 'sset'
mlist = sset_mlist

## Add Nationality, Category, Stakeholder to Email Sender

In [9]:
# Add stakeholdergroup to sender

mlist.df['from_category'] = 'Unkown'
for idx, row in mlist.df.iterrows():
    _, _, domain = ListservMailList.get_name_localpart_domain(row['from'])
    category = df_seccat[df_seccat['email domain names'] == domain]['category'].values
    if len(category) > 0:
        try:
            mlist.df.loc[idx, 'from_category'] = category[0].strip()
        except:
            print(idx, category)

In [10]:
# Add stakeholdergroup to sender

mlist.df['from_stakeholdergroup'] = 'Unkown'
for idx, row in mlist.df.iterrows():
    _, _, domain = ListservMailList.get_name_localpart_domain(row['from'])
    stakeholdergroup = df_orgcat[df_orgcat['email domain names'] == domain]['stakeholdergroup'].values
    if len(stakeholdergroup) > 0:
        try:
            mlist.df.loc[idx, 'from_stakeholdergroup'] = stakeholdergroup[0].strip()
        except:
            continue
        #print(idx, stakeholdergroup)

In [11]:
# Add nationality to sender in

mlist.df['from_nationality'] = 'Unkown'
for idx, row in mlist.df.iterrows():
    #print(row['from'])
    _, _, domain = ListservMailList.get_name_localpart_domain(row['from'])
    nationality = df_nation[df_nation['email domain names'] == domain]['nationality'].values
    if len(nationality) > 0:
        try:
            mlist.df.loc[idx, 'from_nationality'] = nationality[0].strip()
        except:
            print(idx, nationality, nationality[0])

In [12]:
mlist_1 = mlist.crop_by_year(yrs=[2000, 2020])
mlist_2 = mlist.crop_by_year(yrs=[2020, 2022])

In [13]:
keyterms = {
    'corona': 0,
    'covid': 0,
    'pandemic': 0,
    'epidemic': 0,
    'wuhan': 0,
    'china': 0,
    'teams': 0,
    'zoom': 0,
    'skype': 0,
    'jitsi': 0,
    'video-conferenc': 0,
}

min_len = np.min([len(kt) for kt in keyterms.keys()])
max_len = np.max([len(kt) for kt in keyterms.keys()])

for msg_idx, msg in mlist_2.df.iterrows():
    text = NLPutils.text_preprocessing(
        msg['body'],
        min_len=2,
        max_len=40,
        keep_nonalphanumerics=['-'],
        remove_numbers=True,
        do_lemmatize=True,
        do_stemming=False,
        return_tokens=False,
    )
    for keyterm in keyterms.keys():
        keyterms[keyterm] = text.count(keyterm)
    

In [14]:
keyterms

{'corona': 0,
 'covid': 0,
 'pandemic': 0,
 'epidemic': 0,
 'wuhan': 0,
 'china': 0,
 'teams': 0,
 'zoom': 0,
 'skype': 0,
 'jitsi': 0,
 'video-conferenc': 0}

In [15]:
text

'nmy suggestion proceed isn xe ipsec nds inter plmn protection xe intruder inject traffic ul dl pdu session min sa people present security requirements ipupf discuss sa view architecture sa assume ipupf functionality clear rest discussion moot min xe ll briefly describe architecture discuss sa basically slide attach min give floor people defend solutions min seek conclusions main question prepare potential show hand nthis conclude january meet feature exception complete stage deadline nbest nlaurent nfrom thiebaut laurent nokia fr paris-saclay nsent monday december pm nto log unmask nsubject fw gpp roam security conf call options nforwarding sa list nbest nlaurent nfrom gpp tsg sa wg tsg sa security log unmask behalf steve kohalmi nsent friday december pm nto log unmask nsubject gpp roam security conf call options nsorry mess timezones nso time https doodle poll qq gyxc gnmqcdf https urldefense https nam safelinks protection outlook url https fdoodle fpoll fqq gyxc gnmqcdf data cchris 

In [16]:
mlist_2.df.loc[2869, 'body']

"b'Hello all,\\n\\nCover page: sorry, but the field \\xe2\\x80\\x9cconsequences if not approved\\xe2\\x80\\x9d is compulsory for cat-F CRs.\\n\\nRegards\\n\\nMirko Cano Soveri \\xe2\\x80\\x93 Technical Officer\\nETSI \\xe2\\x97\\x8f www.etsi.org<http://www.etsi.org/> \\xe2\\x97\\x8f mirko.cano@etsi.org<mailto:mirko.cano@etsi.org>\\nPhone: +33 (0)4 92 94 42 97 \\xe2\\x97\\x8f Mobile: +33 (0)6 73 99 62 94\\nWatch our new ETSI video \\xe2\\x80\\x9cMEC \\xe2\\x80\\x93 Close to the user, at the EDGE!<https://youtu.be/crnPWql-0oo>\\xe2\\x80\\x9d\\n\\n[Text, whiteboard  Description automatically generated]<https://youtu.be/crnPWql-0oo>\\n\\n'"