In [1]:
import datetime
import pandas as pd
import numpy as np
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

from pylab import cm
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

from bigbang.analysis.listserv import ListservMailList
#from bigbang.bigbang_io import pandas_dataframe_to_mbox

from tgpp.config.config import CONFIG
import tgpp.ingress.queries as Queries

# Apply a local matplotlib style sheet. The path is relative, so it is resolved
# against the kernel's current working directory.
plt.style.use("./publication_jcap.mplstyle")

In [2]:
def format_dictionary(dic: dict, threshold: Union[int, float]) -> dict:
    """
    Format dictionary for pie plot.

    The entries of ``dic`` are sorted by value in descending order, the values
    are converted with ``ListservMailList.to_percentage`` and every entry that
    does not pass ``threshold`` is merged into a single ``"others"`` entry.

    Parameters
    ----------
    dic : Mapping from label to a numeric value.
    threshold : Rule selecting the entries that keep their own key.
        - integer (``int`` or ``np.integer``): entries at the sorted positions
          ``0 .. threshold`` (inclusive) are kept, i.e. ``threshold + 1``
          entries.
        - float (``float`` or ``np.floating``): entries whose converted value
          is ``>= threshold`` are kept.

    Returns
    -------
    dict whose first key is ``"others"`` (sum of the merged values, ``0`` if
    nothing was merged) followed by the kept entries in descending order.

    Raises
    ------
    TypeError if ``threshold`` is neither an integer nor a float. Without this
    check such a threshold would silently yield ``{"others": 0}``.
    """
    # numpy scalars (e.g. the result of np.argmax or a DataFrame cell) are
    # accepted alongside the builtin numeric types.
    if isinstance(threshold, (int, np.integer)):
        select_by_rank = True
    elif isinstance(threshold, (float, np.floating)):
        select_by_rank = False
    else:
        raise TypeError(
            f"threshold must be an int or a float, got {type(threshold).__name__}"
        )

    # Materialise keys/values once instead of once per sorted element.
    keys = list(dic.keys())
    values = list(dic.values())
    descending = np.argsort(values)[::-1]
    sorted_keys = [keys[indx] for indx in descending]
    percentages = ListservMailList.to_percentage(
        [values[indx] for indx in descending]
    )

    # "others" is inserted first so that it is always the first key.
    dic_filtered = {"others": 0}
    for index, (key, value) in enumerate(zip(sorted_keys, percentages)):
        keep = index <= threshold if select_by_rank else value >= threshold
        if keep:
            dic_filtered[key] = value
        else:
            dic_filtered["others"] += value
    return dic_filtered

## Load data

In [3]:
# Load data on organisations in 3GPP

# NOTE(review): absolute, machine-specific path; the cell only runs where this file exists.
file_path = "/Users/christovis/Documents/InternetGovernance/bigbang/examples/organizations/organization_categories.csv"
df_organization_categories = pd.read_csv(
    file_path,
    sep=",",
    header=0,
    index_col=False,
)
# choose columns of interest
df_nation = df_organization_categories[[
    "name",
    "nationality",
    "email domain names",
]]
# drop every row that has an empty cell in one of the selected columns
df_nation = df_nation.dropna()

# A cell of this column may hold several comma-separated email domains.
domain_col = "email domain names"
domain_lists = df_nation[domain_col].astype(str).str.split(",")
is_multi_domain = domain_lists.str.len() > 1

df_nation_multi = df_nation.loc[is_multi_domain]
df_nation_single = df_nation.loc[~is_multi_domain]

# Single-domain rows: strip surrounding whitespace so the value can compare
# equal to a domain extracted from a sender address.
single_rows = df_nation_single.assign(
    **{domain_col: df_nation_single[domain_col].astype(str).str.strip()}
)
# Multi-domain rows: one output row per domain, name and nationality repeated.
multi_rows = (
    df_nation_multi
    .assign(**{domain_col: domain_lists[is_multi_domain]})
    .explode(domain_col)
    .reset_index(drop=True)
)
multi_rows[domain_col] = multi_rows[domain_col].str.strip()

# Same ordering as before: single-domain organisations first, expanded ones after.
df_nation = pd.concat([single_rows, multi_rows], ignore_index=True)
# A trailing comma (e.g. "a.com,") would otherwise leave a row with an empty domain.
df_nation = df_nation[df_nation[domain_col] != ""].reset_index(drop=True)

In [56]:
# Read the target-set (Tset) of one mailing list and parse its date column

mlist_name = "3GPP_TSG_SA_WG3_LI"

# The folder string is concatenated as-is with the file name.
tset_path = CONFIG.folder_target_set + f"{mlist_name}.h5"
# NOTE(review): `header` and `index_col` look like read_csv options — confirm
# that read_hdf needs them.
tset = pd.read_hdf(tset_path, key="df", header=0, index_col=0).dropna()
# Dates are given in the form "Mon, 01 Jan 2021 12:00:00 +0000".
tset = tset.assign(
    **{"msg-date": pd.to_datetime(tset["msg-date"], format="%a, %d %b %Y %H:%M:%S %z")}
)

## Select keyterm & Add nationality

In [57]:
keyterm = "malici"
col_with_keyterm = list(filter(lambda col: keyterm in col, tset.columns))
# keep only the rows whose values summed over the keyterm columns are non-zero,
# then replace any remaining NaN by 0
keyterm_total = tset[col_with_keyterm].sum(axis=1)
tset_kt = tset.loc[keyterm_total != 0].fillna(0)

In [73]:
# Add nationality to sender in target-set

# Look-up table: email domain -> nationality.
# TODO: there are some domain names associated to multiple nationalities;
# the first entry in df_nation is used for those.
domain_to_nationality = (
    df_nation
    .drop_duplicates(subset="email domain names", keep="first")
    .set_index("email domain names")["nationality"]
)
# third element returned by get_name_localpart_domain is the sender's domain
sender_domains = tset_kt["msg-from"].map(
    lambda sender: ListservMailList.get_name_localpart_domain(sender)[2]
)
# Build the column in one step: pre-filling it with NaN (float) and writing
# strings into it row by row triggers pandas' incompatible-dtype deprecation.
# Domains without an entry in the look-up table become NaN.
tset_kt = tset_kt.assign(
    **{"msg-nationality": sender_domains.map(domain_to_nationality)}
)
# drop rows that contain NaN, which includes every sender without a matched nationality
tset_kt = tset_kt.dropna()
tset_kt = Queries.remove_text_wo_query(tset_kt)
tset_kt = Queries.remove_query_wo_text(tset_kt)

## Split into header, body and attachment

In [65]:
# Email header fields of the Tset: every column whose name starts with 'msg-'
non_query_columns = [name for name in tset_kt.columns if name.startswith('msg-')]
is_header_column = tset_kt.columns.isin(non_query_columns)
df_msg = tset_kt.loc[:, is_header_column]

# Keyterm columns of the Tset: everything that is not a header field,
# further divided by the 'body-' and 'attachment-' prefixes
df_qu = tset_kt.loc[:, ~is_header_column]
body_query_columns = [name for name in df_qu.columns if name.startswith('body-')]
attachment_query_columns = [name for name in df_qu.columns if name.startswith('attachment-')]

# Body part: apply both Queries filters in sequence, keeping the row labels
df_qu_bo = Queries.remove_query_wo_text(
    Queries.remove_text_wo_query(tset_kt.loc[:, body_query_columns], reset_index=False),
    reset_index=False,
)

# Attachment part: same two filters, same order
df_qu_at = Queries.remove_query_wo_text(
    Queries.remove_text_wo_query(tset_kt.loc[:, attachment_query_columns], reset_index=False),
    reset_index=False,
)

In [74]:
# Inspect the 'body-' keyterm columns that remain after the Queries filters
df_qu_bo

Unnamed: 0,body- access,body- iri,body- lmisf,body- point intercept,body- protect,body- requir
2,1,1,4,1,14,5


In [75]:
# Inspect the 'attachment-' keyterm columns that remain after the Queries filters
df_qu_at

Unnamed: 0,attachment- access,attachment- access gateway,attachment- access provid,attachment- agw,attachment- ap,attachment- authent,attachment- basic encod rule,attachment- ber,attachment- call session control function,attachment- cc,...,attachment- nid,attachment- poi,attachment- point intercept,attachment- protect,attachment- quic,attachment- requir,attachment- secur,attachment- surveil,attachment- tls,attachment- transport layer secur
0,33,0,6,0,3,1,3,7,0,51,...,3,56,4,23,0,115,64,3,24,10
1,0,0,0,0,0,0,0,4,0,7,...,0,0,0,7,1,15,8,0,0,0
2,4,1,0,5,1,1,0,0,1,0,...,0,0,0,8,0,3,15,0,0,0


In [76]:
# Look up the archive link of the message with index label 2
tset_kt.loc[2, 'msg-archived-at']

'<https://list.etsi.org/scripts/wa.exe?A2=3GPP_TSG_SA_WG3_LI;20fa3cf6.1609C&S=>'

In [26]:
# Read one keyterm column and the archive link of the row labelled 3 from the
# unfiltered target-set.
# NOTE(review): the column name 'body- protect ' ends with a space — confirm it
# matches the stored column name exactly.
tset[['body- protect ', 'msg-archived-at']].loc[3, :].values

array([8,
       '<https://list.etsi.org/scripts/wa.exe?A2=3GPP_TSG_SA_WG3_LI;70d545d0.2202C&S=>'],
      dtype=object)