In [1]:
import datetime
import pandas as pd
import numpy as np
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

from pylab import cm
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

from bigbang.analysis.listserv import ListservMailList
#from bigbang.bigbang_io import pandas_dataframe_to_mbox

from tgpp.config.config import CONFIG
import tgpp.ingress.queries as Queries
from tgpp.ingress import organizations as ORGA

plt.style.use("./publication_jcap.mplstyle")

In [2]:
def format_dictionary(dic: dict, threshold: Union[int, float]) -> dict:
    """
    Format dictionary for pie plot.
    """
    dic_sorted = {
        list(dic.keys())[indx]: list(dic.values())[indx]
        for indx in np.argsort(list(dic.values()))[::-1]
    }

    percentages = ListservMailList.to_percentage(list(dic_sorted.values()))
    dic_sorted = {key: value for key, value in zip(dic_sorted.keys(), percentages)}

    dic_filtered = {"others": 0}
    
    if isinstance(threshold, int):
        for index, (key, value) in enumerate(dic_sorted.items()):
            if index <= threshold:
                dic_filtered[key] = value
            else:
                dic_filtered["others"] += value
    elif isinstance(threshold, float):
        for key, value in dic_sorted.items():
            if value >= threshold:
                dic_filtered[key] = value
            else:
                dic_filtered["others"] += value
    return dic_filtered

## Load data

In [3]:
# Load data on organisations in 3GPP

df_orgcat = ORGA.load_data()
df_orgcat = ORGA.expand_rows_with_multiple_entries(df_orgcat, column='email domain names')
df_orgcat = ORGA.assign_parent_nationality(df_orgcat)
df_orgcat = ORGA.remove_leading_and_trailing_whitespaces(df_orgcat)

# choose columns of interest
df_nation = df_orgcat[[
    "name",
    "nationality",
    "email domain names",
]]
# filter out all non-empty cells
df_nation = df_nation.dropna()

In [4]:
# Load Target-set

mlist_name = "3GPP_TSG_SA_WG3_LI"

# load target-set (Tset)
tset = pd.read_hdf(
    CONFIG.folder_target_set + f"{mlist_name}.h5",
    key="df",
    header=0,
    index_col=0,
)
tset = tset.dropna()
tset['msg-date'] =  pd.to_datetime(tset['msg-date'], format="%a, %d %b %Y %H:%M:%S %z")

## Select keyterm & Add nationality

In [5]:
keyterm = "sentence rephrase"
col_with_keyterm = [col for col in tset.columns if keyterm in col]
# remove rows containing NaNs in keyterm-column
tset_kt = tset[tset[col_with_keyterm].sum(axis=1) != 0]
tset_kt = tset_kt.fillna(0)

In [6]:
# Add nationality to sender in target-set

tset_kt['msg-nationality'] = np.nan
for idx, row in tset_kt.iterrows():
    _, _, domain = ListservMailList.get_name_localpart_domain(row['msg-from'])
    nationality = df_nation[df_nation['email domain names'] == domain]['nationality'].values
    if len(nationality) > 0:
        # TODO: there are some domain names associated to multiple nationailies
        tset_kt.loc[idx, 'msg-nationality'] = nationality[0]
tset_kt = tset_kt.dropna()
tset_kt = Queries.remove_text_wo_query(tset_kt)
tset_kt = Queries.remove_query_wo_text(tset_kt)

## Split into header, body and attachment

In [7]:
# get Tset Email header fields
non_query_columns = [col for col in tset_kt.columns if col.startswith('msg-')]
df_msg = tset_kt.loc[:, tset_kt.columns.isin(non_query_columns)]

# get Tset Email keyterms
df_qu = tset_kt.loc[:, ~tset_kt.columns.isin(non_query_columns)]
body_query_columns = [col for col in df_qu.columns if col.startswith('body-')]
attachment_query_columns = [col for col in df_qu.columns if col.startswith('attachment-')]
df_qu_bo = tset_kt.loc[:, body_query_columns]
df_qu_at = tset_kt.loc[:, attachment_query_columns]

df_qu_bo = Queries.remove_text_wo_query(df_qu_bo, reset_index=False)
df_qu_bo = Queries.remove_query_wo_text(df_qu_bo, reset_index=False)

df_qu_at = Queries.remove_text_wo_query(df_qu_at, reset_index=False)
df_qu_at = Queries.remove_query_wo_text(df_qu_at, reset_index=False)

In [8]:
df_qu_bo

In [9]:
df_qu_at

In [10]:
tset_kt.loc[2, 'msg-archived-at']

KeyError: 2

In [None]:
tset[['body- protect ', 'msg-archived-at']].loc[3, :].values