In [None]:
import sys
import json
import urllib.parse
import itertools
from typing import List, Set, Dict, Any
from pprint import pprint
from loguru import logger
import pandas as pd
from matplotlib import pyplot as plt

# filename_final_result = 'data/preprocess_output/result.json'
filename_final_result = 'data/preprocess_output/result_cz_gov.json'
# filename_final_result = 'data/preprocess_output/result_top_500.json'


def load_data(path=None):
    """Load a preprocessed result file and return the parsed JSON object.

    Args:
        path: Path of the JSON file to read. When omitted, the module-level
            ``filename_final_result`` is used, so existing ``load_data()``
            calls behave exactly as before.

    Returns:
        Whatever ``json.load`` produces for the file contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if path is None:
        # Resolved at call time so changing the global between runs is honoured.
        path = filename_final_result
    with open(path, "r", encoding="utf8") as f:
        return json.load(f)

# with open('data/pyspf/raw.json', 'r', encoding='utf8') as f:
#    pyspf = json.load(f)

    
    
def cleanup_to_pd(data):
    """Turn the raw result mapping into a table with one row per domain.

    ``data`` is handed directly to ``pd.DataFrame``. The frame is then
    transposed, so the original columns (used here as domain names) become
    rows and the original row labels become columns.

    Args:
        data: Object accepted by ``pd.DataFrame`` whose row labels include
            ``'extract_dns_txt_dmarc'``, ``'extract_dns_txt_spf'`` and
            ``'raw_result'``.

    Returns:
        A new DataFrame with a ``domain_name`` column followed by the remaining
        fields, sorted by the second-to-last dot-separated label of the domain
        name, with a fresh 0..n-1 index.

    Raises:
        KeyError: If any of the three row labels listed above is missing.
    """
    df = pd.DataFrame(data)
    # Before the transpose the fields are still row labels, so drop them by index.
    df = df.drop(index=['extract_dns_txt_dmarc', 'extract_dns_txt_spf', 'raw_result'])
    df = df.transpose()

    # The former column labels become a regular column named 'domain_name'.
    df = df.reset_index().rename(columns={'index': 'domain_name'})

    # Sort key: the label directly left of the last one ("mail.example.cz" ->
    # "example"). The slice keeps single-label names from raising IndexError;
    # they get the empty string and therefore sort first.
    df['sort_val'] = df['domain_name'].apply(lambda x: ".".join(x.split(".")[-2:-1]))

    # `columns=` replaces the positional axis argument, which pandas 2.x rejects.
    # A stable sort keeps rows with equal keys in their input order.
    df = df.sort_values(by='sort_val', kind='mergesort').drop(columns='sort_val')

    return df.reset_index(drop=True)

# Load the preprocessed JSON and reshape it into the per-domain table.
raw_data = load_data()
main_data = cleanup_to_pd(raw_data)

# Write the cleaned table to CSV.
main_data.to_csv("data/pd.csv")

# Bare last expression so the notebook renders the table.
main_data

## Table of second level domains

In [None]:
#second_level_domains = main_data.loc[(main_data['is_main_domain'])]
#second_level_domains
main_data.loc[(main_data['is_main_domain'])]

## Table of domains (all levels) which have DNS MX record.

In [None]:
has_mx_records = main_data.loc[(main_data['has_mx_record'])]
has_mx_records

## Table of second level domains which have DNS MX record.

These domains will, for the most part, form the core of the analysis.

In [None]:
second_level_domains_with_mx = main_data.loc[(main_data['is_main_domain']) & (main_data['has_mx_record'])]
second_level_domains_with_mx
# main_data.loc[(main_data['is_main_domain']) & (main_data['has_mx_record'])]

## Table of domains that have SPF but are not second-level domains

These domains will mostly be ignored for the rest of this analysis.

In [None]:
# Has SPF, but is not second level domain
main_data.loc[(main_data['is_main_domain'] == False) & (main_data['is_main_domain_or_spf_or_dmarc'])]

In [None]:
#second_level_domains_with_mx.loc[(second_level_domains_with_mx['has_spf_record']==False) & (second_level_domains_with_mx['has_dmarc_record'])]

In [None]:
# Keep the full, unfiltered table reachable under a second name. This is an
# alias to the same DataFrame object, not a copy.
main_data_backup = main_data
# From here on `data_table` is the working table: the rows selected earlier
# where both `is_main_domain` and `has_mx_record` are set.
data_table = second_level_domains_with_mx

# Remove the old names, presumably so later cells cannot use the unfiltered
# table by accident.
# NOTE(review): this makes the cell non-idempotent. Running it a second time
# raises NameError unless the cells that define these names are re-run first.
del main_data
del second_level_domains_with_mx

data_table

***

# Only domains with MX record below here

## No-SPF vs {?, ~, -}all SPF

In [None]:
#second_level_domains_with_mx.groupby(['spf_all']).agg(['count'])
all_policy = data_table.value_counts("spf_all")
all_policy

In [None]:
# plt.style.use('dark_background')
print(all_policy)
all_policy.plot.pie(
    ylabel="Default policy (all)",
    radius=1,
    autopct='%1.1f%%',
    figsize=(10,10)
).legend(prop={'size': 20})

## Table of domains that include other domains using include:domain

In [None]:
has_direct_include = data_table.loc[(data_table['direct_includes'].str.len() > 0)]
has_direct_include

In [None]:
def calcalate_freq_of_direct_includes(data):
    """Count how often each name occurs in the ``direct_includes`` values.

    Every value of ``data['direct_includes']`` is split on ``";"`` and each
    resulting token is tallied. A name repeated within one value is counted
    once per occurrence.

    Args:
        data: Mapping-like object (e.g. a DataFrame) whose ``'direct_includes'``
            entry is an iterable of ``";"``-separated strings.

    Returns:
        dict mapping each non-empty token to its occurrence count, in order of
        first appearance.
    """
    direct_includes_freq_count = {}
    for entry in data['direct_includes']:
        if not isinstance(entry, str):
            # Missing values (None/NaN) carry no includes and have no .split().
            continue
        for included in entry.split(";"):
            if not included:
                # "a;;b" or a trailing ";" yields empty tokens; "" is not a domain.
                continue
            direct_includes_freq_count[included] = direct_includes_freq_count.get(included, 0) + 1
    return direct_includes_freq_count

direct_includes_freq_count = calcalate_freq_of_direct_includes(has_direct_include)
direct_includes_freq_count

## Table of domains most directly included using include:domain

In [None]:
# One row per included name with a single 'count' column, most frequent first.
direct_includes_freq_count_pd = (
    pd.DataFrame(direct_includes_freq_count, index=['count'])
    .transpose()
    .sort_values('count', ascending=False)
)
direct_includes_freq_count_pd


## Graph of domains most directly included using include:domain (limited to the 15 most common results)

In [None]:
# Horizontal bar chart of the first 15 rows of the frequency table.
ax = direct_includes_freq_count_pd.head(15).plot(kind="barh", figsize=(10,10))
# Label fixed: the original read "Number unique domanins ...".
ax.set_xlabel("Number of unique domains that included this SPF tag")

# Render the figure explicitly instead of ending the cell with a bare `pass`.
plt.show()

In [None]:
# with open('data/pyspf.out', 'r', encoding='utf8') as f:
#     pyspf = json.load(f)

## Results for domains that permerrored (i.e. SPF will always fail)

In [None]:
# perm_errors = dict(filter(lambda x: x[1][0]=='permerror', pyspf.items()))
# perm_errors

# WARNING! main_data_backup
perm_errors = main_data_backup.loc[(main_data_backup['pyspf_result'] == 'permerror')]
perm_errors

## Results for domains that temperrored resolution

In [None]:
#temp_errors = dict(filter(lambda x: x[1][0]=='temperror', pyspf.items()))
#temp_errors

# WARNING! main_data_backup
main_data_backup.loc[(main_data_backup['pyspf_result'] == 'temperror')]

## Results for domains that allowed sending from bogus IP

In [None]:
#pass_for_bogus_ip = dict(filter(lambda x: x[1][0]=='pass', pyspf.items()))
#pass_for_bogus_ip

# WARNING! main_data_backup
main_data_backup.loc[(main_data_backup['pyspf_result'] == 'pass')]

## Table of second level domains sorted asc by directly allowed IPv4 range size (i.e. largest range first)

In [None]:
data_table.sort_values(by='extract_largest_direct_ranges_ip4', ascending=True)

## Table of second level domains sorted asc by directly allowed IPv6 range size (i.e. largest range first)

In [None]:
data_table.sort_values(by='extract_largest_direct_ranges_ip6', ascending=True)

## Table of second level domains sorted asc by directly allowed MX range size (i.e. largest range first)

In [None]:
data_table.sort_values(by='extract_largest_direct_ranges_mx', ascending=True)

## Domains with SPF record

In [None]:
data_table.loc[(data_table['has_spf_record'])]

## Domains with DMARC record

In [None]:
has_dmarc = data_table.loc[(data_table['has_dmarc_record'])]

## Domains with SPF but no DMARC

In [None]:
data_table.loc[(data_table['has_spf_record']) & (data_table['has_dmarc_record']==False)]

## Domains with no SPF but DMARC

In [None]:
data_table.loc[(data_table['has_spf_record']==False) & (data_table['has_dmarc_record'])]

In [None]:
from typing import List

def concat_columns(row, columns):
    """Build a single label string from selected fields of ``row``.

    Each name in ``columns`` contributes ``"<name>: <value>;"``, where the
    value is ``str(row[name])``. The pieces are concatenated in the given
    order with no extra separator. An empty ``columns`` yields ``""``.
    """
    return "".join(f'{col}: {str(row[col])};' for col in columns)


def graph_columns(dataset, column_names: List[str] = None, custom_lambda = None, ylabel: str = None):
    """Print and pie-chart how often each row label occurs in ``dataset``.

    A label is computed for every row, the labels are counted with
    ``value_counts()``, the counts are printed and then drawn as a pie chart.

    Args:
        dataset: DataFrame whose rows are labelled.
        column_names: Columns whose values form the label (through
            ``concat_columns``). Mutually exclusive with ``custom_lambda``.
        custom_lambda: Callable taking a row and returning its label.
            Mutually exclusive with ``column_names``.
        ylabel: Text shown next to the pie. Defaults to the comma-separated
            ``column_names``, or to an empty string when ``custom_lambda`` is
            used.

    Returns:
        None. The function only prints and plots.

    Raises:
        AssertionError: If both or neither of ``column_names`` and
            ``custom_lambda`` are given.
    """
    assert column_names is None or custom_lambda is None
    assert column_names is not None or custom_lambda is not None

    # Test against None rather than truthiness so this agrees with the asserts
    # above. With the old truthiness check an empty list passed validation but
    # left custom_lambda as None.
    if column_names is not None:
        custom_lambda = lambda row: concat_columns(row, column_names)
        if ylabel is None:
            ylabel = ", ".join(map(str, column_names))
    elif ylabel is None:
        ylabel = ""

    if len(dataset) == 0:
        # apply(axis=1) on an empty frame does not yield a label Series, so
        # there is nothing meaningful to count or draw.
        print("No rows to plot.")
        return

    data_set_applied = dataset.apply(custom_lambda, axis=1)

    data_set_applied_counted = data_set_applied.value_counts()
    print(data_set_applied_counted)

    data_set_applied_counted.plot.pie(
        ylabel=ylabel,
        radius=1,
        autopct='%1.1f%%',
        figsize=(10,10)
    ).legend(prop={'size': 20})
    

graph_columns(data_table, ["has_spf_record", "has_dmarc_record"])


In [None]:
graph_columns(data_table, custom_lambda = lambda row: concat_columns(row, ["has_spf_record", "has_dmarc_record"]))

In [None]:
graph_columns(data_table, ["has_spf_record"])

In [None]:
graph_columns(data_table, ["has_dmarc_record"])

In [None]:
graph_columns(data_table, ["dmarc_action"])

In [None]:
graph_columns(data_table, ["has_mx_record"])

In [None]:
graph_columns(data_table, ["is_main_domain"])

In [None]:
graph_columns(data_table, ["extract_largest_direct_ranges_ip4"])

In [None]:
graph_columns(data_table, ["dmarc_has_reporting"])

In [None]:
graph_columns(data_table, ["dmarc_reporting"])

In [None]:
# warning: has_dmarc instead of data_table
graph_columns(has_dmarc, ["dmarc_reporting"])

In [None]:
graph_columns(perm_errors, ["pyspf_msg"])