# Description

This notebook visualizes some of the important features of the created dataset.

# Import

In [None]:
import sys, os, logging

import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../")))
from configuration import Configuration
from os.path import join as path
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
import matplotlib.pyplot as plt
# Project configuration object; later cells read paths, column names and
# mappings from it.
c = Configuration()

# Root logger: INFO and above, each record prefixed with a timestamp and level.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Fixed seed kept in one place for any step that needs reproducible randomness.
random_state = 123

## Load dataset

In [None]:
# Load only the columns the plots below use: the timestamp, the application id
# column (c.app) and the column named by c.org.
# NOTE(review): later cells index df["ORG_ID"] literally, which presumes
# c.org == "ORG_ID" — confirm against Configuration.
dataset_file = path(c.path_dataset, "4-dataset", "dataset.parquet")
wanted_columns = ["TIME_LAST", c.app, c.org]
df = pd.read_parquet(dataset_file, columns=wanted_columns)

# Add a readable "APP" column by substituting each application id with its name.
int_to_cls = c.load_mapping("int_to_cls")
df["APP"] = df[c.app].replace(int_to_cls)

## How much data does one client have?

In [None]:
# Row count per client, largest first (value_counts sorts descending by default).
df["ORG_ID"].value_counts()

## Application distribution between clients

In [None]:
# Count rows for every (client, application) pair, ordered by client then app.
counts = df[["ORG_ID", "APP"]].value_counts().sort_index()

# Pivot applications into columns so each client gets one group of side-by-side bars.
ax = counts.unstack().plot.bar(stacked=False, figsize=(10, 8))
ax.set_title("Application distribution between clients.")
ax.set_xlabel("Client ID")
ax.set_ylabel("#Data")
ax.legend(title="Applications")

plt.tight_layout()
plt.show()

## Network Traffic distribution in time between clients

In [None]:
# Traffic volume over time, one line per client.
# Each client's rows are counted in fixed-width time bins and then summed over a
# trailing time window before plotting.
TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%f"
BIN_WIDTH = "3h"       # resample frequency of the counting bins
ROLLING_WINDOW = "3h"  # trailing window; equal to BIN_WIDTH it spans a single bin

fig, ax = plt.subplots(figsize=(18, 6))
# Tick labels show the abbreviated weekday followed by HH:MM.
ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%a %H:%M'))

# NOTE(review): client ids 1..14 are hard-coded — confirm they match the dataset.
for org_id in range(1, 14 + 1):
    # assign() builds a new frame, so the timestamp conversion never writes into
    # a filtered slice of df (no SettingWithCopyWarning, dtype reliably datetime64).
    # Rows whose timestamp cannot be parsed become NaT and are dropped.
    df_org = (
        df[df["ORG_ID"] == org_id]
        .assign(
            TIME_LAST=lambda part: pd.to_datetime(
                part["TIME_LAST"], format=TIME_FORMAT, errors="coerce"
            )
        )
        .dropna(subset=["TIME_LAST"])
        .set_index("TIME_LAST")
        .sort_index()
    )

    # Non-null values per bin; every remaining column yields a count and the
    # first one is plotted.
    time_windows = df_org.resample(BIN_WIDTH).count()
    rolling_counts = time_windows.rolling(ROLLING_WINDOW, min_periods=1).sum()

    # NOTE(review): the legend label is Hungarian ("Kliens" = client) while the
    # rest of the figure text is English.
    ax.plot(rolling_counts.index, rolling_counts.iloc[:, 0], label=f'Kliens {org_id}')

    # Release the per-client intermediates before the next iteration.
    del time_windows, rolling_counts, df_org

ax.set_xlabel("Time")
ax.set_ylabel("#Data")
ax.set_title("Traffic distribution between clients.")
ax.legend()
ax.grid()
ax.margins(0)
plt.show()

## Network Traffic distribution in time between applications

In [None]:
# Traffic volume over time, one line per application.
# Each application's rows are counted in fixed-width time bins.
TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%f"
BIN_WIDTH = "3h"  # resample frequency of the counting bins

fig, ax = plt.subplots(figsize=(18, 6))
# Tick labels show the abbreviated weekday followed by HH:MM.
ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%a %H:%M'))

for app in int_to_cls.values():
    # assign() builds a new frame, so the timestamp conversion never writes into
    # a filtered slice of df (no SettingWithCopyWarning, dtype reliably datetime64).
    # Rows whose timestamp cannot be parsed become NaT and are dropped.
    df_app = (
        df[df["APP"] == app]
        .assign(
            TIME_LAST=lambda part: pd.to_datetime(
                part["TIME_LAST"], format=TIME_FORMAT, errors="coerce"
            )
        )
        .dropna(subset=["TIME_LAST"])
        .set_index("TIME_LAST")
        .sort_index()
    )

    # Non-null values per bin; the first remaining column is plotted.
    # The rolling sum that used to be computed here was never plotted and has
    # been removed.
    time_windows = df_app.resample(BIN_WIDTH).count()

    ax.plot(time_windows.index, time_windows.iloc[:, 0], label=f'{app}')

    # Release the per-application intermediates before the next iteration.
    del time_windows, df_app

ax.set_xlabel("Time")
ax.set_ylabel("#Data")
ax.set_title("Traffic distribution in time between applications.")
ax.legend()
ax.grid()
ax.margins(0)
plt.show()

## Other visualizations

In [None]:
# One figure per application group, one line per client.
# Clients listed in cluster_A are drawn solid, all others dotted.
TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%f"
BIN_WIDTH = "3h"  # resample frequency of the counting bins

cluster_A = [1, 5, 7, 8, 9, 12]

for apps in [["instagram", "facebook-graph", "discord"], ["google-www", "youtube", "snapchat", "spotify"]]:
    fig, ax = plt.subplots(figsize=(18, 6))
    # Tick labels show the abbreviated weekday followed by HH:MM.
    ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%a %H:%M'))

    # NOTE(review): client ids 1..14 are hard-coded — confirm they match the dataset.
    for org_id in range(1, 14 + 1):
        # assign() builds a new frame, so the timestamp conversion never writes
        # into a filtered slice of df. Empty selections are still plotted so each
        # client keeps the same position in the colour cycle across figures.
        df_org_app = (
            df[df["APP"].isin(apps) & (df["ORG_ID"] == org_id)]
            .assign(
                TIME_LAST=lambda part: pd.to_datetime(
                    part["TIME_LAST"], format=TIME_FORMAT, errors="coerce"
                )
            )
            .dropna(subset=["TIME_LAST"])
            .set_index("TIME_LAST")
            .sort_index()
        )

        # Non-null values per bin; the first remaining column is plotted.
        time_windows = df_org_app.resample(BIN_WIDTH).count()

        linestyle = "-" if org_id in cluster_A else ":"
        ax.plot(time_windows.index, time_windows.iloc[:, 0], linestyle=linestyle, label=f'{org_id}')

        # Release the per-client intermediates before the next iteration.
        del time_windows, df_org_app

    ax.set_xlabel("Time")
    ax.set_ylabel("#Flow")
    ax.set_title(f"#Flow distribution over time for {apps}")
    ax.legend()
    ax.grid()
    plt.show()
    # Close this figure explicitly so figures do not accumulate across iterations.
    plt.close(fig)

In [None]:
# One figure per application, one line per client; each figure is also saved
# to disk. Clients listed in cluster_A are drawn solid, all others dotted.
TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%f"
BIN_WIDTH = "3h"  # resample frequency of the counting bins

cluster_A = [1, 5, 7, 8, 9, 12]

# Make sure the output directory exists before the first savefig call.
eda_dir = path(c.path_results, "EDA")
os.makedirs(eda_dir, exist_ok=True)

for app in ["instagram", "facebook-graph", "discord", "google-www", "youtube", "snapchat", "spotify"]:
    fig, ax = plt.subplots(figsize=(18, 6))
    # Tick labels show the abbreviated weekday followed by HH:MM.
    ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%a %H:%M'))

    # NOTE(review): client ids 1..14 are hard-coded — confirm they match the dataset.
    for org_id in range(1, 14 + 1):
        # assign() builds a new frame, so the timestamp conversion never writes
        # into a filtered slice of df. Empty selections are still plotted so each
        # client keeps the same position in the colour cycle across figures.
        df_org_app = (
            df[(df["APP"] == app) & (df["ORG_ID"] == org_id)]
            .assign(
                TIME_LAST=lambda part: pd.to_datetime(
                    part["TIME_LAST"], format=TIME_FORMAT, errors="coerce"
                )
            )
            .dropna(subset=["TIME_LAST"])
            .set_index("TIME_LAST")
            .sort_index()
        )

        # Non-null values per bin; the first remaining column is plotted.
        time_windows = df_org_app.resample(BIN_WIDTH).count()

        linestyle = "-" if org_id in cluster_A else ":"
        ax.plot(time_windows.index, time_windows.iloc[:, 0], linestyle=linestyle, label=f'{org_id}')

        # Release the per-client intermediates before the next iteration.
        del time_windows, df_org_app

    ax.set_xlabel("Time")
    ax.set_ylabel("#Flow")
    ax.set_title(f"#Flow distribution over time for {app}")
    ax.legend()
    ax.grid()
    # The application name is part of the filename; a single fixed name would be
    # overwritten on every iteration and only the last figure would survive.
    # Saving happens before show() so the figure is still fully drawn.
    fig.savefig(path(eda_dir, f"flow_distribution_over_time_for_{app}_per_org.png"))
    plt.show()
    # Close this figure explicitly so figures do not accumulate across iterations.
    plt.close(fig)