# USB Data
----

## Understanding USB Data

In [1]:
# %load eda.py
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import os
import numpy as np
import pandas as pd
import dask.dataframe as dd
import pickle
import seaborn as sns             # for static plots
import matplotlib.pyplot as plt
from bqplot import *              # for interactive plots
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
# Layout shared by the bqplot figures so they stretch to the full width of the output area.
fig_layout = widgets.Layout(width = "100%")
sns.set_context('notebook', font_scale = 1.1)
# Seed NumPy's global random generator so any random sampling is repeatable.
np.random.seed(12345)
# Matplotlib overrides for the static plots: size-40 fonts and markers, thick lines,
# a serif font, LaTeX rendering off, and a 24x16 figure.
rc = {'xtick.labelsize': 40, 'ytick.labelsize': 40, 'axes.labelsize': 40, 'font.size': 40, 'lines.linewidth': 4.0, 
      'lines.markersize': 40, 'font.family': "serif", 'font.serif': "cm", 'savefig.dpi': 200,
      'text.usetex': False, 'legend.fontsize': 40.0, 'axes.titlesize': 40, "figure.figsize": [24, 16]}
# NOTE(review): sns.set() runs after sns.set_context(); it presumably re-applies seaborn's
# default context and so discards the font_scale = 1.1 requested above -- confirm.
sns.set(rc = rc)
sns.set_style("ticks")
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
from warnings import filterwarnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from pandas/dask -- confirm that is intended.
filterwarnings("ignore")

## Import the standardized data

In [2]:
# Pieces of the data location: the mount point, the dataset folder beneath it,
# and a filename glob (may include csv, hdf, json, text, etc).
hard_disk, folder, file_type = "/opt/usb/", "cert/standardized/usb", "*.parquet"

# Full glob pattern for the input files.
data_path = os.path.join(*(hard_disk, folder, file_type))

In [3]:
def import_data(path):
    """Load the standardized USB data into a distributed dask dataframe.

    After reading, the column labels are checked against Haystax's
    core-data-dictionary: they must equal the required names exactly and
    in the same order.  Column data types are NOT validated here.

    Parameters
    ----------
    path : str
        Location (or glob pattern) of the parquet files; passed unchanged
        to ``dd.read_parquet``.

    Returns
    -------
    The dask dataframe returned by ``dd.read_parquet``.

    Raises
    ------
    IOError
        If the data source cannot be read.
    ValueError
        If the columns are not the standardized ones.
    """
    try:
        ddf = dd.read_parquet(path)
    except IOError as err:
        # Raise instead of calling quit(): the notebook session stays alive and
        # the caller sees the cause.  The message reports the path that was
        # actually requested rather than module-level globals.
        raise IOError(
            "Cannot find this data source: {}. "
            "Contact David Jones for the correct data source.".format(path)
        ) from err

    # Check if data is standardized to Haystax's core data dictionary
    required_columns = ["record_id", "employee_id", "timestamp", "computer_id",
                        "file_tree", "connect_activity"]
    # Compare as plain lists so a different number of columns is reported as a
    # normal mismatch instead of relying on an exception from an element-wise
    # comparison of unequal lengths.
    found_columns = list(ddf.columns)
    if found_columns != required_columns:
        raise ValueError(
            "The columns in the data are not standardized. "
            "Expected {} but found {}. "
            "Contact David Jones for Haystax's standard-core-data dictionary."
            .format(required_columns, found_columns)
        )

    # TODO: validate column data types (categorical, datetime, int, string) once
    # the expected dtypes for the USB columns are agreed.

    return ddf

In [4]:
# Load the parquet files through import_data, which also checks the column names.
ddf = import_data(path = data_path)

In [5]:
# Fetch the first rows and print their column summary.
# NOTE(review): the output recorded for this cell lists e-mail columns
# (sender_employee_id, subject, attachment_size, ...) rather than the USB columns
# that import_data requires -- it looks stale; re-run to refresh it.
ddf.head().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 9 columns):
record_id                5 non-null object
sender_employee_id       5 non-null object
sender_username          5 non-null object
subject                  5 non-null object
timestamp                5 non-null datetime64[ns]
number_of_attachments    1 non-null object
attachment_size          5 non-null int64
email_text               5 non-null object
file_date                5 non-null object
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 440.0+ bytes


In [6]:
# Show the column labels of the loaded dataframe.
# NOTE(review): the recorded output shows e-mail columns, which would not pass the
# column check in import_data -- presumably stale output; confirm by re-running.
ddf.columns

Index(['record_id', 'sender_employee_id', 'sender_username', 'subject',
       'timestamp', 'number_of_attachments', 'attachment_size', 'email_text',
       'file_date'],
      dtype='object')

In [7]:
# Categorize the identifier-like USB columns.
# NOTE(review): `ddf` is rebound here, so anything displayed from `ddf` in earlier
# cells reflects the frame before categorization.
ddf = ddf.categorize(columns = ["record_id", "employee_id", "computer_id", "connect_activity"])

In [128]:
# Compute a single partition (n = 1) into memory for local exploration.
# NOTE(review): the execution count jumps from In [7] to In [128] at this cell, so
# the notebook was run out of order -- verify it with Restart & Run All.
df = ddf.get_partition(n = 1).compute()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87291 entries, 0 to 87290
Data columns (total 9 columns):
record_id                87291 non-null category
sender_employee_id       87291 non-null category
sender_username          87291 non-null category
subject                  87291 non-null object
timestamp                87291 non-null datetime64[ns]
number_of_attachments    20066 non-null object
attachment_size          87291 non-null int64
email_text               87291 non-null object
file_date                87291 non-null object
dtypes: category(3), datetime64[ns](1), int64(1), object(4)
memory usage: 409.1+ MB


In [129]:
# Write the in-memory partition to a CSV file in the same folder as the source data.
# NOTE(review): `df` is one unfiltered partition despite the "usb_filtered" file name,
# and to_csv presumably also writes the index as an unnamed first column -- confirm
# both are intended.
df.to_csv(os.path.join(hard_disk, folder, "usb_filtered.csv"))

In [130]:
# Ten employees with the most USB records in this partition.
# Counts record_id per employee_id; the previous selection of "sender_employee_id"
# is an e-mail field that is not part of the standardized USB columns.
df.groupby("employee_id").record_id.count().sort_values(ascending = False).head(n = 10)

sender_employee_id
GKL0006    135
DMH0011    125
LSL0001    108
BOR2387     95
WRW2331     92
UMB0055     90
SRS0129     90
MJA2380     89
BVC0009     84
KAM0005     81
Name: sender_employee_id, dtype: int64

In [131]:
# Number of partitions the dask dataframe is split into.
ddf.npartitions

127

## What is the email attachment size?

Let's visualize this data using `bqplot` for interactive charts and `matplotlib`/`seaborn` for static ones.

In [115]:
# Keep only the rows of three employee IDs of interest, then aggregate them per day.
# NOTE(review): "sender_employee_id" and "attachment_size" are e-mail fields; they are
# not among the USB columns that import_data enforces, so this cell presumably fails on
# the USB frame -- confirm which dataset it is meant for.
df_insider = df[df["sender_employee_id"].isin(["CDE1846", "HDB1666", "LIM1718"])]
# NOTE(review): despite the name `per_month`, rule = "1d" resamples into daily bins.
per_month = df_insider.resample(rule = "1d", on = "timestamp").sum().reset_index()
# NOTE(review): `emails_per_month` is not defined in any cell visible here; the next
# three lines only work with leftover kernel state and overwrite the values computed
# above. They presumably should read from `per_month` -- confirm before relying on
# the displayed numbers.
per_month["timestamp"] = pd.to_datetime(emails_per_month["timestamp"], format = "%Y-%m-%d")
per_month.attachment_size = emails_per_month.attachment_size
per_month["timestamp"] = emails_per_month["timestamp"].apply(lambda x: x.strftime('%Y-%m-%d')).astype("datetime64[ns]")
per_month.head()
per_month.info()

Unnamed: 0,timestamp,attachment_size
0,2010-02-09,12.34769
1,2010-02-10,15.199721
2,2010-02-11,18.015744
3,2010-02-12,15.658219


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
timestamp          4 non-null datetime64[ns]
attachment_size    4 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 144.0 bytes


In [116]:
# Interactive bar chart of the per-day attachment_size values held in `per_month`.
date_fmt = "%m-%d-%Y"
xs = DateScale(date_format = date_fmt)
ys = LinearScale()
x = per_month.timestamp
y = per_month.attachment_size

bar_chart = Bars(x = x, y = y, scales = {'x': xs, 'y': ys}, colors = ["blue"],
            display_legend = True, labels = ["Email"])

# Hovering shows the date and value; clicking a bar selects it and fills it red.
tt = Tooltip(fields = ["x", "y"], formats = ["", "0.2f"], labels = ["Date", "Email Size"])
bar_chart.tooltip = tt
bar_chart.selected_style = {"fill" : "red"}
bar_chart.interactions = {
    'legend_hover': 'highlight_axes',
    'hover': 'tooltip', 
    'click': 'select',
}

x_ax = Axis(scale=xs, label='Days of the Year', grid_lines='solid')
# The bars plot attachment_size, so the y-axis label and the figure title say so
# (both previously read "Number of Emails").
# NOTE(review): the unit of attachment_size is not visible here; add it to the label once confirmed.
y_ax = Axis(scale=ys, orientation='vertical', tick_format='0.2f', label='Email Attachment Size', grid_lines='solid')


# NOTE(review): panzoom is created but never attached to the figure, so it has no effect.
panzoom = PanZoom(scales={'x': [xs], 'y': [ys]})

fig1 = Figure(marks=[bar_chart], axes=[x_ax, y_ax], animation_duration=5000, layout = fig_layout, 
              background_style={'fill': 'aliceblue'}, 
              title = "Email Attachment Size sent Daily", legend_location = "top-right")
    

widgets.VBox([fig1])

VBox(children=(Figure(animation_duration=5000, axes=[Axis(label='Days of the Year', scale=DateScale()), Axis(l…

Now let's look at the case where the insider emailed IP from their employee account to their home account. Visualizing this data shows an interesting trend towards the end of the analysis period: the attachment size increases drastically in March and April of 2011.

In [132]:
# Keep the insider e-mails whose recipient field does not mention the organisation's
# domain. regex=False makes 'dtaa.com' a literal substring; as a regular expression
# the '.' would match any character.
# NOTE(review): a 'to' column does not appear in any schema shown in this notebook --
# confirm which frame this cell is meant to run on.
df_insider_non_org = df_insider[~df_insider['to'].str.contains('dtaa.com', regex=False)]
# Narrow further to mail addressed exactly to this one external address.
df_insider_ewing = df_insider_non_org[df_insider_non_org['to'] == 'Ewing_Carlos@comcast.net']
# Sum per calendar day.
df1 = df_insider_ewing.resample('1d', on='timestamp').sum().reset_index()

In [None]:
# Replace each value in "ds" with its year-month string ('%Y-%m').
# NOTE(review): no visible cell creates a "ds" column on `df`, and this cell has never
# been executed (In [None]). It is also not idempotent: after one run the column holds
# strings, so a second run would call strftime on str values and fail.
df["ds"] = df["ds"].apply(lambda x: x.strftime('%Y-%m')).astype(str)

In [None]:
# Static bar chart of "y" per "ds" value, drawn on the axes created here.
# NOTE(review): no visible cell creates "ds"/"y" columns on `df` -- confirm the input frame.
fig, ax = plt.subplots()
sns.barplot(data = df, x = "ds", y = "y", color = "blue", saturation = .5, ax = ax)
# Rotate the tick labels seaborn already placed. Overwriting them with df.ds would
# mislabel the bars whenever "ds" contains repeated values, because there is one bar
# per distinct value but one label per row.
plt.setp(ax.get_xticklabels(), rotation = 45)
ax.set_xlabel('Time')
ax.set_ylabel('Total size of emails in GB');

In [None]:
# Interactive scatter of attachment_size (divided by 1e6) against the row index of `df`.
# NOTE(review): "attachment_size" is an e-mail field, not one of the USB columns that
# import_data enforces; this cell has never been executed (In [None]).
xs = LinearScale()
ys = LinearScale()
x = df.index
y = df.attachment_size/1e6

# NOTE(review): the tooltip reads the fields "index" and "name" but labels them
# "User ID" and "Email Size" -- confirm these fields carry those values for a Scatter mark.
# `tt` is a notebook global; the bar chart under "What is the email frequency?" reuses it.
tt = Tooltip(fields = ["index", "name"], formats = ["", "0.2f"], labels = ["User ID", "Email Size"])

scatter_chart = Scatter(x=x, y=y, scales={'x': xs, 'y': ys}, colors=['red'], 
            tooltip = tt, display_legend = False, labels = ["email"], interactions = {"hover" : "tooltip"})

x_ax = Axis(scale=xs, label='x', grid_lines='solid')
# NOTE(review): the axis says GB but the values are divided by 1e6 -- confirm the
# source unit of attachment_size so the label matches.
y_ax = Axis(scale=ys, orientation='vertical', tick_format='0.2f', 
            label='Email attachment size (GB)', grid_lines='solid')

# panzoom = PanZoom(scales={'x': [xs], 'y': [ys]})
Figure(marks=[scatter_chart], axes=[x_ax, y_ax], animation_duration=1000, 
      title = "Email attachment size")

## What is the email frequency?


In [None]:
# Interactive bar chart over time built from `emails_per_month`.
# NOTE(review): `emails_per_month` is not defined in any cell visible here, so this cell
# depends on leftover kernel state; it has never been executed (In [None]).
xs = DateScale()
ys = LinearScale()
x = emails_per_month.timestamp
# NOTE(review): the plotted value is attachment_size / 1e6, yet the y-axis and title
# below say "Number of Emails" -- either the data or the labels are wrong; confirm.
y = emails_per_month.attachment_size/1e6

# NOTE(review): `tt` is taken from an earlier cell (its definition in this cell is
# commented out), so the tooltip shown depends on which cell last assigned `tt`.
bar_chart = Bars(x=x, y=y, scales={'x': xs, 'y': ys}, colors=['red'], 
            tooltip = tt, display_legend = False, labels = ["email"], interactions = {"hover" : "tooltip"})

x_ax = Axis(scale=xs, label='Months of the Year', tick_format='%b-%d', grid_lines='solid')
y_ax = Axis(scale=ys, orientation='vertical', tick_format='0.2f', 
            label='Number of Emails', grid_lines='solid')

# tt = Tooltip(fields = ["index", "name"], formats = ["", "0.2f"], labels = ["User ID", "Email Size"])
# panzoom = PanZoom(scales={'x': [xs], 'y': [ys]})

Figure(marks=[bar_chart], axes=[x_ax, y_ax], animation_duration=1000, 
      title = "Number of Emails sent Monthly")