In [66]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import plotly.express as px

## Data breaches

### Dataset contents

The dataset includes information on:

* **Year** - the year of the breach
* **Organization type** - industry sector of the organization affected by the breach
* **Method** - the event that resulted in the breach.
* **Records** - total number of records exposed in the breaches, in millions (column `Total No Records (M)`)
* **Count of Breaches** - number of breaches that occurred.

In [67]:
# Read the prepared data-breach table; the path is relative to the notebook's
# working directory. The last line displays the frame as the cell output.
breaches_csv_path = './Preparation of the dataset/Data Breaches/data_breaches.csv'
data_breaches = pd.read_csv(breaches_csv_path)
data_breaches

Unnamed: 0,Year,Organization type,Method,Total No Records (M),Count of Breaches
0,2016,healthcare,hacked,2.200000,1
1,2020,media,hacked,14.870304,1
2,2020,healthcare,poor security,0.566600,2
3,2013,tech,hacked,3350.159298,14
4,2019,tech,poor security,592.500000,4
...,...,...,...,...,...
163,2010,academic / educational,accidentally exposed,0.043000,1
164,2019,media,hacked,173.000000,1
165,2019,financial,hacked,0.098000,1
166,2020,financial,accidentally exposed,200.000000,1


## Notebook preparation

#### Checking data types

In [68]:
# Show the dtype pandas inferred for each column when reading the CSV.
data_breaches.dtypes

Year                      int64
Organization type        object
Method                   object
Total No Records (M)    float64
Count of Breaches         int64
dtype: object

In [69]:
# Count the rows that have no missing value in any column and compare that
# with the total number of rows.
n_total_rows = len(data_breaches)
n_complete_rows = len(data_breaches.dropna(how="any"))
f"complete records: {n_complete_rows}; total records: {n_total_rows}"

'complete records: 168; total records: 168'

### Converting organization types to categories

In [70]:
# Number of rows per organization type (value_counts lists the most frequent first).
data_breaches["Organization type"].value_counts()

Organization type
tech                      30
financial                 30
government                24
healthcare                23
telecom                   17
retail                    15
media                     10
other                     10
academic / educational     9
Name: count, dtype: int64

In [71]:
# Replace the text column with a pandas categorical built from its own values.
organization_type_as_category = data_breaches["Organization type"].astype("category")
data_breaches["Organization type"] = organization_type_as_category

### Converting methods to categories

In [72]:
# Number of rows per breach method (value_counts lists the most frequent first).
data_breaches["Method"].value_counts()

Method
hacked                  76
poor security           28
lost / stolen media     24
inside job              16
accidentally exposed    14
unknown                  4
misconfiguration         4
rogue contractor         1
social engineering       1
Name: count, dtype: int64

In [73]:
# Replace the text column with a pandas categorical built from its own values,
# then display the dtypes again to confirm the conversion.
method_as_category = data_breaches["Method"].astype("category")
data_breaches["Method"] = method_as_category
data_breaches.dtypes

Year                       int64
Organization type       category
Method                  category
Total No Records (M)     float64
Count of Breaches          int64
dtype: object

## Exercises

**Exercise 1**

In [74]:
# Display plotly's built-in qualitative color sequences; the charts below pick
# their palettes (Dark24, Bold, Pastel1) from this module.
px.colors.qualitative.swatches()

In [75]:
# px.bar draws one rectangle per DataFrame row, and this table holds several
# rows for the same Method / Organization type pair (for example one per year).
# With barmode="group" those rectangles are painted over each other instead of
# being added up, so a bar would show the largest single row rather than the
# total. Summing first leaves exactly one row - and therefore one bar - per pair.
# observed=True keeps only the combinations that actually occur when the
# grouping columns are categorical.
breach_counts_by_method_org = (
    data_breaches
    .groupby(["Method", "Organization type"], observed=True)["Count of Breaches"]
    .sum()
    .reset_index()
)

# Grouped bars: one cluster per method, one bar per organization type.
fig = px.bar(
    breach_counts_by_method_org,
    x="Method",
    y="Count of Breaches",
    color="Organization type",
    color_discrete_sequence=px.colors.qualitative.Dark24,
    barmode="group"
)

fig.update_layout(
    title="Number of Data Breaches by Method and Organization Type",
    title_font=dict(size=24, family="Arial"),
    title_x=0.5,  # centre the title horizontally
    xaxis_title="Method",
    yaxis_title="Count of Breaches",
    width=1200,
    height=600,
    font=dict(size=14),
    legend=dict(title="Organization Type", font=dict(size=12)),
    plot_bgcolor="rgba(245, 245, 245, 0.9)",
)

fig.update_yaxes(showgrid=True, gridwidth=0.5, gridcolor="LightGray")

fig.show()

In [76]:
# Same breach-count chart as a stack: no barmode is passed, so px.bar uses its
# default bar mode and the rows sharing a method are piled on top of each other.
breach_chart_title = "Number of Data Breaches by Method and Organization Type"

fig = (
    px.bar(
        data_frame=data_breaches,
        x="Method",
        y="Count of Breaches",
        color="Organization type",
        color_discrete_sequence=px.colors.qualitative.Dark24,
    )
    .update_layout(
        title=breach_chart_title,
        title_font={"size": 24, "family": "Arial"},
        title_x=0.5,  # centre the title horizontally
        xaxis_title="Method",
        yaxis_title="Count of Breaches",
        width=1200,
        height=600,
        font={"size": 14},
        legend={"title": "Organization Type", "font": {"size": 12}},
        plot_bgcolor="rgba(245, 245, 245, 0.9)",
    )
    .update_yaxes(showgrid=True, gridwidth=0.5, gridcolor="LightGray")
)

fig.show()

**Exercise 2**

In [77]:
# px.bar draws one rectangle per DataFrame row, and this table holds several
# rows for the same Method / Organization type pair (for example one per year).
# With barmode="group" those rectangles are painted over each other instead of
# being added up, so a bar would show the largest single row rather than the
# total. Summing first leaves exactly one row - and therefore one bar - per pair.
# observed=True keeps only the combinations that actually occur when the
# grouping columns are categorical.
leaked_records_by_method_org = (
    data_breaches
    .groupby(["Method", "Organization type"], observed=True)["Total No Records (M)"]
    .sum()
    .reset_index()
)

# Grouped bars: one cluster per method, one bar per organization type.
fig = px.bar(
    leaked_records_by_method_org,
    x="Method",
    y="Total No Records (M)",
    color="Organization type",
    color_discrete_sequence=px.colors.qualitative.Bold,
    barmode="group"
)

fig.update_layout(
    title="Number of Leaked Records by Method and Organization Type",
    title_font=dict(size=24),
    xaxis_title="Method",
    # The plotted column is expressed in millions, so the axis label says so.
    yaxis_title="Number of leaked records (millions)",
    font=dict(size=14),
    width=1000,
    height=600,
    legend=dict(title="Organization Type", font=dict(size=12)),
)

fig.show()

In [78]:
# Sum the records per Method / Organization type pair before plotting. Stacking
# would add the raw rows up anyway, but px.bar draws one rectangle per row, so
# without this step every organization type shows up as several separate slices
# inside a stack and the hover reports single rows instead of the total.
# observed=True keeps only the combinations that actually occur when the
# grouping columns are categorical.
leaked_records_by_method_org = (
    data_breaches
    .groupby(["Method", "Organization type"], observed=True)["Total No Records (M)"]
    .sum()
    .reset_index()
)

# Stacked bars: one bar per method, one segment per organization type.
fig = px.bar(
    leaked_records_by_method_org,
    x="Method",
    y="Total No Records (M)",
    color="Organization type",
    color_discrete_sequence=px.colors.qualitative.Bold,
    barmode="stack"
)

fig.update_layout(
    title="Number of Leaked Records by Method and Organization Type",
    title_font=dict(size=24),
    xaxis_title="Method",
    # The plotted column is expressed in millions, so the axis label says so.
    yaxis_title="Number of leaked records (millions)",
    font=dict(size=14),
    width=1000,
    height=600,
    legend=dict(title="Organization Type", font=dict(size=12)),
)

fig.show()

**Exercise 3**

In [81]:
# Donut charts (hole=0.4), one facet per organization type wrapped at four per
# row, showing how the leaked records split across breach methods.
fig = px.pie(
    data_breaches,
    names="Method",
    values="Total No Records (M)",
    facet_col="Organization type",
    facet_col_wrap=4,
    color_discrete_sequence=px.colors.qualitative.Pastel1,
    hole=0.4,
)

# Label each slice inside the ring and outline it with a thin grey border.
fig.update_traces(
    textinfo="percent+label",
    textposition="inside",
    marker={"line": {"width": 1, "color": "grey"}},
)

# NOTE(review): the grid below describes 2 x 4 = 8 cells, while the dataset has
# 9 organization types, which facet_col_wrap=4 spreads over 3 rows - confirm
# whether this grid setting is needed or has any effect on the faceted pies.
fig.update_layout(
    title="Distribution of Leaked Records by Method and Organization Type",
    width=1150,
    height=900,
    grid={"rows": 2, "columns": 4, "pattern": "independent"},
)

fig.show()