# Intel® SSD Data Center Tool Connector

This notebook demonstrates some of the quick analyses that can be done using the TOKIO connector for the Intel SSD Data Center Tool (ISDCT).  The format of the aggregated ISDCT outputs is specific to a tool developed at NERSC by David Paul and is therefore site-specific, but the individual parsers for each ISDCT output file are generic.

In [None]:
%matplotlib inline

In [None]:
import numpy as np

import matplotlib
matplotlib.rcParams.update({'font.size': 14})
import matplotlib.pyplot as plt

import tokio.connectors.nersc_isdct

In [None]:
# Aggregated ISDCT archive to analyze; the path is relative, so it is resolved
# against the notebook's working directory.
isdct_file = 'Intel_DCT_20170818.tgz'
# Parse the archive with the NERSC-specific TOKIO connector.
isdct_data = tokio.connectors.nersc_isdct.NerscIsdct(isdct_file)
# Tabular form used by every cell below (indexed by column name, plotted with
# .hist()); presumably one row per SSD -- TODO confirm against the connector.
isdct_df = isdct_data.to_dataframe()

## Distribution of Lifetime Read/Write Loads

In [None]:
# One figure per I/O direction: `rw` supplies the wording for the title and
# x label, `column` names the dataframe column holding lifetime bytes moved.
for rw, column in [
        ('read', 'data_units_read_bytes'),
        ('write', 'data_units_written_bytes'),
]:
    fig, ax = plt.subplots(figsize=(10, 6))
    fig.suptitle("%s Volume Distribution" % rw.title())

    # Labels first, then the grid, kept underneath the bars.
    ax.set(xlabel="TiB %s" % rw.title(), ylabel="# SSDs")
    ax.grid(True)
    ax.set_axisbelow(True)

    # Convert bytes to TiB (2**40 bytes) before binning.
    isdct_df[column].div(2.0**40).hist(ax=ax, edgecolor='black')

## Write Amplification Distribution

In [None]:
# Histogram of the per-SSD write amplification factor column.
fig, ax = plt.subplots(figsize=(10, 6))
fig.suptitle("WAF Distribution")

ax.set(xlabel="Write Amplification Factor", ylabel="# SSDs")
# Grid lines are drawn beneath the bars so they do not cut through them.
ax.grid(True)
ax.set_axisbelow(True)

isdct_df['write_amplification_factor'].hist(edgecolor='black', ax=ax)

## Drive Writes per Day

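Remember that our Intel P3608 SSDs have a warranty of 5.0 drive writes per day when provisioned at 1.6 TB capacity.  The drive writes per day (DWPD) plotted below is each drive's lifetime bytes written divided by its physical size and by the number of days it has been powered on.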

In [None]:
# Histogram of drive writes per day (DWPD):
#   lifetime bytes written / drive physical size / days powered on.
fig, ax = matplotlib.pyplot.subplots()
fig.set_size_inches(10, 6)
fig.suptitle("DWPD Distribution")

ax.set_axisbelow(True)
ax.grid(True)
ax.set_xlabel("Drive Writes per Day")
ax.set_ylabel("# SSDs")
# Number of full-drive overwrites so far; assumes physical_size is expressed in
# bytes like the numerator -- TODO confirm against the connector.
drive_writes = isdct_df['data_units_written_bytes'] / isdct_df['physical_size']
# power_on_hours / 24 is days powered on, hence the factor of 24.
dwpd = drive_writes / isdct_df['power_on_hours'] * 24.0
# A drive reporting zero power-on hours (or zero physical size) produces +/-inf
# above, and hist() raises ValueError when the data range is not finite.  Plot
# only finite values (hist() already skips NaN); `dwpd` itself is left unchanged.
dwpd.replace([np.inf, -np.inf], np.nan).hist(ax=ax, edgecolor='black')

## Correlation Scatter Plots

In [None]:
# (x column, y column) pairs of isdct_df columns; each pair is passed to
# scatter_and_fit_plot() to produce one scatter plot.
scatter_plots = [
    ('power_on_hours', 'data_units_written_bytes'),
    ('power_on_hours', 'data_units_read_bytes'),
    ('power_on_hours', 'write_amplification_factor'),
    ('smart_pli_lock_loss_count_raw', 'write_amplification_factor'),
]

In [None]:
def scatter_and_fit_plot(df, x_key, y_key, fit=True):
    """Scatter-plot one column of ``df`` against another on a new figure.

    Args:
        df: object indexable by column name whose columns expose ``.values``
            (e.g. a pandas DataFrame).
        x_key (str): column plotted on the x axis; also used to build the
            x-axis label (underscores become spaces, words are title-cased).
        y_key (str): column plotted on the y axis; labelled the same way.
        fit (bool): if True, overlay a least-squares straight line as a
            visual aid.  The line is skipped when fewer than two distinct,
            finite x values are available, since no line is determined then.

    Returns:
        None.  The figure is created as a side effect.
    """
    fig, ax = matplotlib.pyplot.subplots()
    fig.set_size_inches(10, 6)

    x = df[x_key].values
    y = df[y_key].values
    ax.plot(x, y, 'o', alpha=0.5)

    if fit:
        ### attempt a linear fit to generate a visual aid
        # Fit only on points where both coordinates are finite: a single
        # NaN/inf would otherwise corrupt or crash the least-squares solve.
        x_num = np.asarray(x, dtype=float)
        y_num = np.asarray(y, dtype=float)
        finite = np.isfinite(x_num) & np.isfinite(y_num)
        x_fit = x_num[finite]
        y_fit = y_num[finite]
        # Need at least two distinct x values for a well-posed line.
        if x_fit.size >= 2 and x_fit.min() != x_fit.max():
            m, b = np.polyfit(x_fit, y_fit, 1)
            ax.plot(x_fit, m * x_fit + b, "-")

    ax.set_xlabel(x_key.replace('_', ' ').title())
    ax.set_ylabel(y_key.replace('_', ' ').title())
    # Address the Axes explicitly rather than pyplot's "current axes" state.
    ax.grid(True)

In [None]:
# Draw one scatter plot (with the default linear-fit overlay) per column pair.
for x_key, y_key in scatter_plots:
    scatter_and_fit_plot(isdct_df, x_key=x_key, y_key=y_key)

In [None]:
# Rows whose smart_pli_lock_loss_count_raw is nonzero (strictly positive).
pli_lock_losses = isdct_df.loc[isdct_df['smart_pli_lock_loss_count_raw'] > 0]
# Show the affected rows, highest count first; the bare expression is the
# cell's displayed output.
(
    pli_lock_losses[['node_name', 'smart_pli_lock_loss_count_raw', 'power_on_hours']]
    .sort_values(by='smart_pli_lock_loss_count_raw', ascending=False)
)

In [None]:
# Lock-loss count versus power-on hours for every SSD, with the rows selected
# into pli_lock_losses ringed in red.
x_key = 'power_on_hours'
y_key = 'smart_pli_lock_loss_count_raw'
fig, ax = plt.subplots(figsize=(10, 6))

# Base layer: all drives as translucent filled markers.
ax.plot(isdct_df[x_key].values, isdct_df[y_key].values,
        linestyle='none', marker='o', alpha=0.5,
        label="All SSDs")
# Overlay: larger hollow red markers around the nonzero-count drives.
ax.plot(pli_lock_losses[x_key], pli_lock_losses[y_key],
        linestyle='none', marker='o', alpha=0.5,
        markersize=10, markerfacecolor='none', color='red',
        label="Nonzero PLI Lock Loss")

ax.legend(loc='upper right')
ax.set(xlabel=x_key.replace('_', ' ').title(),
       ylabel=y_key.replace('_', ' ').title())
# `ax` is the axes just created (and therefore pyplot's current axes).
ax.grid(True)