In [166]:
import pandas as pd
import numpy as np

% matplotlib inline
import matplotlib.pyplot as plt

from datetime import datetime

from pandas.tseries.offsets import *

AFRs based on [Backblaze Q3 2017 HDD Reliability Report](https://www.backblaze.com/blog/hard-drive-failure-rates-q3-2017/) and [2016 Cumulative Reliability Reports](https://www.backblaze.com/blog/hard-drive-benchmark-stats-2016/).

2TB Drives Listed in the 2016 report:

 * Western Digital WC20EFRX; AFR 4.94%
 * Seagate ST320LT007; AFR 47.70%
 * Seagate ST32000542AS; AFR 9.79%
 * HGST HDS722020ALA;  AFR 1.58%
 
3TB Drives:

 * Western Digital WD30EZRX; AFR 7.38%
 * Western Digital WD30EFRX; AFR 5.74%
 * Seagate ST33000651AS; AFR 4.27%
 * Seagate ST3000DM001; AFR 26.72%
 * HGST HDS723030ALA; AFR 1.92%
 * HGST HDS5C3030ALA;  AFR 0.82%
 * Toshba DT01ACA300; AFR 3.96%
 
8TB:

 * HGST HUH728080ALE600; AFR 1.70%
 * Seagate ST8000DM002; AFR 1.10%
 * Seagate ST8000NM0055; AFR 1.2%
 
The 12TB drives are too new to have reliablity stats (Backblaze has 20 instances with 500 hours total), but have a quoted reliablity of 0.35%.  Taken with a grain of salt, we've doubled that


In [310]:

labels = ['model','tb','cost','warranty']
hdds = [('WC20EFRX', 2, 85, 3 ), 
        ('WD30EFRX', 3, 106, 3),
        ('MG03ACA300', 3, 138, 3),
        ('ST8000NM0055', 8, 265, 5 ),
        ('ST1000NM0086', 8, 365, 5 ),
        ('ST12000NM007', 12, 460, 5 )] 
hdd_data = pd.DataFrame.from_records(hdds, columns=labels)

epoch = 2000

afr_horizon = 5 #years
#afr_index = pd.period_range(pd.Period(year=1,freq='A'), pd.Period(year=afr_horizon,freq='A'), freq='A')
afr_index = pd.date_range(datetime(epoch,1,1), datetime(epoch+afr_horizon,1,1), freq='A')
afr_data = pd.DataFrame( index=afr_index )

afr_data['WC20EFRX'] = (0.0494, 0.0494, 0.0494, 0.0494, 0.0494)
afr_data['WD30EFRX'] = (0.0574, 0.0574, 0.0574, 0.0574, 0.0574)
afr_data['MG03ACA300'] = (0.0396, 0.0396, 0.0396, 0.0396, 0.0396)   # backblaze doesn't use, using the consumer model
afr_data['ST8000NM0055'] = (0.0120, 0.0120, 0.0120, 0.0120, 0.0120)
afr_data['ST1000NM0086'] = (0.0120, 0.0120, 0.0120, 0.0120, 0.0120)
afr_data['ST12000NM007'] = (0.0120, 0.0120, 0.0120, 0.0120, 0.0120)

print(hdd_data)
print(afr_data)

# hdd_data = pd.DataFrame( {'afr': [0.0494, 0.0574, 0.012, 0.007],
#                           'bytes': [2*10**12, 3*10**12, 8*10**12, 12*10**12],
#                            'cost': [85,106,250,460]},
#                            index = ['2tb','3tb','8tb','12tb'])

          model  tb  cost  warranty
0      WC20EFRX   2    85         3
1      WD30EFRX   3   106         3
2    MG03ACA300   3   138         3
3  ST8000NM0055   8   265         5
4  ST1000NM0086   8   365         5
5  ST12000NM007  12   460         5
            WC20EFRX  WD30EFRX  MG03ACA300  ST8000NM0055  ST1000NM0086  \
2000-12-31    0.0494    0.0574      0.0396         0.012         0.012   
2001-12-31    0.0494    0.0574      0.0396         0.012         0.012   
2002-12-31    0.0494    0.0574      0.0396         0.012         0.012   
2003-12-31    0.0494    0.0574      0.0396         0.012         0.012   
2004-12-31    0.0494    0.0574      0.0396         0.012         0.012   

            ST12000NM007  
2000-12-31         0.012  
2001-12-31         0.012  
2002-12-31         0.012  
2003-12-31         0.012  
2004-12-31         0.012  


In [311]:
labels=['name','drive','qty','redun','purchased']
arrays = [('covis-nas1', 'WC20EFRX', 4, 1, 2011),     # Don't know what kind of drives it uses.  ReadyNAS 1500 uses "consumer drives"
            ('covis-nas3', 'WD30EFRX', 4, 1, 2014),   # Don't know what kind of drives it uses.  ReadyNAS 1500 uses "consumer drives"
            ('covis-nas5', 'MG03ACA300', 4, 1, 2014),
            ('covis-nas6', 'MG03ACA300', 4, 1, 2014),
            ('readynas-8tb', 'ST8000NM0055', 8, 2, 2018),
            ('readynas-10tb', 'ST1000NM0086', 8, 2, 2018),
            ('readynas-12tb', 'ST12000NM007', 8, 2, 2018)
         ]

array_data = pd.DataFrame.from_records( arrays, columns=labels)

print(array_data)

# array_data = pd.DataFrame( columns = ['drive', 'num', 'redun','qty','to_purchase'],
#                           data = [['2tb',4,1,2,0],
#                                   ['3tb',4,1,4,0],
#                                   ['8tb',8,2,1,8 ],
#                                   ['12tb',8,2,1,8] ] )

# ## Look up drive data
# array_data['afr'] = [hdd_data.loc[d].afr for d in array_data['drive']]
# array_data['hdd_cost'] = [hdd_data.loc[d].cost for d in array_data['drive']]
# array_data['bytes'] = [hdd_data.loc[d].bytes for d in array_data['drive']]

            name         drive  qty  redun  purchased
0     covis-nas1      WC20EFRX    4      1       2011
1     covis-nas3      WD30EFRX    4      1       2014
2     covis-nas5    MG03ACA300    4      1       2014
3     covis-nas6    MG03ACA300    4      1       2014
4   readynas-8tb  ST8000NM0055    8      2       2018
5  readynas-10tb  ST1000NM0086    8      2       2018
6  readynas-12tb  ST12000NM007    8      2       2018


In [312]:

rng = pd.date_range(datetime(2018,1,1), periods=10, freq='A')    
p_survival = pd.DataFrame(index=rng)
                         
#print(p_survival)

for index,array in array_data.iterrows():
        
    this_afr = afr_data[ array['drive'] ].copy()   
    this_afr.index = this_afr.index + DateOffset(years=array['purchased']-2000)

    ## Extend to match rng
    idx = pd.date_range(this_afr.index.min(),p_survival.index.max(),freq='A')
    
    # Forward fill to get years we don't have
    this_afr=this_afr.reindex(idx).ffill()

    
    ## Extract the AFRs for the relevant dates
    this_afr=this_afr[p_survival.index]
    this_survival = 1-this_afr
    #print(this_survival)
    
    
    drive_p_survival = np.cumprod(this_survival)
    #print(drive_p_survival)
    
    p_survival[array['name']] = drive_p_survival ** array['qty']
    

p_fail = 1-p_survival
print(p_fail)
    #this_drive = afr_lookup( array['drive'], age )

            covis-nas1  covis-nas3  covis-nas5  covis-nas6  readynas-8tb  \
2018-12-31    0.183434    0.210577    0.149237    0.149237      0.092063   
2019-12-31    0.333220    0.376811    0.276202    0.276202      0.175651   
2020-12-31    0.455530    0.508041    0.384220    0.384220      0.251543   
2021-12-31    0.555405    0.611636    0.476117    0.476117      0.320449   
2022-12-31    0.636959    0.693417    0.554300    0.554300      0.383010   
2023-12-31    0.703553    0.757976    0.620815    0.620815      0.439813   
2024-12-31    0.757931    0.808941    0.677403    0.677403      0.491385   
2025-12-31    0.802335    0.849173    0.725546    0.725546      0.538210   
2026-12-31    0.838593    0.880934    0.766505    0.766505      0.580724   
2027-12-31    0.868201    0.906007    0.801351    0.801351      0.619324   

            readynas-10tb  readynas-12tb  
2018-12-31       0.092063       0.092063  
2019-12-31       0.175651       0.175651  
2020-12-31       0.251543       0.

In [86]:
array_data['p_no_drives_fail_1yr'] = (1-array_data.afr)**array_data.num
array_data['p_no_arrays_fail_1yr'] = array_data.p_no_drives_fail_1yr ** array_data.qty

duration = 5

array_data['p_drive_failure_5yr'] = 1-(1-array_data.afr)**duration
array_data['expected_cost_5yr'] = array_data.p_drive_failure_5yr * array_data.num * array_data.qty * array_data.hdd_cost

array_data['p_no_drives_fail_5yr'] = array_data.p_no_drives_fail_1yr ** duration
array_data['p_no_arrays_fail_5yr'] = array_data.p_no_arrays_fail_1yr ** duration

array_data['startup_cost'] = array_data.hdd_cost * array_data.to_purchase
array_data['tco'] = array_data.startup_cost + array_data.expected_cost_5yr

array_data['capacity'] = (array_data.num-array_data.redun) * array_data.bytes * array_data.qty

In [87]:
print(array_data)

  drive  num  redun  qty  to_purchase     afr  hdd_cost         bytes  \
0   2tb    4      1    2            0  0.0494      85.0  2.000000e+12   
1   3tb    4      1    4            0  0.0574     106.0  3.000000e+12   
2   8tb    8      2    1            8  0.0120     250.0  8.000000e+12   
3  12tb    8      2    1            8  0.0070     460.0  1.200000e+13   

   p_no_drives_fail_1yr  p_no_arrays_fail_1yr  p_drive_failure_5yr  \
0              0.816566              0.666780             0.223772   
1              0.789423              0.388364             0.255890   
2              0.907937              0.907937             0.058577   
3              0.945353              0.945353             0.034513   

   expected_cost_5yr  p_no_drives_fail_5yr  p_no_arrays_fail_5yr  \
0         152.165270              0.363041              0.131799   
1         433.989335              0.306583              0.008835   
2         117.154353              0.616990              0.616990   
3         1