In [1]:
import numpy as np
import pandas as pd
import requests
import zipfile
import io

#  What hard-disk model shall we look at?

### The SMART metrics present in the dataset are model-dependent, thus the various models have to be treated differently. In this project we will focus on a single model. 

With this script we aim to identify the most convenient model to focus on. We find that the model ST4000DM000 is the most used throughout 2015, 2016, and 2017, with 36700 different hard disks, of which about 7% have failed.

### We load the datasets, extract the relevant columns, and save a per-model summary for each year in ModelsDetail_15, ModelsDetail_16, and ModelsDetail_17. The relevant columns are:

date : The time stamp of the observations.

serial_number : Uniquely identifies a hard-disk. It is used to determine the number of distinct entries.

model : Identifies the model of the hard-disk. 

failure : A value in {0,1}. When a 1 is present, the hard disk has failed on the specific date and is removed from the dataset.


In [2]:
# Build the per-model summary for 2015: the total number of failures and the
# array of distinct serial numbers observed for every hard-disk model.

# Only these columns are needed; the other columns of each CSV are skipped at
# parse time instead of being loaded and then discarded.
COLUMNS = ['date', 'serial_number', 'model', 'failure']

#Create empty list of dataframes with the data of 2015
ListDF = []

#Load data from 2015

# The timeout bounds the wait on a stalled connection; without it the request
# can block forever.
r = requests.get('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_2015.zip', timeout=60)
# Stop here with a clear HTTP error rather than failing later while trying to
# open an error page as a zip archive.
r.raise_for_status()

with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    # Keep only the CSV files stored under the '2015/' folder of the archive
    files = [name for name in z.namelist() if name.endswith('.csv') and name.startswith('2015/')]

    for file in files:
        with z.open(file) as csv_file:
            # Re-selecting with [COLUMNS] fixes the column order, since usecols
            # returns the columns in file order.
            ListDF.append(pd.read_csv(csv_file, usecols=COLUMNS)[COLUMNS])

# Concatenate once, after the loop, instead of growing a dataframe step by step
df = pd.concat(ListDF, ignore_index = True)

#Compute entries and failures per model

FailuresPerModel = df.groupby('model')['failure'].sum()
# One array of distinct serial numbers per model
EntriesPerModel = df.groupby('model')['serial_number'].unique()

ModelsDetail_15 = pd.concat([EntriesPerModel,FailuresPerModel],axis=1)


In [3]:
# Build the per-model summary for 2016: the total number of failures and the
# array of distinct serial numbers observed for every hard-disk model.

# Only these columns are needed; the other columns of each CSV are skipped at
# parse time instead of being loaded and then discarded.
COLUMNS = ['date', 'serial_number', 'model', 'failure']

# One (archive URL, folder prefix inside the archive) pair per quarter.
# An empty prefix accepts every CSV entry of the archive.
QUARTERS_2016 = [
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2016.zip', 'data_Q1_2016/'),
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2016.zip', 'data_Q2_2016/'),
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q3_2016.zip', 'data_Q3_2016/'),
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2016.zip', ''),
]

#Create empty list of dataframes with the data of 2016
ListDF = []

#Load data from 2016, one quarter at a time
for url, prefix in QUARTERS_2016:
    # The timeout bounds the wait on a stalled connection; without it the
    # request can block forever.
    r = requests.get(url, timeout=60)
    # Stop here with a clear HTTP error rather than failing later while trying
    # to open an error page as a zip archive.
    r.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        files = [name for name in z.namelist() if name.endswith('.csv') and name.startswith(prefix)]

        for file in files:
            with z.open(file) as csv_file:
                # Re-selecting with [COLUMNS] fixes the column order, since
                # usecols returns the columns in file order.
                ListDF.append(pd.read_csv(csv_file, usecols=COLUMNS)[COLUMNS])

# Concatenate once, after the loop, instead of growing a dataframe step by step
df = pd.concat(ListDF, ignore_index = True)

#Compute entries and failures per model
FailuresPerModel = df.groupby('model')['failure'].sum()
# One array of distinct serial numbers per model
EntriesPerModel = df.groupby('model')['serial_number'].unique()

ModelsDetail_16 = pd.concat([EntriesPerModel,FailuresPerModel],axis=1)

In [4]:
# Build the per-model summary for 2017: the total number of failures and the
# array of distinct serial numbers observed for every hard-disk model.

# Only these columns are needed; the other columns of each CSV are skipped at
# parse time instead of being loaded and then discarded.
COLUMNS = ['date', 'serial_number', 'model', 'failure']

# One (archive URL, folder prefix inside the archive) pair per quarter.
# An empty prefix accepts every CSV entry of the archive.
QUARTERS_2017 = [
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2017.zip', ''),
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2017.zip', ''),
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q3_2017.zip', ''),
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2017.zip', 'data_Q4_2017/'),
]

#Create empty list of dataframes with the data of 2017
ListDF = []

#Load data from 2017, one quarter at a time
for url, prefix in QUARTERS_2017:
    # The timeout bounds the wait on a stalled connection; without it the
    # request can block forever.
    r = requests.get(url, timeout=60)
    # Stop here with a clear HTTP error rather than failing later while trying
    # to open an error page as a zip archive.
    r.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        files = [name for name in z.namelist() if name.endswith('.csv') and name.startswith(prefix)]

        for file in files:
            with z.open(file) as csv_file:
                # Re-selecting with [COLUMNS] fixes the column order, since
                # usecols returns the columns in file order.
                ListDF.append(pd.read_csv(csv_file, usecols=COLUMNS)[COLUMNS])

# Concatenate once, after the loop, instead of growing a dataframe step by step
df = pd.concat(ListDF, ignore_index = True)

#Compute entries and failures per model

FailuresPerModel = df.groupby('model')['failure'].sum()
# One array of distinct serial numbers per model
EntriesPerModel = df.groupby('model')['serial_number'].unique()

ModelsDetail_17 = pd.concat([EntriesPerModel,FailuresPerModel],axis=1)


### For every year we saved, per model, the number of failures and an array of unique serial numbers

We combine the information across the three years in a common dataset. Observe that the same hard disk may have been used across different years, so serial numbers have to be de-duplicated across years before counting the disks.

In [5]:
# Combine the three yearly per-model summaries into one table with, for every
# model, the total number of failures ('failure_tot') and the number of
# distinct hard disks seen over the three years ('entries_tot').

ListDF_raw = [ModelsDetail_15.add_suffix('_15'), ModelsDetail_16.add_suffix('_16'), ModelsDetail_17.add_suffix('_17')]

# Side-by-side concatenation aligns the summaries on the model index; a model
# that is absent in a given year gets NaN in that year's columns.
DfTot = pd.concat(ListDF_raw, axis = 1)

failure_cols = ['failure_15', 'failure_16', 'failure_17']
serial_cols = ['serial_number_15', 'serial_number_16', 'serial_number_17']


def _count_distinct_serials(yearly_serials):
    """Return the number of distinct serial numbers across the yearly arrays.

    ``yearly_serials`` holds one entry per year: an array of serial numbers,
    or a scalar NaN when the model was not observed that year. Missing years
    and missing serial numbers are skipped, so they are never counted as a
    hard disk.
    """
    distinct = set()
    for serials in yearly_serials:
        # A zero-dimensional value is the NaN placeholder of an absent year
        if np.ndim(serials) == 0:
            continue
        distinct.update(serials[pd.notna(serials)])
    return len(distinct)


# A year without observations contributes zero failures
DfTot['failure_tot'] = DfTot[failure_cols].fillna(0).sum(axis=1)

# The same disk may appear in several years, hence the set-based union.
# Stacking the yearly values with np.hstack would turn the NaN of an absent
# year into an array element and count it as one extra disk.
DfTot['entries_tot'] = [
    _count_distinct_serials(yearly_serials)
    for yearly_serials in zip(*(DfTot[col] for col in serial_cols))
]

# Keep only the combined columns
DfTot = DfTot.drop(columns=failure_cols + serial_cols)


For every model we have the number of distinct hard disks and the number of failures observed.
We look at their ratio and filter out the models for which too few hard disks (1000 or fewer) have been sampled.

In [6]:
# Failure ratio: observed failures per distinct hard disk of each model
DfTot['ratio_entry_failures'] = DfTot['failure_tot'] / DfTot['entries_tot']

# Most failure-prone models first
DfTot = DfTot.sort_values('ratio_entry_failures', ascending=False)

# Display only the models with more than 1000 distinct hard disks
enough_disks = DfTot['entries_tot'] > 1000
DfTot[enough_disks]

Unnamed: 0,failure_tot,entries_tot,ratio_entry_failures
WDC WD30EFRX,122.0,1261,0.096749
ST3000DM001,106.0,1170,0.090598
ST4000DM000,2590.0,36700,0.070572
ST31500541AS,112.0,1693,0.066155
Hitachi HDS723030ALA640,44.0,1018,0.043222
Hitachi HDS722020ALA330,152.0,4683,0.032458
ST6000DX000,60.0,1938,0.03096
Hitachi HDS5C3030ALA630,96.0,4608,0.020833
Hitachi HDS5C4040ALE630,42.0,2660,0.015789
ST8000DM002,141.0,10029,0.014059
