In [None]:
import numpy as np
import pandas as pd
import requests
import zipfile
import io

## Save the dataset associated with the model ST4000DM000.

Every model is associated with different SMART metrics. Moreover, for every relevant SMART metric we will work with the normalized value (between 0 and 100/200/260 depending on the metric; the higher the better in terms of performance).

In [None]:
## NonNullCols has a True value in correspondence of a column we want to save.
## The mask is positional: 5 leading columns, then two consecutive columns per
## SMART attribute, of which only the first one can be kept.

# SMART attributes in the order their column pairs appear in the mask
_SMART_IDS = (
    1, 2, 3, 4,
    5, 7, 8, 9,
    10, 11, 12, 13,
    15, 22, 183, 184,
    187, 188, 189, 190,
    191, 192, 193, 194,
    195, 196, 197, 198,
    199, 200, 201, 220,
    222, 223, 224, 225,
    226, 240, 241, 242,
    250, 251, 252, 254, 255,
)

# SMART attributes whose first column (the normalized value) is saved
_KEPT_SMART_IDS = {5, 7, 9, 183, 187, 189, 190, 193, 194, 197, 198}

# date, serial number, model, capacity, failure
NonNullCols = [True, True, False, False, True]
for _smart_id in _SMART_IDS:
    # first column of the pair is kept only for the selected attributes,
    # the second column of the pair is always dropped
    NonNullCols.extend([_smart_id in _KEPT_SMART_IDS, False])

# model and capacity are constant when we select only model ST4000DM000,
# so they are not saved.

# SMART 2 - 8 - 11 - 13 - 15 - 22 - 195 - 196 - 200 - 201 - 220 - 222 - 223 - 224 - 225 - 226 - 250 - 251 - 252 - 254 - 255
# are not present for model ST4000DM000.

# SMART 1 - 3 - 4 - 10 - 12 - 184 - 188 - 191 - 192 - 199 - 240 - 241 - 242
# have a very low (some null) variance across the 2015 dataset, so those
# columns are removed as well.

ModelName = 'ST4000DM000'

### For each year we maintain 14 columns (date, serial_number, failure and 11 normalized SMART metrics)

The resulting datasets are saved and have the following features:

2015 : 6.8 million rows and 0.6 GB of space

2016 : 12.4 million rows and 1.1 GB of space

2017 : 12.2 million rows and 1.1 GB of space


In [None]:
# Build the 2015 extract: download the yearly archive, keep only the rows of
# ModelName and the columns flagged in NonNullCols, then save the result as CSV.
ListDF_ST = []

# Load data from 2015
response = requests.get('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_2015.zip')
# Stop on an HTTP error here instead of handing an error page to ZipFile,
# which would otherwise fail later with a misleading BadZipFile.
response.raise_for_status()

with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
    # Only the CSV entries stored under the '2015/' folder are read.
    csv_names = [name for name in archive.namelist()
                 if name.endswith('.csv') and name.startswith('2015/')]

    for csv_name in csv_names:
        # Close every archive member as soon as it has been parsed.
        with archive.open(csv_name) as csv_file:
            frame = pd.read_csv(csv_file, parse_dates=[0])
        # Row filter on the model, then positional boolean mask on the columns.
        ListDF_ST.append(frame[frame['model'] == ModelName].iloc[:, NonNullCols])

df = pd.concat(ListDF_ST, ignore_index=True)
df.to_csv('Data/Wrangled/STModel_15.csv')

In [None]:
# Build the 2016 extract: download the four quarterly archives, keep only the
# rows of ModelName and the columns flagged in NonNullCols, then save as CSV.
ListDF_ST = []

# (archive URL, path prefix the CSV entries must have inside the archive).
# An empty prefix accepts every '.csv' entry of the archive (used for Q4).
archives_2016 = [
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2016.zip', 'data_Q1_2016/'),
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2016.zip', 'data_Q2_2016/'),
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q3_2016.zip', 'data_Q3_2016/'),
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2016.zip', ''),
]

for archive_url, csv_prefix in archives_2016:
    response = requests.get(archive_url)
    # Stop on an HTTP error here instead of handing an error page to ZipFile,
    # which would otherwise fail later with a misleading BadZipFile.
    response.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        csv_names = [name for name in archive.namelist()
                     if name.endswith('.csv') and name.startswith(csv_prefix)]

        for csv_name in csv_names:
            # Close every archive member as soon as it has been parsed.
            with archive.open(csv_name) as csv_file:
                frame = pd.read_csv(csv_file, parse_dates=[0])
            # Row filter on the model, then positional boolean mask on the columns.
            ListDF_ST.append(frame[frame['model'] == ModelName].iloc[:, NonNullCols])

df = pd.concat(ListDF_ST, ignore_index=True)
df.to_csv('Data/Wrangled/STModel_16.csv')


In [None]:
# Build the 2017 extract: download the four quarterly archives, keep only the
# rows of ModelName and the columns flagged in NonNullCols, then save as CSV.
ListDF_ST = []

# (archive URL, path prefix the CSV entries must have inside the archive).
# An empty prefix accepts every '.csv' entry of the archive (used for Q1-Q3).
archives_2017 = [
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2017.zip', ''),
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2017.zip', ''),
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q3_2017.zip', ''),
    ('https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2017.zip', 'data_Q4_2017/'),
]

for archive_url, csv_prefix in archives_2017:
    response = requests.get(archive_url)
    # Stop on an HTTP error here instead of handing an error page to ZipFile,
    # which would otherwise fail later with a misleading BadZipFile.
    response.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        csv_names = [name for name in archive.namelist()
                     if name.endswith('.csv') and name.startswith(csv_prefix)]

        for csv_name in csv_names:
            # Close every archive member as soon as it has been parsed.
            with archive.open(csv_name) as csv_file:
                frame = pd.read_csv(csv_file, parse_dates=[0])
            # Row filter on the model, then positional boolean mask on the columns.
            ListDF_ST.append(frame[frame['model'] == ModelName].iloc[:, NonNullCols])

df = pd.concat(ListDF_ST, ignore_index=True)
df.to_csv('Data/Wrangled/STModel_17.csv')
