This notebook reads in the data and saves it as a train/test/validate split. 

Next notebook will get us some summary s about each set. 

In [1]:
import gc
import os
import pdb
import random

import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

from matplotlib import pyplot as plt
import seaborn as sns

pbar = ProgressBar()
pbar.register()

In [2]:
# inferred int32 types cause a type mismatch (int vs float) error when dask sees a null value
# null values cannot be interpreted as ints
custom_dtypes = {
    "date": "object",
    "serial_number": "object",
    "model": "object",
    "capacity_bytes": "float32",
    "failure": "float32",
    "smart_1_normalized": "float32",
    "smart_1_raw": "float32",
    "smart_2_normalized": "float32",
    "smart_2_raw": "float32",
    "smart_3_normalized": "float32",
    "smart_3_raw": "float32",
    "smart_4_normalized": "float32",
    "smart_4_raw": "float32",
    "smart_5_normalized": "float32",
    "smart_5_raw": "float32",
    "smart_7_normalized": "float32",
    "smart_7_raw": "float32",
    "smart_8_normalized": "float32",
    "smart_8_raw": "float32",
    "smart_9_normalized": "float32",
    "smart_9_raw": "float32",
    "smart_10_normalized": "float32",
    "smart_10_raw": "float32",
    "smart_11_normalized": "float32",
    "smart_11_raw": "float32",
    "smart_12_normalized": "float32",
    "smart_12_raw": "float32",
    "smart_13_normalized": "float32",
    "smart_13_raw": "float32",
    "smart_15_normalized": "float32",
    "smart_15_raw": "float32",
    "smart_16_normalized": "float32",
    "smart_16_raw": "float32",
    "smart_17_normalized": "float32",
    "smart_17_raw": "float32",
    "smart_22_normalized": "float32",
    "smart_22_raw": "float32",
    "smart_23_normalized": "float32",
    "smart_23_raw": "float32",
    "smart_24_normalized": "float32",
    "smart_24_raw": "float32",
    "smart_168_normalized": "float32",
    "smart_168_raw": "float32",
    "smart_170_normalized": "float32",
    "smart_170_raw": "float32",
    "smart_173_normalized": "float32",
    "smart_173_raw": "float32",
    "smart_174_normalized": "float32",
    "smart_174_raw": "float32",
    "smart_177_normalized": "float32",
    "smart_177_raw": "float32",
    "smart_179_normalized": "float32",
    "smart_179_raw": "float32",
    "smart_181_normalized": "float32",
    "smart_181_raw": "float32",
    "smart_182_normalized": "float32",
    "smart_182_raw": "float32",
    "smart_183_normalized": "float32",
    "smart_183_raw": "float32",
    "smart_184_normalized": "float32",
    "smart_184_raw": "float32",
    "smart_187_normalized": "float32",
    "smart_187_raw": "float32",
    "smart_188_normalized": "float32",
    "smart_188_raw": "float32",
    "smart_189_normalized": "float32",
    "smart_189_raw": "float32",
    "smart_190_normalized": "float32",
    "smart_190_raw": "float32",
    "smart_191_normalized": "float32",
    "smart_191_raw": "float32",
    "smart_192_normalized": "float32",
    "smart_192_raw": "float32",
    "smart_193_normalized": "float32",
    "smart_193_raw": "float32",
    "smart_194_normalized": "float32",
    "smart_194_raw": "float32",
    "smart_195_normalized": "float32",
    "smart_195_raw": "float32",
    "smart_196_normalized": "float32",
    "smart_196_raw": "float32",
    "smart_197_normalized": "float32",
    "smart_197_raw": "float32",
    "smart_198_normalized": "float32",
    "smart_198_raw": "float32",
    "smart_199_normalized": "float32",
    "smart_199_raw": "float32",
    "smart_200_normalized": "float32",
    "smart_200_raw": "float32",
    "smart_201_normalized": "float32",
    "smart_201_raw": "float32",
    "smart_218_normalized": "float32",
    "smart_218_raw": "float32",
    "smart_220_normalized": "float32",
    "smart_220_raw": "float32",
    "smart_222_normalized": "float32",
    "smart_222_raw": "float32",
    "smart_223_normalized": "float32",
    "smart_223_raw": "float32",
    "smart_224_normalized": "float32",
    "smart_224_raw": "float32",
    "smart_225_normalized": "float32",
    "smart_225_raw": "float32",
    "smart_226_normalized": "float32",
    "smart_226_raw": "float32",
    "smart_231_normalized": "float32",
    "smart_231_raw": "float32",
    "smart_232_normalized": "float32",
    "smart_232_raw": "float32",
    "smart_233_normalized": "float32",
    "smart_233_raw": "float32",
    "smart_235_normalized": "float32",
    "smart_235_raw": "float32",
    "smart_240_normalized": "float32",
    "smart_240_raw": "float32",
    "smart_241_normalized": "float32",
    "smart_241_raw": "float32",
    "smart_242_normalized": "float32",
    "smart_242_raw": "float32",
    "smart_250_normalized": "float32",
    "smart_250_raw": "float32",
    "smart_251_normalized": "float32",
    "smart_251_raw": "float32",
    "smart_252_normalized": "float32",
    "smart_252_raw": "float32",
    "smart_254_normalized": "float32",
    "smart_254_raw": "float32",
    "smart_255_normalized": "float32",
    "smart_255_raw": "float32",
}

In [6]:
csv_ids = ["Q2_2019"]
DATA_ROOT_DIR = 'Backblaze'
for csv_id in csv_ids:
    df = dd.read_csv(os.path.join(DATA_ROOT_DIR, 'data_{}'.format(csv_id), '*.csv'), dtype=custom_dtypes)

We want to filter our dataset so that it contains only seagate drives.

In [7]:
seagate  = df[df.model.str.startswith("S")]

We will then grab a list of drives that never failed in this time period and those that did in order to ensure that our final datasets include failed drives.  

In [8]:
failed_serials = seagate[seagate['failure'] == 1]['serial_number'].compute()

[########################################] | 100% Completed | 43.9s


In [9]:
working_serials = seagate[~seagate['serial_number'].isin(failed_serials)]\
                    ['serial_number']\
                    .drop_duplicates(keep='last')\
                    .compute()

[########################################] | 100% Completed | 50.8s


In [10]:
len(working_serials)

81923

In [11]:
random.seed(45)
subset_working = random.sample(list(working_serials.values), 49000)

In [12]:
new_seagate = list(failed_serials.values) + subset_working

In [13]:
len(new_seagate)

49490

In [14]:
new_seagate = seagate[seagate.serial_number.isin(new_seagate)]

In [15]:
new_seagate.shape[0].compute()

[########################################] | 100% Completed | 39.0s


4442314

In [16]:
new_seagate.serial_number.nunique().compute()

[########################################] | 100% Completed | 39.9s


49490

In [17]:
new_seagate.head()

[########################################] | 100% Completed |  0.7s


Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,...,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
0,2019-04-01,Z305B2QN,ST4000DM000,4000787000000.0,0.0,119.0,222017248.0,,,91.0,...,,,,,,,,,,
3,2019-04-01,ZJV0XJQ0,ST12000NM0007,12000140000000.0,0.0,82.0,162835744.0,,,93.0,...,,,,,,,,,,
6,2019-04-01,ZJV02XWG,ST12000NM0007,12000140000000.0,0.0,77.0,48691520.0,,,89.0,...,,,,,,,,,,
8,2019-04-01,ZJV02XWA,ST12000NM0007,12000140000000.0,0.0,82.0,164830976.0,,,97.0,...,,,,,,,,,,
10,2019-04-01,Z305DEMG,ST4000DM000,4000787000000.0,0.0,116.0,109149656.0,,,96.0,...,,,,,,,,,,


In [18]:
def random_partition(list_in,n):
    random.seed(45)
    random.shuffle(list_in)
    return [list_in[i::n] for i in range(n)]


In [19]:
failed_partition = random_partition(list(failed_serials),10)
working_partition = random_partition(list(subset_working),10)

In [20]:
training = working_partition[0:6] + failed_partition[0:6]
testing = working_partition[6:8] + failed_partition[6:8]
validation = working_partition[8:] + failed_partition[8:]

training = [item for sublist in training for item in sublist]
testing = [item for sublist in testing for item in sublist]
validation = [item for sublist in validation for item in sublist]

In [21]:
print(len(training))

29694


In [22]:
print(len(testing))

9898


In [23]:
print(len(validation))

9898


In [24]:
len(training) + len(testing) + len(validation)

49490

In [25]:
print(len(set(training).intersection(set(testing))) == 0)
print(len(set(training).intersection(set(validation))) == 0)
print(len(set(testing).intersection(set(validation))) == 0)

True
True
True


we have a 60/20/20 split for our data divided by individual hard drives with an similar distribution of failed and working drives in both. We will now save them as 3 separate csv files  

In [26]:
training_seagate = new_seagate[new_seagate.serial_number.isin(training)]
testing_seagate = new_seagate[new_seagate.serial_number.isin(testing)]
validation_seagate = new_seagate[new_seagate.serial_number.isin(validation)]

In [27]:
training_seagate = training_seagate.compute()
training_seagate.to_csv('train_backblaze_seagate_q2_2019.csv')
del (training_seagate)

[########################################] | 100% Completed | 40.6s


In [28]:
testing_seagate = testing_seagate.compute()
testing_seagate.to_csv('test_backblaze_seagate_q2_2019.csv')
del (testing_seagate)

[########################################] | 100% Completed | 39.5s


In [29]:
validation_seagate = validation_seagate.compute()
validation_seagate.to_csv('validation_backblaze_seagate_q2_2019.csv')
del (validation_seagate)

[########################################] | 100% Completed | 39.7s
