## Proof of Data Completeness

### Part 1: Reading Files Individually

In [1]:
import pandas as pd
#helper function
#splits error lines into a k, v pair 
nums = set(["1", "2", "3", "4", "5", "6", "7", "8", "9", "0"])

def split_error (line: str) -> str:
    for i in range(len(line)):
        if line[i] in nums:
            return (line[:i].strip(), int(line[i:]))
    
    #no num found, return line
    return (line.strip(), 0)


#First we will create a function to read all of the files individually.
def read_report(file_name: str):
    #open file
    with open(file_name) as curr:
        #dictionary to hold informati
        r = {}
        
        # strip lines by new line
        lines = curr.read().split('\n')
        
        # filter lines
        #first grab data center
        data_center = filter(lambda line: "DATACENTER" in line, lines)
        
        #and remove it from lines 
        lines = filter(lambda line: "DATACENTER" not in line, lines)
        
        #then remove redacted lines
        lines = filter(lambda line: "(redacted)" not in line, lines)

        # now remove ERROR MODE
        lines = filter(lambda line: "ERROR MODE" not in line, lines)
        
        #remove automated report
        lines = filter(lambda line: "AUTOMATED REPORT" not in line, lines)
        
        #operation notes
        #remove automated report
        lines = filter(lambda line: "OPERATION NOTES" not in line, lines)
        
        #----------------------------------------------
        lines = filter(lambda line: "----------------------------------------------" not in line, lines)
        
        #now end of line
        lines = filter(lambda line: "END OF LINE" not in line, lines)
        
        #remove any empty or only whitespace lines
        lines = filter(lambda line: len(line.strip()) > 0, lines)
        
        #now remove the numbers, we are only looking for the unique error types, also strip whitespace
        lines = [split_error(line) for line in lines]
        
        #finally convert the split lines into a dictionary
        for k, v in lines:
            r[k[:-1]] = v
        # return tuple with dict at 0 and data_center at 1
        return (r, list(data_center)[0])
        
        
#call it on the first file 
read_report("./reports/000000.dat")

({'Fiber pipeline in': 2,
  'Fiber pipeline out': 0,
  'HVAC': 1,
  'Misc. elec.': 5,
  'Operator (employee) error': 0,
  'Operator (non-employee) error': 0,
  'Physical intrusion (person)': 0,
  'Physical intrusion (water)': 2,
  'Power/generator loss': 0,
  'Power/generator reduction': 1},
 'DATACENTER 000000')

### Part 2: Confirming Data Center Uniqueness and Data Completeness

In [2]:
#we will iterate over all files, collecting the result from the above function for each
NUM_FILES = 1431

#create collections to hold all unique data centers and errors
data_centers = []
errors = set()

for i in range(NUM_FILES):
    file_name = f'./reports/{i:06d}.dat'
    result = read_report(file_name)
    
    # add result to our sets
    for v in result[0].keys():
        errors.add(v)
        
    data_centers.append(result[1])
    

errors

{'A/C',
 'Air Con.',
 'Fiber pipeline in',
 'Fiber pipeline out',
 'HVAC',
 'Misc. elec.',
 'Operator (employee) error',
 'Operator (non-employee) error',
 'Physical intrusion (person)',
 'Physical intrusion (water)',
 'Power/generator loss',
 'Power/generator reduction'}

In [3]:
#confirm data centers were added sequentially
data_centers == sorted(data_centers)

True

### Part 3: Combining Data

In [4]:
#we will iterate over all files, collecting the result from the above function for each
NUM_FILES = 1431


data = pd.DataFrame(columns = errors)
for i in range(NUM_FILES):
    #empty dic so we capture 0s
    observation = {c: 0 for c in errors}
    # read file 
    file_name = f'./reports/{i:06d}.dat'
    result = read_report(file_name)
    # add to observation
    for k, v in result[0].items():
        observation[k] = v
    
    #add to df
    data.loc[len(data.index)] = observation.values() 

data
    

Unnamed: 0,Misc. elec.,Physical intrusion (person),Physical intrusion (water),Power/generator reduction,A/C,Operator (non-employee) error,HVAC,Fiber pipeline out,Air Con.,Fiber pipeline in,Operator (employee) error,Power/generator loss
0,5,0,2,1,0,0,1,0,0,2,0,0
1,3,0,0,1,3,0,0,0,0,0,0,0
2,5,0,0,0,0,0,3,0,0,0,0,0
3,19,0,1,0,0,1,2,1,0,2,0,0
4,4,0,0,2,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1426,5,0,0,0,0,0,2,0,0,2,0,0
1427,8,0,1,1,0,0,4,0,0,2,0,0
1428,9,0,1,1,0,0,2,0,0,3,0,0
1429,4,0,1,1,0,0,5,0,0,1,0,0
