In [614]:
import os 
import pandas as pd
import numpy as np
from datetime import datetime

## Obtaining Data

In [155]:
# Root folder holding the raw Excel workbooks; df_classifier walks it with os.walk.
path = '../data/raw/'

In [156]:
def df_classifier(path, service, sheet):
    """Load one sheet from every matching workbook found under ``path``.

    Parameters
    ----------
    path : str
        Directory to search; it is walked recursively with ``os.walk``.
    service : str
        Filename prefix to match, or ``'all'`` to load every file found.
    sheet : int or str
        Forwarded to ``pd.read_excel`` as ``sheet_name``.

    Returns
    -------
    list
        One ``pd.read_excel`` result per matching file, in ``os.walk`` order.
        That order is filesystem-dependent and is deliberately left unsorted,
        because later cells index the list positionally.
    """
    df_list = []

    for root, _dirs, files in os.walk(path):

        for filename in files:

            if service == 'all' or filename.startswith(service):

                # Join on `root` (the directory os.walk is currently in), not on
                # `path`: this resolves files in subdirectories correctly and
                # no longer requires `path` to end with a separator.
                df_list.append(pd.read_excel(os.path.join(root, filename), sheet_name=sheet))

                print(filename)

    return df_list

In [367]:
# Load every workbook under `path`; sheet=1 is forwarded to pd.read_excel as sheet_name.
df_list = df_classifier(path=path, service='all', sheet=1)

hl1986to2001.xlsx
gtgg1986to2001.xlsx
hl2010toPresent.xlsx
hl2002to2009.xlsx
lng2011toPresent.xlsx
gd2010toPresent.xlsx
gtgg2002to2009.xlsx
gtggungs2010toPresent.xlsx
gdmar2004to2009.xlsx
gd1986tofeb2004.xlsx


## Cleaning Data

### 1. Drop columns with more than 20% missing (NaN) values

In [368]:
def nan_removal(df, threshold=20):
    """Drop, in place, every column whose share of missing values exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune; it is modified in place.
    threshold : float, default 20
        Percentage of NaN values (0-100) above which a column is dropped.
        A column sitting exactly at the threshold is kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be chained or inspected.
        (``DataFrame.drop(..., inplace=True)`` itself returns ``None``, which
        is what this function used to hand back.)
    """
    nan_values = df.isna().sum()

    nan_percentage = nan_values / len(df) * 100

    high_nan_columns = df.columns[nan_percentage > threshold].to_list()

    df.drop(columns=high_nan_columns, inplace=True)

    return df

### 2. Fill the remaining NaN values

In [369]:
def nan_col_selection(df):
    """Return, as a list, the labels of all columns holding at least one missing value."""
    has_missing = df.isna().any()

    affected = df.columns[has_missing]

    return affected.tolist()

In [370]:
def fillna_num_col(df, column_list):
    """Return the float64/int64 columns among ``column_list`` with missing values set to 0.

    A new frame is returned; ``df`` itself is not modified.
    """
    candidates = df[column_list]

    numeric_only = candidates.select_dtypes(include=['float64', 'int64'])

    return numeric_only.fillna(0)

In [371]:
def fillna_cat_col(df, column_list):
    """Return the non-float64/int64 columns among ``column_list`` with gaps set to 'NO DATA'.

    A new frame is returned; ``df`` itself is not modified.
    """
    candidates = df[column_list]

    non_numeric = candidates.select_dtypes(exclude=['float64', 'int64'])

    return non_numeric.fillna('NO DATA')

In [372]:
def df_clean(df, df_cat, df_num):
    """Overwrite columns of ``df`` with the same-named columns of ``df_cat``, then ``df_num``.

    ``df`` is modified in place and also returned.
    """
    # Categorical replacements are applied first, numeric ones second.
    for filled in (df_cat, df_num):

        for name in filled.columns:

            df[name] = filled[name]

    return df

In [373]:
# Clean every loaded frame in place: drop the snapshot column, drop sparse
# columns, then fill whatever missing values remain.
for df in df_list:

    # errors='ignore' keeps this cell re-runnable: on a second run (or for a
    # file without the column) a plain drop would raise KeyError.
    df.drop(columns='DATAFILE_AS_OF', inplace=True, errors='ignore')

    nan_removal(df)

    # Determine the NaN-bearing columns once and reuse the list for both fills.
    nan_columns = nan_col_selection(df)

    df_clean(df, fillna_cat_col(df, nan_columns), fillna_num_col(df, nan_columns))

### 3. Rename the columns to be used

In [528]:
# Rename short column codes to descriptive names, in place, on every frame.
# Note that several codes share one target name (e.g. 'ACCNT' and 'ACCOUNTY').
for df in df_list:

    df.rename(
        mapper={
            # location / report fields
            'ACCTY': 'LOCATION_CITY_NAME',
            'ACCNT': 'LOCATION_COUNTY_NAME',
            'ACCOUNTY': 'LOCATION_COUNTY_NAME',
            'ACCST': 'LOCATION_STATE_ABBREVIATION',
            'ACSTATE': 'LOCATION_STATE_ABBREVIATION',
            'ACZIP': 'LOCATION_POSTAL_CODE',
            'RPTID': 'REPORT_NUMBER',
            'INADR': 'LOCATION_STREET_ADDRESS',
            'CLASS': 'CLASS_LOCATION_TYPE',
            'COMM': 'COMMODITY_RELEASED_TYPE',
            'CSYS': 'SYSTEM_PART_INVOLVED',
            'OFFSHORE': 'ON_OFF_SHORE',
            'OPID': 'OPERATOR_ID',
            'IFED': 'FEDERAL',
            'IDATE': 'LOCAL_DATETIME',
            'INTER': 'PIPE_FACILITY_TYPE',
            # fatality / injury fields
            'TFAT': 'FATAL',
            'EFAT': 'NUM_EMP_FATALITIES',
            'FAT': 'FATAL',
            'TINJ': 'INJURE',
            'EINJ': 'NUM_EMP_INJURIES',
            'INJ': 'INJURE',
            # pressure fields
            'ACPRS': 'ACCIDENT_PSIG',
            'INPRS': 'ACCIDENT_PSIG',
            'MXPRS': 'MOP_PSIG',
            'DSPRS': 'MOP_PSIG',
            'PRTST': 'MOP_CFR_SECTION',
            'TEST': 'EX_HYDROTEST_PRESSURE',
            # remaining fields
            'PRTLK': 'CUSTOMER_TYPE',
            'MLKD': 'MATERIAL_INVOLVED',
            'NMDIA': 'PIPE_DIAMETER',
            'THK': 'WT_STEEL',
            'SPEC': 'PIPE_SPECIFICATION',
            'PRTYR': 'INSTALLATION_YEAR',
            'ITMYR': 'INSTALLATION_YEAR',
            'MANYR': 'MANUFACTURED_YEAR',
            'MANU': 'PIPE_MANUFACTURER',
            'LOCLK': 'INCIDENT_AREA_SUBTYPE',
            'PNAME': 'PREPARER_NAME',
            'PHONE': 'PREPARER_PHONE',
            'PROT': 'UNDER_CATHODIC_PROTECTION_IND',
            'FACAT': 'UNDER_CATHODIC_PROTECTION_IND',
            'CAULK': 'CAUSE_DETAILS',
            'ORGLK': 'ITEM_INVOLVED',
            'PRTFL': 'SYSTEM_PART_DETAILS',
            'LOSS': 'UNINTENTIONAL_RELEASE_BBLS',
            'RECOV': 'RECOVERED_BBLS',
            'FIRE': 'IGNITE_IND',
            'EXP': 'EXPLODE_IND',
            'SMYS': 'PIPE_SMYS',
            'CORRO': 'CORROSION_TYPE',
            'CORLC': 'EXT_INT_CORROSION',
        },
        axis='columns',
        inplace=True,
    )

### - hl_1986_to_2001

In [None]:
# Remove, in place, the columns of the first frame that are not carried forward.
df_list[0].drop(
    labels=[
        'COOR', 'SPLOC', 'TELRN', 'ORGLO', 'CAUSO', 'NFAT',
        'NINJ', 'CORR', 'PREVT', 'JNT', 'MOP_PSIG', 'DUR',
        'CAULO', 'TMPMK', 'FACTD', 'ONECL', 'ONEOT', 'EXCAL',
    ],
    axis=1,
    inplace=True,
)

In [480]:
# Recode the YES/NO flag as INTERSTATE/INTRASTATE. The result is assigned back
# to the frame rather than calling replace(..., inplace=True) on the
# attribute-accessed column: that chained in-place form is deprecated in pandas
# and leaves the frame unchanged when Copy-on-Write is enabled.
df_list[0]['PIPE_FACILITY_TYPE'] = df_list[0]['PIPE_FACILITY_TYPE'].replace(
    ['YES', 'NO'], ['INTERSTATE', 'INTRASTATE']
)

In [475]:
# The column-renaming cell maps 'OFFSHORE' to 'ON_OFF_SHORE', so after a
# top-to-bottom run the old label no longer exists; use whichever is present.
shore_col = 'ON_OFF_SHORE' if 'ON_OFF_SHORE' in df_list[0].columns else 'OFFSHORE'

# Recode YES/NO as OFFSHORE/ONSHORE, assigning back to the frame: a chained
# replace(..., inplace=True) on a column accessor is deprecated in pandas and
# leaves the frame unchanged when Copy-on-Write is enabled.
df_list[0][shore_col] = df_list[0][shore_col].replace(['YES', 'NO'], ['OFFSHORE', 'ONSHORE'])

### - gtgg_1986_to_2001

In [534]:
# Show the column labels of the second loaded frame (positional: depends on load order).
df_list[1].columns

Index(['SIGNIFICANT', 'SERIOUS', 'SYSTEM_TYPE', 'REPORT_NUMBER', 'OPERATOR_ID',
       'NAME', 'LOCATION_CITY_NAME', 'LOCATION_COUNTY_NAME',
       'LOCATION_STATE_ABBREVIATION', 'LOCATION_POSTAL_CODE', 'MPOST', 'SURVY',
       'DTHH', 'LOCAL_DATETIME', 'CLASS_LOCATION_TYPE', 'SHORE', 'OFFAREA',
       'BNUMB', 'OFFST', 'OCS', 'FEDERAL', 'ITYPE', 'FATAL', 'INJURE',
       'TOTAL_COST', 'TOTAL_COST_IN84', 'TOTAL_COST_CURRENT', 'OPJUD', 'STHH',
       'STMN', 'TELRN', 'TELRT', 'ACCIDENT_PSIG', 'MOP_PSIG', 'MPEST',
       'EX_HYDROTEST_PRESSURE', 'CAUSE', 'MAP_CAUSE', 'MAP_SUBCAUSE',
       'CUSTOMER_TYPE', 'SYSTEM_PART_DETAILS', 'PRTFO', 'MATERIAL_INVOLVED',
       'MLKDO', 'PRTSY', 'PRTSO', 'INSTALLATION_YEAR', 'PIPE_DIAMETER',
       'WT_STEEL', 'PIPE_SPECIFICATION', 'PIPE_SMYS', 'SEAM',
       'PIPE_MANUFACTURER', 'MANUFACTURED_YEAR', 'INCIDENT_AREA_SUBTYPE',
       'LOCLO', 'PREPARER_NAME', 'PREPARER_PHONE', 'DESCO', 'CAUCO',
       'UNDER_CATHODIC_PROTECTION_IND', 'CAUSE_DETAILS', '

### - gd_1986_to_2004

In [None]:
# Remove, in place, the columns of the tenth frame that are not carried forward.
df_list[9].drop(
    labels=[
        'OPJUD', 'STHH', 'STMN', 'TELRN', 'TELRT',
        'MPEST', 'NOTIF', 'MARK', 'STAT',
    ],
    axis=1,
    inplace=True,
)

### 4. Fixing Datetime Column

In [581]:
# Alias for the first loaded frame (same object, not a copy); positional, so it depends on load order.
hl = df_list[0]

In [651]:
def time(df_column):
    """Format an HHMM clock value (e.g. ``1600``) as an ``'HH:MM'`` string.

    The value is converted with ``int`` and zero-padded to four digits before
    parsing, so ``5`` is read as ``'0005'``. Values that do not form a valid
    hour/minute pair raise ``ValueError``.
    """
    padded = f'{int(df_column):04d}'

    clock = datetime.strptime(padded, '%H%M').time()

    return clock.strftime('%H:%M')

In [652]:
# Quick check of the helper: 1600 is expected to come back as '16:00'.
time(1600)

'16:00'

In [661]:
# Cast the DTHH column to 64-bit integers, then look at one element's type to confirm.
hl['DTHH'] = hl['DTHH'].astype('int64')

type(hl.loc[1, 'DTHH'])

# TODO: not applied yet - derive an 'hour' column by running time() over DTHH:
#hl['hour'] = hl.apply(lambda x: time(x['DTHH']), axis=1)

numpy.int64

### Concatenate datasets

In [662]:
# Stack all frames row-wise into one frame with a fresh 0..n-1 index, then show its size.
mergedStuff = pd.concat(objs=df_list, axis=0, ignore_index=True)
mergedStuff.shape

(18693, 270)