In [261]:
import os 
import pandas as pd
import numpy as np
from datetime import datetime

## Obtaining Data

In [262]:
path = '../data/raw/'

In [263]:
def df_classifier(path,service,sheet):
    
    df_list =[]
    
    for root, dirs, files in os.walk(path):
        
        for filename in files:
            
            if filename.startswith(service):
                
                df_list.append(pd.read_excel(f'{path}{filename}', sheet_name=sheet))
            
                print(filename)
                
            elif service == 'all':
                
                df_list.append(pd.read_excel(f'{path}{filename}', sheet_name=sheet))
                
                print(filename)
                
    return df_list

In [264]:
df_list = df_classifier(path, 'all', 1)

hl1986to2001.xlsx
gtgg1986to2001.xlsx
hl2010toPresent.xlsx
hl2002to2009.xlsx
lng2011toPresent.xlsx
gd2010toPresent.xlsx
gtgg2002to2009.xlsx
gtggungs2010toPresent.xlsx
gdmar2004to2009.xlsx
gd1986tofeb2004.xlsx


## Cleaning Data

### 0. Previous modifications before removing nan columns

In [265]:
# Fix locations of datesets hl2010toPresent and gtggungs2010toPresent

list_index = [2, 7]

for i in list_index:
    
    df_list[i].ONSHORE_CITY_NAME.fillna('', inplace=True)
    df_list[i].OFF_ACCIDENT_ORIGIN.fillna('', inplace=True)
    df_list[i].ONSHORE_COUNTY_NAME.fillna('', inplace=True)
    df_list[i].OFFSHORE_COUNTY_NAME.fillna('', inplace=True)
    df_list[i].ONSHORE_STATE_ABBREVIATION.fillna('', inplace=True)
    df_list[i].OFFSHORE_STATE_ABBREVIATION.fillna('', inplace=True)
    
    df_list[i]['LOCATION_CITY_NAME'] = df_list[i].apply(lambda x: 
                                                 x.ONSHORE_CITY_NAME + x.OFF_ACCIDENT_ORIGIN, axis=1)
    df_list[i]['LOCATION_COUNTY_NAME'] = df_list[i].apply(lambda x: 
                                                 x.ONSHORE_COUNTY_NAME + x.OFFSHORE_COUNTY_NAME, axis=1)
    df_list[i]['LOCATION_STATE_ABBREVIATION'] = df_list[i].apply(lambda x: 
                                                 x.ONSHORE_STATE_ABBREVIATION + x.OFFSHORE_STATE_ABBREVIATION, axis=1)

In [266]:
# Fix FATAL and INJURIES columns

for df in df_list:
    
    if 'FATAL' in df:
    
        df['FATAL'].fillna(0, inplace = True)
        
    if 'INJURE' in df:
        
        df['INJURE'].fillna(0, inplace = True)

In [267]:
df_list[3]['INC_PRS'].fillna(0, inplace=True)

### 1. Fixing Datetime Column

In [268]:
def time(df_column):
    
    return pd.to_datetime(df_column, format="%Y-%m-%d %H:%M:%S")

In [269]:
for df in df_list:
    
    df.rename(columns={'IHOUR' : 'DTHH'}, inplace=True)
    
    if 'DTHH' in df:
        
        if df['DTHH'].dtype == 'object':
            
            # It seems there are some rows with 'spaces' we need to feel to convert the column to integer

            df.loc[df['DTHH'].str.contains(r'\s+') == True, ['DTHH']] = '0000'
        
        df['DTHH'].fillna(0, inplace = True)
    
        df['DTHH'] = df['DTHH'].astype('int64')
        
        # Check that there are no times above 2400

        df.loc[df['DTHH'] >= 2400, ['DTHH']] = 0
        
        # Convert column to string

        df['DTHH'] = df['DTHH'].astype('str')

        # Add leading zeros

        df['DTHH'] = df.apply(lambda x: x.DTHH.zfill(4), axis=1)

        # Add trailing zeros

        df['DTHH'] = df.apply(lambda x: x.DTHH.ljust(6, '0'), axis=1)

        # And insert colon in the date to fit the format

        df['DTHH'] = df.apply(lambda x: ':'.join(x['DTHH'][i:i+2] for i in range(0, len(x['DTHH']), 2)), axis=1)

        df['LOCAL_DATETIME'] = df['IDATE'].astype('str').str.cat(df['DTHH'],sep=" ")
        
        df.drop(df[df['LOCAL_DATETIME'] == '1998-09-17 01:63:00'].index, inplace = True)

In [270]:
# Apply Time Function

for df in df_list:
    
    if df['LOCAL_DATETIME'].dtype == 'object':
    
        df['LOCAL_DATETIME'] = df.apply(lambda x: time(x.LOCAL_DATETIME), axis=1)

### 2. Drop columns with high nan percentage

In [271]:
def nan_removal(df, percentage):

    nan_values = df.isna().sum()
        
    nan_percentage = nan_values / len(df) * 100
        
    filter_nan_percentage = nan_percentage > percentage
    
    high_nan_columns = df.columns[filter_nan_percentage].to_list()
    
    return df.drop(columns=high_nan_columns, inplace=True)

### 3. Renaming of variables to be used.

In [272]:
for df in df_list:
    
    df.rename(columns={'ACCTY' : 'LOCATION_CITY_NAME',
                       'FACILITY_NAME' : 'LOCATION_CITY_NAME',
                       'ACCITY' : 'LOCATION_CITY_NAME',
                      'ACCNT' : 'LOCATION_COUNTY_NAME',
                       'ACCOUNTY' : 'LOCATION_COUNTY_NAME',
                       'ACCST' : 'LOCATION_STATE_ABBREVIATION',
                       'ACSTATE' : 'LOCATION_STATE_ABBREVIATION',
                       'FACILITY_STATE' : 'LOCATION_STATE_ABBREVIATION',
                       'ACZIP' : 'LOCATION_POSTAL_CODE',
                       'LATITUDE' : 'LOCATION_LATITUDE',
                       'LONGITUDE' : 'LOCATION_LONGITUDE',
                       'FACILITY_LATITUDE' : 'LOCATION_LATITUDE',
                       'FACILITY_LONGITUDE' : 'LOCATION_LONGITUDE',
                       'RPTID' : 'REPORT_NUMBER',
                       'INADR' : 'LOCATION_STREET_ADDRESS',
                       'CLASS' : 'CLASS_LOCATION_TYPE',
                       'COMM' : 'COMMODITY_RELEASED_TYPE',
                       'CSYS' : 'SYSTEM_PART_INVOLVED',
                       'OFFSHORE' : 'ON_OFF_SHORE',
                       'SHORE' : 'ON_OFF_SHORE',
                       'OFFSHORE_TEXT' : 'ON_OFF_SHORE',
                      'OPID' : 'OPERATOR_ID',
                      'IFED' : 'FEDERAL',
                       'INTER_INTRA':'PIPE_FACILITY_TYPE',
                       'INTER_TEXT' : 'PIPE_FACILITY_TYPE',
                       'INTER' : 'PIPE_FACILITY_TYPE',
                       'TFAT' : 'FATAL',
                       'EFAT' : 'NUM_EMP_FATALITIES',
                       'FAT' : 'FATAL',
                       'TINJ' : 'INJURE',
                       'EINJ' : 'NUM_EMP_INJURIES',
                       'INJ' : 'INJURE',
                       'ACPRS' : 'ACCIDENT_PSIG',
                       'INPRS' : 'ACCIDENT_PSIG',
                       'INC_PRS' : 'ACCIDENT_PSIG',
                       'MAOP' : 'MOP_PSIG',
                       'MXPRS' : 'MOP_PSIG',
                       'DSPRS' : 'MOP_PSIG',
                       'PRTST' : 'MOP_CFR_SECTION',
                       'TEST' : 'EX_HYDROTEST_PRESSURE',
                       'PRTLK' : 'CUSTOMER_TYPE',
                       'MLKD' : 'MATERIAL_INVOLVED',
                       'MLKD_TEXT' : 'MATERIAL_INVOLVED',
                       'NMDIA' : 'PIPE_DIAMETER',
                       'NPS' : 'PIPE_DIAMETER',
                       'THK' : 'WT_STEEL',
                       'SPEC' : 'PIPE_SPECIFICATION',
                       'PRTYR' : 'INSTALLATION_YEAR',
                       'ITMYR' : 'INSTALLATION_YEAR',
                       'MANYR' : 'MANUFACTURED_YEAR',
                       'MANU' : 'PIPE_MANUFACTURER',
                       'LOCLK' : 'INCIDENT_AREA_TYPE',
                       'LOCLK_TEXT' : 'INCIDENT_AREA_TYPE',
                       'PNAME' : 'PREPARER_NAME',
                       'PHONE' : 'PREPARER_PHONE',
                       'PPHONE' : 'PREPARER_PHONE',
                       'PROT' : 'UNDER_CATHODIC_PROTECTION_IND',
                       'FACAT' : 'UNDER_CATHODIC_PROTECTION_IND',
                       'CAULK' : 'CAUSE_DETAILS',
                       'ITYPE' : 'RELEASE_TYPE',
                       'LRTYPE_TEXT' : 'RELEASE_TYPE',
                       'ORGLK' : 'ITEM_INVOLVED',
                       'PRTSY_TEXT' : 'ITEM_INVOLVED',
                       'PRTSY' : 'ITEM_INVOLVED',
                       'PRTFL' : 'SYSTEM_PART_DETAILS',
                       'PRTFL_TEXT' : 'SYSTEM_PART_DETAILS',
                       'LOSS' : 'UNINTENTIONAL_RELEASE_BBLS',
                       'RECOV' : 'RECOVERED_BBLS',
                       'FIRE' : 'IGNITE_IND',
                       'IGNITE' : 'IGNITE_IND',
                       'EXP' : 'EXPLODE_IND',
                       'EXPLO' : 'EXPLODE_IND',
                       'SMYS' : 'PIPE_SMYS',
                       'CORRO' : 'CORROSION_TYPE',
                       'UNINTENTIONAL_RELEASE' : 'UNINTENTIONAL_RELEASE_BBLS'
                      }, inplace=True)

### 4. Fill nan values and remove useless columns

In [273]:
def nan_col_selection(df):
    
    return df.columns[df.isna().any()].tolist()

In [274]:
def fillna_num_col(df, column_list):
    
    return df[column_list].select_dtypes(include=['float64', 'int64']).fillna(0)

In [275]:
def fillna_cat_col(df, column_list):
    
    return df[column_list].select_dtypes(exclude=['float64', 'int64']).fillna('NO DATA')

In [276]:
def df_clean(df, df_cat, df_num):
    
    for column in df_cat.columns:
    
        df[column] = df_cat[column]
    
    for column in df_num.columns:
    
        df[column] = df_num[column]
        
    return df

In [277]:
for df in df_list:
    
    df.drop(columns=[
                        'DATAFILE_AS_OF','OPSTREET', 'OPCITY', 'OPCOUNTY', 'OPSTATE',
                       'OPZIP', 'PPPRP', 'PPPRPCURRENT', 'EMRPRP', 'ACSTREET',
                       'EMRPRPCURRENT', 'ENVPRP', 'ENVPRPCURRENT', 'OPCPRP', 'OPCPRPCURRENT',
                       'PRODPRP', 'PRODPRPCURRENT', 'OOPRP', 'OOPRPCURRENT', 'OOPPRP', 'GASPRP',
                       'GASPRPCURRENT', 'OPPRP', 'OPPRPCURRENT', 'NUM_EMP_FATALITIES',
                       'OOPPRPCURRENT', 'IPE', 'IA_IPE', 'OM_IPE', 'NUM_EMP_INJURIES',
                        'SUPPLEMENTAL_NUMBER', 'REPORT_RECEIVED_DATE', 'REPORT_TYPE',
                        'OPERATOR_STREET_ADDRESS', 'OPERATOR_CITY_NAME', 'CUSTOMER_TYPE',
                        'OPERATOR_STATE_ABBREVIATION', 'OPERATOR_POSTAL_CODE', 'IYEAR',
                        'LOCATION_POSTAL_CODE', 'ONSHORE_POSTAL_CODE', 
                        'ONSHORE_CITY_NAME', 'OFF_ACCIDENT_ORIGIN', 'ONSHORE_COUNTY_NAME', 
                        'OFFSHORE_COUNTY_NAME', 'ONSHORE_STATE_ABBREVIATION', 'OFFSHORE_STATE_ABBREVIATION',
                        'EST_COST_OPER_PAID','EST_COST_OPER_PAID_CURRENT', 'EST_COST_GAS_RELEASED',
                       'EST_COST_GAS_RELEASED_CURRENT', 'EST_COST_PROP_DAMAGE',
                       'EST_COST_PROP_DAMAGE_CURRENT', 'EST_COST_EMERGENCY',
                       'EST_COST_EMERGENCY_CURRENT', 'EST_COST_ENVIRONMENTAL', 'IDATE', 'DTHH',
                       'EST_COST_ENVIRONMENTAL_CURRENT', 'EST_COST_OTHER',
                       'EST_COST_OTHER_CURRENT', 'CORLC', 'EXT_INT_CORROSION',
                        'PREPARER_NAME', 'PREPARER_TITLE', 'PREPARER_EMAIL','PREPARER_PHONE',
                       'PREPARER_TELEPHONE', 'PREPARED_DATE', 'AUTHORIZER_NAME',
                       'AUTHORIZER_TITLE', 'AUTHORIZER_TELEPHONE', 'AUTHORIZER_EMAIL', 'FATALITY_IND',
                        'INJURY_IND','SHUTDOWN_DUE_ACCIDENT_IND', 'INCIDENT_IDENTIFIED_DATETIME',
                           'ON_SITE_DATETIME', 'DESIGNATED_NAME', 'NUM_PUB_EVACUATED',
                           'PIPE_FAC_NAME', 'SEGMENT_NAME', 'FEDERAL', 'LOCATION_TYPE', 
                           'CROSSING','SYSTEM_PART_INVOLVED', 'DESIGNATED_LOCATION', 'WILDLIFE_IMPACT_IND',
                           'SOIL_CONTAMINATION','LONG_TERM_ASSESSMENT', 'REMEDIATION_IND', 
                           'WATER_CONTAM_IND', 'COULD_BE_HCA', 'COMMODITY_REACHED_HCA', 'ACCIDENT_PRESSURE',
                           'PRESSURE_RESTRICTION_IND', 'PART_C_QUESTION_2_IND', 'PIPELINE_FUNCTION', 'SCADA_IN_PLACE_IND', 'CPM_IN_PLACE_IND',
                           'ACCIDENT_IDENTIFIER', 'INVESTIGATION_STATUS', 'EMPLOYEE_DRUG_TEST_IND',
                            'CONTRACTOR_DRUG_TEST_IND','SPILL_TYPE_CATEGORY', 'MOP_CFR_SECTION',
                           'SCADA_OPERATING_IND', 'SCADA_FUNCTIONAL_IND', 'SCADA_DETECTION_IND',
                           'SCADA_CONF_IND','NRC_RPT_NUM', 'NRC_RPT_DATETIME', 'INTENTIONAL_RELEASE'
                          ],errors='ignore', inplace=True)
    
    nan_removal(df, 20)
    
    df_clean(df, fillna_cat_col(df, nan_col_selection(df)), fillna_num_col(df, nan_col_selection(df)))

### 5. Selecting variables

### - hl_1986_to_2001

In [278]:
df_list[0].drop(columns = ['COOR', 'SPLOC', 'TELRN', 'ORGLO',
                          'CAUSO', 'NFAT', 'NINJ', 'CORR', 'PREVT',
                          'JNT', 'MOP_PSIG', 'DUR', 'CAULO', 'TMPMK',
                          'FACTD', 'ONECL', 'ONEOT', 'EXCAL'],errors='ignore', inplace=True)

In [279]:
df_list[0].PIPE_FACILITY_TYPE.replace(['YES', 'NO'],['INTERSTATE', 'INTRASTATE'], inplace=True)

In [280]:
df_list[0].ON_OFF_SHORE.replace(['YES', 'NO'],['OFFSHORE', 'ONSHORE'], inplace=True)

In [281]:
df_list[0].columns

Index(['SIGNIFICANT', 'SERIOUS', 'REPORT_NUMBER', 'OPERATOR_ID', 'NAME',
       'PIPE_FACILITY_TYPE', 'LOCATION_STATE_ABBREVIATION',
       'LOCATION_COUNTY_NAME', 'LOCATION_CITY_NAME', 'ON_OFF_SHORE',
       'ITEM_INVOLVED', 'INSTALLATION_YEAR', 'CAUSE', 'MAP_CAUSE',
       'MAP_SUBCAUSE', 'FATAL', 'INJURE', 'TOTAL_COST', 'TOTAL_COST_IN84',
       'TOTAL_COST_CURRENT', 'COMMODITY_RELEASED_TYPE', 'CLASS_LOCATION_TYPE',
       'UNINTENTIONAL_RELEASE_BBLS', 'RECOVERED_BBLS', 'IGNITE_IND',
       'EXPLODE_IND', 'PIPE_DIAMETER', 'WT_STEEL', 'PIPE_SMYS',
       'ACCIDENT_PSIG', 'UNDER_CATHODIC_PROTECTION_IND', 'CORROSION_TYPE',
       'CAUSE_DETAILS', 'NARRATIVE', 'LOCAL_DATETIME'],
      dtype='object')

### - gtgg_1986_to_2001

In [282]:
df_list[1].drop(columns = ['MPOST', 'SURVY', 'OFFAREA', 'BNUMB',
                          'OFFST', 'OCS', 'OPJUD', 'STHH',
                          'STMN', 'TELRN', 'TELRT', 'MPEST', 'PRTFO',
                          'PRTSY', 'PRTSO', 'SEAM', 'LOCLO', 'DESCO',
                            'CAUCO', 'DMGO', 'NOTIF', 'MARK', 'MRKTP',
                          'CAULO', 'STAT', 'CTEST', 'MEDO', 'MLKDO'], errors='ignore', inplace=True)

In [283]:
df_list[1].ON_OFF_SHORE.replace(['YES', 'NO'],['OFFSHORE', 'ONSHORE'], inplace=True)

In [284]:
df_list[1].COMMODITY_RELEASED_TYPE = 'NATURAL GAS'

In [285]:
df_list[1].columns

Index(['SIGNIFICANT', 'SERIOUS', 'SYSTEM_TYPE', 'REPORT_NUMBER', 'OPERATOR_ID',
       'NAME', 'LOCATION_CITY_NAME', 'LOCATION_COUNTY_NAME',
       'LOCATION_STATE_ABBREVIATION', 'CLASS_LOCATION_TYPE', 'ON_OFF_SHORE',
       'RELEASE_TYPE', 'FATAL', 'INJURE', 'TOTAL_COST', 'TOTAL_COST_IN84',
       'TOTAL_COST_CURRENT', 'ACCIDENT_PSIG', 'MOP_PSIG',
       'EX_HYDROTEST_PRESSURE', 'CAUSE', 'MAP_CAUSE', 'MAP_SUBCAUSE',
       'SYSTEM_PART_DETAILS', 'MATERIAL_INVOLVED', 'ITEM_INVOLVED',
       'INSTALLATION_YEAR', 'PIPE_DIAMETER', 'WT_STEEL', 'PIPE_SPECIFICATION',
       'PIPE_SMYS', 'PIPE_MANUFACTURER', 'MANUFACTURED_YEAR',
       'INCIDENT_AREA_TYPE', 'UNDER_CATHODIC_PROTECTION_IND', 'CAUSE_DETAILS',
       'NARRATIVE', 'LOCAL_DATETIME'],
      dtype='object')

### - gd_1986_to_2004

In [286]:
df_list[9].drop(columns = ['OPJUD', 'STHH', 'STMN', 'TELRN', 'TELRT' ,
                           'MPEST', 'NOTIF', 'MARK', 'STAT'],errors='ignore', inplace=True)

In [287]:
df_list[9].columns

Index(['SIGNIFICANT', 'SERIOUS', 'REPORT_NUMBER', 'OPERATOR_ID', 'NAME',
       'LOCATION_STREET_ADDRESS', 'LOCATION_CITY_NAME', 'LOCATION_COUNTY_NAME',
       'LOCATION_STATE_ABBREVIATION', 'CLASS_LOCATION_TYPE', 'FATAL', 'INJURE',
       'TOTAL_COST', 'TOTAL_COST_IN84', 'TOTAL_COST_CURRENT', 'ACCIDENT_PSIG',
       'MOP_PSIG', 'EX_HYDROTEST_PRESSURE', 'CAUSE', 'MAP_CAUSE',
       'MAP_SUBCAUSE', 'SYSTEM_PART_DETAILS', 'MATERIAL_INVOLVED',
       'PIPE_DIAMETER', 'WT_STEEL', 'PIPE_SPECIFICATION', 'PIPE_MANUFACTURER',
       'MANUFACTURED_YEAR', 'INSTALLATION_YEAR', 'INCIDENT_AREA_TYPE',
       'UNDER_CATHODIC_PROTECTION_IND', 'CAUSE_DETAILS', 'NARRATIVE',
       'LOCAL_DATETIME'],
      dtype='object')

### - hl_2002_to_2009

In [288]:
df_list[3].drop(columns = ['DOR', 'IYEAR', 'SPILLED', 'CLASS_TEXT', 'SPUNIT_TEXT',
                           'PEMAIL', 'NFAT', 'GPFAT', 'NINJ', 'GPINJ',
                          'IO_DRUG', 'IO_ALCO'],errors='ignore', inplace=True)

In [289]:
df_list[3].columns

Index(['SIGNIFICANT', 'SERIOUS', 'REPORT_NUMBER', 'OPERATOR_ID', 'NAME',
       'LOCATION_LATITUDE', 'LOCATION_LONGITUDE', 'LOCATION_CITY_NAME',
       'LOCATION_COUNTY_NAME', 'LOCATION_STATE_ABBREVIATION', 'TOTAL_COST',
       'TOTAL_COST_IN84', 'TOTAL_COST_CURRENT', 'COMMODITY_RELEASED_TYPE',
       'UNINTENTIONAL_RELEASE_BBLS', 'RECOVERED_BBLS', 'ON_OFF_SHORE',
       'ACCIDENT_PSIG', 'FATAL', 'INJURE', 'CAUSE', 'MAP_CAUSE',
       'MAP_SUBCAUSE', 'LOCAL_DATETIME'],
      dtype='object')

### - gtgg_2002_to_2009

In [290]:
df_list[6].drop(columns = ['DOR', 'IYEAR', 'OCS', 'HIGHCON',
                           'PEMAIL', 'NFAT', 'GPFAT', 'NINJ', 'GPINJ', 'EVAC',
                           'EVACNO', 'STHH', 'TELRN', 'TELDT',
                          'MAOPSEC1', 'MAOPSEC2', 'MAOPSEC3', 'MAOPSEC4',
                           'MAOPSECC', 'OVERPRS', 'PLAS_DUCT', 'PLAS_BRIT',
                           'PLAS_JNT', 'TYSYS_TEXT'
                          ],errors='ignore', inplace=True)

In [291]:
df_list[6].columns

Index(['SIGNIFICANT', 'SERIOUS', 'SYSTEM_TYPE', 'REPORT_NUMBER', 'OPERATOR_ID',
       'NAME', 'LOCATION_STATE_ABBREVIATION', 'LOCATION_LATITUDE',
       'LOCATION_LONGITUDE', 'ON_OFF_SHORE', 'PIPE_FACILITY_TYPE',
       'RELEASE_TYPE', 'FATAL', 'INJURE', 'TOTAL_COST', 'TOTAL_COST_IN84',
       'TOTAL_COST_CURRENT', 'IGNITE_IND', 'EXPLODE_IND', 'ACCIDENT_PSIG',
       'MOP_PSIG', 'SYSTEM_PART_DETAILS', 'MATERIAL_INVOLVED', 'ITEM_INVOLVED',
       'INSTALLATION_YEAR', 'PIPE_DIAMETER', 'INCIDENT_AREA_TYPE', 'CAUSE',
       'CAUSE_DETAILS', 'MAP_CAUSE', 'MAP_SUBCAUSE', 'NARRATIVE',
       'LOCAL_DATETIME'],
      dtype='object')

### - gd_2004_to_2009

In [292]:
df_list[8].drop(columns = ['FF', 'DOR', 'IYEAR', 'OCS', 'HIGHCON',
                           'PEMAIL', 'NFAT', 'GPFAT', 'NINJ', 'GPINJ', 'EVAC',
                           'EVACNO', 'STHH', 'STMN', 'TELRN', 'TELDT',
                           'MAOPEST', 'OVERPRS', 'PLAS_DUCT', 'PLAS_BRIT',
                           'PLAS_JNT', 'TYSYS_TEXT', 'NOTIF', 'MARKED', 'PERM_MARK',
                           'MKD_IN_TIME', 'PIPE_DAMAGE', 'PRS_TEST' 
                          ],errors='ignore', inplace=True)

In [293]:
df_list[8].columns

Index(['SIGNIFICANT', 'SERIOUS', 'REPORT_NUMBER', 'OPERATOR_ID', 'NAME',
       'LOCATION_CITY_NAME', 'LOCATION_COUNTY_NAME',
       'LOCATION_STATE_ABBREVIATION', 'CLASS_LOCATION_TYPE', 'RELEASE_TYPE',
       'FATAL', 'INJURE', 'TOTAL_COST', 'TOTAL_COST_IN84',
       'TOTAL_COST_CURRENT', 'IGNITE_IND', 'EXPLODE_IND', 'ACCIDENT_PSIG',
       'MOP_PSIG', 'SYSTEM_PART_DETAILS', 'MATERIAL_INVOLVED',
       'INCIDENT_AREA_TYPE', 'CAUSE', 'CAUSE_DETAILS', 'MAP_CAUSE',
       'MAP_SUBCAUSE', 'NARRATIVE', 'LOCAL_DATETIME'],
      dtype='object')

### - hl_2010_to_Present

In [294]:
df_list[2].columns

Index(['SIGNIFICANT', 'SERIOUS', 'REPORT_NUMBER', 'OPERATOR_ID', 'NAME',
       'LOCAL_DATETIME', 'LOCATION_LATITUDE', 'LOCATION_LONGITUDE',
       'COMMODITY_RELEASED_TYPE', 'UNINTENTIONAL_RELEASE_BBLS',
       'RECOVERED_BBLS', 'NET_LOSS_BBLS', 'FATAL', 'INJURE', 'IGNITE_IND',
       'EXPLODE_IND', 'ON_OFF_SHORE', 'INCIDENT_AREA_TYPE',
       'PIPE_FACILITY_TYPE', 'ITEM_INVOLVED', 'MATERIAL_INVOLVED',
       'RELEASE_TYPE', 'TOTAL_COST', 'TOTAL_COST_IN84', 'TOTAL_COST_CURRENT',
       'ACCIDENT_PSIG', 'MOP_PSIG', 'CAUSE', 'CAUSE_DETAILS', 'MAP_CAUSE',
       'MAP_SUBCAUSE', 'NARRATIVE', 'LOCATION_CITY_NAME',
       'LOCATION_COUNTY_NAME', 'LOCATION_STATE_ABBREVIATION'],
      dtype='object')

### - gtgg_2010_to_Present

In [295]:
df_list[7].drop(columns = ['INCIDENT_AREA_SUBTYPE', 'CLASS_LOCATION_TYPE', 'PIR_RADIUS', 
                          'HEAT_DAMAGE_IND', 'NON_HEAT_DAMAGE_IND',
                           'HCA_FATALITIES_IND', 'EST_COST_INTENT_REL',
                           'EST_COST_INTENT_REL_CURRENT',],errors='ignore', inplace=True)

In [296]:
df_list[7].columns

Index(['SIGNIFICANT', 'SERIOUS', 'SYSTEM_TYPE', 'REPORT_NUMBER', 'OPERATOR_ID',
       'NAME', 'LOCAL_DATETIME', 'LOCATION_LATITUDE', 'LOCATION_LONGITUDE',
       'COMMODITY_RELEASED_TYPE', 'UNINTENTIONAL_RELEASE_BBLS',
       'ACCOMPANYING_LIQUID', 'FATAL', 'INJURE', 'IGNITE_IND', 'EXPLODE_IND',
       'ON_OFF_SHORE', 'INCIDENT_AREA_TYPE', 'PIPE_FACILITY_TYPE',
       'ITEM_INVOLVED', 'INSTALLATION_YEAR', 'MATERIAL_INVOLVED',
       'RELEASE_TYPE', 'TOTAL_COST', 'TOTAL_COST_IN84', 'TOTAL_COST_CURRENT',
       'ACCIDENT_PSIG', 'MOP_PSIG', 'CAUSE', 'CAUSE_DETAILS', 'MAP_CAUSE',
       'MAP_SUBCAUSE', 'NARRATIVE', 'LOCATION_CITY_NAME',
       'LOCATION_COUNTY_NAME', 'LOCATION_STATE_ABBREVIATION'],
      dtype='object')

### - gd_2010_to_Present

In [297]:
df_list[5].drop(columns = ['FF','CLASS_LOCATION_TYPE', 'INCIDENT_AREA_SUBTYPE', 'EST_COST_UNINTENTIONAL_RELEASE',
                           'EST_COST_UNINTENT_REL_CURRENT', 'EST_COST_INTENT_REL_CURRENT', 'COMMERCIAL_AFFECTED',
                           'INDUSTRIAL_AFFECTED','RESIDENCES_AFFECTED'],errors='ignore', inplace=True)

In [298]:
df_list[5].columns

Index(['SIGNIFICANT', 'SERIOUS', 'REPORT_NUMBER', 'OPERATOR_ID', 'NAME',
       'LOCAL_DATETIME', 'LOCATION_STREET_ADDRESS', 'LOCATION_CITY_NAME',
       'LOCATION_COUNTY_NAME', 'LOCATION_STATE_ABBREVIATION',
       'LOCATION_LATITUDE', 'LOCATION_LONGITUDE', 'COMMODITY_RELEASED_TYPE',
       'UNINTENTIONAL_RELEASE_BBLS', 'FATAL', 'INJURE', 'IGNITE_IND',
       'EXPLODE_IND', 'INCIDENT_AREA_TYPE', 'PIPE_FACILITY_TYPE',
       'INSTALLATION_YEAR', 'MATERIAL_INVOLVED', 'RELEASE_TYPE', 'TOTAL_COST',
       'TOTAL_COST_IN84', 'TOTAL_COST_CURRENT', 'ACCIDENT_PSIG', 'NORMAL_PSIG',
       'MOP_PSIG', 'CAUSE', 'CAUSE_DETAILS', 'MAP_CAUSE', 'MAP_SUBCAUSE',
       'NARRATIVE'],
      dtype='object')

### - LNG_2010_to_Present

In [299]:
df_list[4].drop(columns = ['UNINTENTIONAL_RELEASE_IND','INTENTIONAL_RELEASE_IND', 'EMERGENCY_SHUTDOWN_IND',
                           'RESULTED_FROM_OTHER_IND', 'NUM_OPER_AND_CONTRACTOR_EVAC',
                          'FACILITY_STATUS', 'FACILITY_LIQUID_VAPOR_RATE', 'FACILITY_NUM_VAPORIZERS',
                           'FACILITY_TOTAL_CAPACITY', 'FACILITY_SOURCE_LIQUEFY_IND',
                           'FACILITY_NUMBER_TANKS', 'FACILITY_VOLUME_STORAGE',
                           'EST_COST_INTENTIONAL_RELEASE', 'EST_COST_INTENT_REL_CURRENT',
                           'CCS_IN_PLACE_IND', 'CCS_OPERATING_IND', 'CCS_FUNCTIONAL_IND',
                          ],errors='ignore', inplace=True)

In [300]:
df_list[4].columns

Index(['SIGNIFICANT', 'SERIOUS', 'REPORT_NUMBER', 'OPERATOR_ID', 'NAME',
       'LOCAL_DATETIME', 'COMMODITY_RELEASED_TYPE', 'FATAL', 'INJURE',
       'IGNITE_IND', 'EXPLODE_IND', 'LOCATION_CITY_NAME',
       'LOCATION_STATE_ABBREVIATION', 'PIPE_FACILITY_TYPE', 'ITEM_INVOLVED',
       'TOTAL_COST', 'TOTAL_COST_IN84', 'TOTAL_COST_CURRENT', 'CAUSE',
       'CAUSE_DETAILS', 'MAP_CAUSE', 'MAP_SUBCAUSE', 'NARRATIVE'],
      dtype='object')

### 6. Concatenate Datasets and create final Dataframe for Data Visualization

In [342]:
mergedStuff = pd.concat(df_list , ignore_index=True)
mergedStuff.shape

(18692, 51)

In [366]:
df = mergedStuff.isna().sum()
mergedStuff['MATERIAL_INVOLVED'].unique()

array([nan, 'NO DATA', 'STEEL', 'OTHER',
       'MATERIAL OTHER THAN CARBON STEEL', 'CARBON STEEL', 'PLASTIC',
       'COPPER', 'CAST/WROUGHT IRON', 'UNKNOWN', 'DUCTILE IRON',
       'MATERIAL OTHER THAN CARBON STEEL OR PLASTIC',
       'POLYETHELENE PLASTIC', 'OTHER MATERIAL', 'OTHER PLASTIC',
       'CAST IRON', 'POLYETHYLENE PLASTIC'], dtype=object)

In [356]:
def final_df(df, nan_values):

    total_nan = df.isna().sum()
        
    filter_nan = total_nan > nan_values
    
    useless_columns = df.columns[filter_nan].to_list()
    
    return df.drop(columns=useless_columns)

In [359]:
final_df = final_df(mergedStuff, 2000)

In [363]:
final_df.

Index(['SIGNIFICANT', 'SERIOUS', 'REPORT_NUMBER', 'OPERATOR_ID', 'NAME',
       'LOCATION_STATE_ABBREVIATION', 'LOCATION_COUNTY_NAME',
       'LOCATION_CITY_NAME', 'CAUSE', 'MAP_CAUSE', 'MAP_SUBCAUSE', 'FATAL',
       'INJURE', 'TOTAL_COST', 'TOTAL_COST_IN84', 'TOTAL_COST_CURRENT',
       'ACCIDENT_PSIG', 'LOCAL_DATETIME'],
      dtype='object')