## Data Cleaning & Locating Multivalued and Duplicate records (single csv file)
### Saksham Gakhar, DA - DKSF

Change the input CSV file to look for duplicate and multivalued records, and list the devices that generally misbehave.

In [292]:
import numpy as np 
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
from collections import defaultdict
import datetime
# without mpld3
%matplotlib notebook 

In [319]:
def readCSV(dt):
    """
        Read the CSV file into a dataframe for a YYYY-MM (dt)
        and do preliminary cleaning.

        Cleaning steps:
          * parse `when_captured` into datetimes
          * parse `service_uploaded` (text such as 'Jan 05, 2020 @ 10:00:05.123')
            into UTC timestamps; missing values become NaT instead of
            raising, so incomplete rows can be filtered later
          * add a `year` column derived from `when_captured`
          * strip thousands separators from the PM columns and cast to float

        arg: dt -- string with format YYYY-MM
        return df: dataframe containing data from csv
    """
    folder = '2020-07-06-DataKind/'
    filename = 'output-' + str(dt) + '-01T00_00_00+00_00.csv'
    df = pd.read_csv(folder + filename)
    df['when_captured'] = pd.to_datetime(df['when_captured'])

    # Vectorised parse of the upload time stamp; utc=True attaches the UTC
    # zone to the parsed values and NaN entries are carried through as NaT.
    df['service_uploaded'] = pd.to_datetime(
        df['service_uploaded'],
        format='%b %d, %Y @ %H:%M:%S.%f',
        utc=True,
    )

    #### Add a column for the year
    df['year'] = pd.DatetimeIndex(df['when_captured']).year

    #### PM values may be written with thousands separators (e.g. "1,234.5").
    for pm_col in ('pms_pm01_0', 'pms_pm10_0', 'pms_pm02_5'):
        df[pm_col] = (
            df[pm_col].astype(str).str.replace(',', '', regex=False).astype(float)
        )

    df.info()
    return df

Based on the above table for the (`device`, `when_captured`) key, let's see what these multiple values for each timestamp correspond to. Sometimes there is a negative RH, and sometimes 0.0 PM (which means very clean air).

In [294]:
def findBadData(df):
    """
    Report keys that carry more than one row.

    Rows are grouped by (`device_urn`, `device_sn`, `when_captured`); every
    group with more than one row is printed, followed by the unique
    `device_sn` values that own such groups.

    arg: df -- dataframe holding at least the three key columns
    return: dataframe of the offending keys with their row count in `size`,
            largest groups first
    """
    key_columns = ['device_urn', 'device_sn', 'when_captured']
    group_sizes = df.groupby(key_columns).size().to_frame('size').reset_index()
    group_sizes = group_sizes.sort_values('size', ascending=False)

    badRecords = group_sizes[group_sizes['size'] > 1]
    print("bad device data counts: ")
    print(badRecords)

    # Devices that reported more than one row for a single time stamp
    print("all bad device list: ")
    print(np.unique(badRecords['device_sn'].values))

    return badRecords

## Data Cleansing based on [Protocol](https://github.com/DataKind-SF/safecast/blob/master/Solarcast_data_cleansing.md)

In [295]:
def rmInvalidTimeStamps(df):
    """
    Remove records with invalid time stamps.

    Three filters are applied in order:
      1. rows whose `when_captured` is null
      2. rows whose `when_captured` is not later than 2000-01-19 (UTC)
      3. rows where `when_captured` and `service_uploaded` are 7 or more
         days apart (rows with a missing `service_uploaded` also fail
         this comparison and are dropped)

    arg: df -- dataframe with timezone-aware `when_captured` and
               `service_uploaded` columns
    return: filtered dataframe (the input is not modified)
    """
    # The cutoff was previously written as the bare expression 2000/1/19,
    # i.e. a division yielding ~105, which pandas reads as nanoseconds after
    # the epoch -- so nothing before the year 2000 was ever rejected.
    # NOTE(review): date taken from that literal; confirm against the protocol.
    earliest_valid = pd.Timestamp('2000-01-19', tz='UTC')
    max_lag = pd.Timedelta(days=7)

    ## remove records with NULL `when_captured`
    print("Null date records to remove: ", df['when_captured'].isna().sum())
    df = df[df['when_captured'].notna()]
    print("df shape after remove records with NULL `when_captured` : ",df.shape)

    ## remove records where `when_captured` is an invalid
    is_valid_date = df['when_captured'] > earliest_valid
    print("Valid `when_captured`  entires: ", is_valid_date.sum())
    df = df[is_valid_date]
    print("df shape after remove records where `when_captured` is an invalid : ",df.shape)

    ## remove records where gap of `service_uploaded` and `when_captured` > 7 days
    # Compare timedeltas directly: casting to 'timedelta64[D]' is not
    # supported by current pandas releases.
    lag = (df['when_captured'] - df['service_uploaded']).abs()
    within_lag = lag < max_lag
    print("Lag 7 days to remove: ",df.shape[0] - within_lag.sum())
    df = df[within_lag]
    print("df shape after records where gap of `service_uploaded` and `when_captured` > 7 days : ",df.shape)

    return df

In [296]:
def imputeInaccurateRH(df):
    """
    Blank out physically impossible relative-humidity readings.

    Any `env_humid` value below 0 or above 100 is overwritten with NaN
    directly in the passed dataframe; the number of overwritten values
    is printed.

    arg: df -- dataframe with an `env_humid` column
    return: the same dataframe object, modified in place
    """
    humidity = df['env_humid']
    out_of_range = (humidity < 0) | (humidity > 100)
    df.loc[out_of_range, 'env_humid'] = np.nan
    print("Inaccurate RH records imputed: ", out_of_range.sum())

    return df

In [297]:
def dropServiceUploaded(df):
    """
    Remove the 'service_uploaded' column from the dataframe.

    The frame passed in is modified directly; nothing is returned.
    """
    df.drop(columns=['service_uploaded'], inplace=True)

Drop Duplicates

In [298]:
def rmDuplicates(df):
    """
    Drop fully duplicated rows from the dataframe.

    Rows are compared across every column; the first occurrence of each
    duplicated row is kept. The frame passed in is modified directly and
    the number of removed rows is printed; nothing is returned.
    """
    rows_before = df.shape[0]
    # No subset given: all columns take part in the comparison.
    df.drop_duplicates(keep='first', inplace=True)
    rows_after = df.shape[0]
    print("Number of duplicative entries removed : ", rows_before - rows_after)

# #testing inplace = True and no return in function above
# df = pd.DataFrame(np.arange(12).reshape(3, 4),columns=['A', 'B', 'C', 'D'])
# df.loc[-1] = [0, 1, 2, 3] 
# df
# rmDuplicates(df)
# df

### Filtering bad row records

In [299]:
def dataAggWithKey(df):
    """
    Aggregate the df based on key: 'device_sn','when_captured'

    For every key, each remaining column is summarised by its number of
    non-null values ('count') and its number of distinct values
    ('nunique'). The per-key maximum over all columns of each statistic is
    reported as COUNTS and DISTINCTS respectively.

    The statistic columns are selected by label, so the function works for
    any number of value columns.

    arg: df - incoming dataframe
    return: (merged, num_groups)
        merged     - dataframe with columns 'device_sn', 'when_captured',
                     'COUNTS' and 'DISTINCTS', one row per key
        num_groups - number of distinct keys
    """
    key_cols = ['device_sn', 'when_captured']

    # STEP 1: Aggregate the dataframe based on key
    # Result columns form a two-level index: (original column, statistic).
    temp_df = df.groupby(key_cols).agg(['count', 'nunique'])
    num_groups = temp_df.shape[0]
    print("num_groups  is : ", num_groups)

    # STEP 2: Merge Counts and Count-Distincts to check for duplicative records and multiplicities
    counts = temp_df.xs('count', axis=1, level=1)
    distincts = temp_df.xs('nunique', axis=1, level=1)
    tmp_df1 = counts.max(axis=1).to_frame('COUNTS').reset_index()
    tmp_df2 = distincts.max(axis=1).to_frame('DISTINCTS').reset_index()
    print(tmp_df1.shape, tmp_df2.shape)
    merged = pd.merge(tmp_df1, tmp_df2, on=key_cols)
    return merged, num_groups

### Calculating hits: Impose mutually exclusive conditions for filtering

In [300]:
def identifyALLNanRecs(merged):
    """
        Actionable: keys whose records carry no usable data (all NaNs)

        A key is selected when COUNTS > 1 and DISTINCTS == 0. The number of
        selected keys is printed.

        args: incoming dataframe with COUNTS and DISTINCTS for each key
        return : (number of selected keys,
                  keys dataframe ('device_sn', 'when_captured') to remove later)
    """
    is_all_nan = merged['COUNTS'].gt(1) & merged['DISTINCTS'].eq(0)
    n_hits = is_all_nan.sum()
    print(n_hits)
    keys_to_drop = merged.loc[is_all_nan, ['device_sn', 'when_captured']]
    return n_hits, keys_to_drop

In [301]:
def identifyMultivaluedTimeStamps(merged):
    """
        Actionable: keys (`device_sn`, `when_captured`) that hold several
        rows with differing values [must be all discarded]

        A key is selected when COUNTS > 1 and DISTINCTS > 1. The number of
        selected keys is printed.

        args: incoming dataframe with COUNTS and DISTINCTS for each key
        return : (number of selected keys,
                  keys dataframe ('device_sn', 'when_captured') to remove later)
    """
    is_multivalued = merged['COUNTS'].gt(1) & merged['DISTINCTS'].gt(1)
    n_hits = is_multivalued.sum()
    print(n_hits)
    keys_to_drop = merged.loc[is_multivalued, ['device_sn', 'when_captured']]
    return n_hits, keys_to_drop

In [302]:
def identifyRemainingDupl(merged):
    """
        NOT Actionable as duplicates were dropped:
        keys whose rows are pure duplicates of one another
        [preserve only 1 later]

        A key is selected when COUNTS > 1 and DISTINCTS == 1. The surplus
        of COUNTS over DISTINCTS across the selected keys is printed as a
        check.

        args: incoming dataframe with COUNTS and DISTINCTS for each key
        return : number of selected keys
    """
    is_pure_dup = merged['COUNTS'].gt(1) & merged['DISTINCTS'].eq(1)
    total_counts = merged.loc[is_pure_dup, 'COUNTS'].sum()
    total_distincts = merged.loc[is_pure_dup, 'DISTINCTS'].sum()
    print("remaining duplicates check : " ,total_counts - total_distincts)
    return is_pure_dup.sum()

In [303]:
def goodTimeStamps(merged):
    """
        Count the keys that are good: exactly one record with one
        distinct value (COUNTS == 1 and DISTINCTS == 1)

        args: incoming dataframe with COUNTS and DISTINCTS for each key
        return : number of good keys (also printed)
    """
    is_good = merged['COUNTS'].eq(1) & merged['DISTINCTS'].eq(1)
    n_good = is_good.sum()
    print('good records : ', n_good)
    return n_good

In [304]:
def writeDF(dt, dframe, descrpt):
    """
        Write a dataframe (e.g. multivalued timestamps' keys) to a csv
        in the current working directory

        The file is named '<dt>-01_<descrpt>.csv'. The shape of the frame
        is printed before writing.

        args: dt: month identifier used as the file name prefix
              dframe: dataframe to write
              descrpt: string with description to append to file name
    """
    print("written records shape : ", dframe.shape)
    out_name = f"{dt!s}-01_{descrpt!s}.csv"
    dframe.to_csv(out_name)

### Discard bad data now from the main dataframe


In [305]:
def filterRows(toDiscard1, toDiscard3, df):
    """
        Discard rows of df whose ('device_sn', 'when_captured') key appears
        in either list of keys to remove.

        Side effect: a helper column 'KEY_DevSN_WhenCapt' holding the key
        tuple is added to the passed-in df; the returned frame carries it
        as well.

        args:
            toDiscard1: allNaN record keys
            toDiscard3: MultivaluedTimeStamps keys
            df: original dataframe
        return: new dataframe without the discarded rows
    """
    key_col = 'KEY_DevSN_WhenCapt'

    # All keys to be discarded, as (device_sn, when_captured) tuples
    discard = pd.concat([toDiscard1, toDiscard3], ignore_index=True)
    discard[key_col] = list(zip(discard['device_sn'], discard['when_captured']))
    print(df.shape, discard.shape)

    # Same tuple key for every row of the main dataframe
    df[key_col] = list(zip(df['device_sn'], df['when_captured']))

    # Flag and drop the rows whose key is in the discard list
    rows_to_discard = df[key_col].isin(discard[key_col])
    print("these many rows to discard: ", rows_to_discard.sum())

    rows_before = df.shape[0]
    kept = df[~rows_to_discard]
    print(rows_before - kept.shape[0])

    return kept