In [None]:
#---------------------------Execute this cell without changing anything-----------------------------------------
import os
import pandas as pd
import datetime
from math import sin, asin, cos, radians, fabs, sqrt

In [None]:
#---------------------------Execute this cell without changing anything-----------------------------------------
def hav(theta):
    """
    Haversine of an angle: sin(theta / 2) squared.

    :param theta: angle in radians (here: a latitude or longitude difference), float
    :return:
    the squared sine of half the angle, float
    """
    half_angle = theta / 2
    half_sine = sin(half_angle)
    return half_sine * half_sine


def get_distance_hav(lat0, lng0, lat1, lng1):
    """
    Great-circle distance between two points using the haversine formula.

    :param lat0: 1st location_latitude in degrees, float
    :param lng0: 1st location_longitude in degrees, float
    :param lat1: 2nd location_latitude in degrees, float
    :param lng1: 2nd location_longitude in degrees, float
    :return:
    distance: distance between 2 points in meter , float
    """
    # Radius is expressed in kilometres; the result is converted to metres
    # on return.
    EARTH_RADIUS = 6371.0
    lat0 = radians(lat0)
    lat1 = radians(lat1)
    lng0 = radians(lng0)
    lng1 = radians(lng1)

    dlng = fabs(lng0 - lng1)
    dlat = fabs(lat0 - lat1)
    h = hav(dlat) + cos(lat0) * cos(lat1) * hav(dlng)
    # Floating-point rounding can push h a hair above 1 for (near-)antipodal
    # points, which would make asin() raise ValueError; clamp it back.
    h = min(1.0, h)
    distance = 2.0 * EARTH_RADIUS * asin(sqrt(h))

    return distance * 1000


# Bounding box of every supported city: (min_lon, max_lon, min_lat, max_lat).
_CITY_BOUNDS = {
    "Cangzhou": (116.806839, 116.921283, 38.272036, 38.361539),
    "Foshan": (112.769371, 113.396912, 22.647799, 23.588459),
    "Tianjin": (116.631101, 118.091252, 38.569749, 40.267326),
}


def _nearest_reference_row(c_data, start, step, flagged):
    """Find the closest row usable as an interpolation anchor.

    Scans positions start, start + step, ... and returns the first one that
    still holds data (was not blanked out) and is not itself flagged as an
    extreme value.  Returns None when the scan runs off either end of the
    frame without finding such a row.
    """
    pos = start
    while 0 <= pos < len(c_data):
        if pos not in flagged and c_data.iloc[pos].notnull().any():
            return pos
        pos += step
    return None


def cleaning(c_data, location_devid, city, type_device):
    """
    Run the data cleaning checks over one device's readings.

    Rows that fail a check are overwritten with NaN (they are not removed),
    and PM2.5 readings flagged as extreme are replaced by a linear
    interpolation between the nearest trusted rows.  The frame is modified
    in place and the same object is returned.

    :param c_data: dataset from csv file, indexed by timestamp and holding the
                   columns "lat", "lon" and "pm2d5", pandas.DataFrame
    :param location_devid: name of csv file, string
    :param city: name of the city: "Cangzhou","Foshan", or "Tianjin", string
    :param type_device: device type: "Static" or "Mobile", string
    :return:
    c_data: processed dataset after data cleaning algorithm, pandas.DataFrame
    :raises Exception: if a row reaches the city or device-type check with an
                       unsupported city or device type
    """
    stat_dict = {"Original": len(c_data)}
    nullno = 0
    zerono = 0
    smallno = 0
    locno = 0
    aloneTime = 0
    # Positions of the last rows that passed the checks; -1 means "none yet".
    outlier_pre2idx = -1
    outlier_preidx = -1
    loc_preidx = -1
    exe_predix = -1
    speedno = 0
    deno = 0
    adjust_value = []
    station_flag = True
    bounds = _CITY_BOUNDS.get(city)
    for i in range(len(c_data.index)):
        row = c_data.iloc[i]
        # -------------------------Check if PM2.5 or Location is missing----------------------------------

        if row.isnull().any():
            c_data.iloc[i] = float("nan")
            nullno += 1
            # print("Dropping row " + str(c_data.index[i]) + " because of missing value")
            continue

        # -------------------------Check if PM2.5 or Location equal or less than 0------------------------

        if (row <= 0).any():
            c_data.iloc[i] = float("nan")
            zerono += 1
            # print("Dropping row " + str(c_data.index[i]) + " because of equal or less than 0 reading")
            continue

        # -------------------------Check if PM2.5 is less than threshold (=1)-----------------------------

        # Columns are read by position: the frame is indexed by timestamp, so
        # an integer key must not be interpreted as an index label.
        latitude = c_data["lat"].iloc[i]
        longitude = c_data["lon"].iloc[i]
        pm2d5 = c_data["pm2d5"].iloc[i]

        if pm2d5 < 1:
            c_data.iloc[i] = float("nan")
            smallno += 1
            # print("Dropping row " + str(c_data.index[i]) + " because of small value in PM2.5")
            continue

        # -------------------------Check if location is in the designed region----------------------------

        if bounds is None:
            raise Exception('Select incorrect city')
        min_lon, max_lon, min_lat, max_lat = bounds
        if longitude > max_lon or longitude < min_lon or latitude > max_lat or latitude < min_lat:
            c_data.iloc[i] = float("nan")
            locno += 1
            print("Dropping row " + str(c_data.index[i]) + " because of outside of city")
            continue

        # ------Change current timestamp to previous valid timestamps for rest of the checks--------------

        if loc_preidx < 0:  # first valid row: nothing to compare against yet
            outlier_pre2idx = outlier_preidx
            outlier_preidx = i
            loc_preidx = i
            exe_predix = i
            continue

        # -------------------------Check if timestamp is repeated-----------------------------------------

        if c_data.index[i] == c_data.index[loc_preidx]:
            c_data.iloc[i] = float("nan")
            deno += 1
            print("Dropping row " + str(c_data.index[i]) + " because of depulicated")
            continue

        # ---Location reassignment for static device/Speed check (threshold=40m/s) for mobile device------

        if type_device == "Static":
            c_data.iloc[i] = fix_dv_loc(c_data.iloc[i], location_devid, city)
            if station_flag:
                # The very first valid row was skipped above, so fix its
                # location once, together with the second valid row.
                station_flag = False
                c_data.iloc[loc_preidx] = fix_dv_loc(c_data.iloc[loc_preidx], location_devid, city)
                print(str(location_devid) + " replace location with " + str(c_data.iloc[i]) + str(
                    c_data.iloc[loc_preidx]))
            loc_preidx = i

        elif type_device == "Mobile":
            prelatitude = c_data["lat"].iloc[loc_preidx]
            prelongitude = c_data["lon"].iloc[loc_preidx]

            af_distance = get_distance_hav(prelatitude, prelongitude, latitude, longitude)
            af_time_diff = c_data.index[i] - c_data.index[loc_preidx]

            # total_seconds() covers the whole gap; .seconds would drop whole
            # days and truncate sub-second gaps to 0.
            if af_distance / af_time_diff.total_seconds() > 40:
                c_data.iloc[i] = float("nan")
                speedno += 1
                print("Dropping row " + str(c_data.index[i]) + " because of location: invalid speed")
                continue
            loc_preidx = i
        else:
            raise Exception('Select incorrect type of device')

        # ---------Check if the change of PM2.5 concentration exceeds the threshold (45 mug/s)-----------

        prepm2d5 = c_data["pm2d5"].iloc[exe_predix]
        af_time_diff = c_data.index[i] - c_data.index[exe_predix]

        if abs(prepm2d5 - pm2d5) / af_time_diff.total_seconds() > 45:
            # Only remember the row here; it is repaired after the main loop.
            adjust_value.append(i)
        else:
            exe_predix = i

        # ------------------------Shift the index for rest of the checks---------------------------------

        if outlier_pre2idx < 0:
            outlier_pre2idx = outlier_preidx
            outlier_preidx = i
            continue

        # ---------Check if there is any valid data within the threshold (1 minutes as radius)-----------
        be_time_diff = c_data.index[outlier_preidx] - c_data.index[outlier_pre2idx]
        af_time_diff = c_data.index[i] - c_data.index[outlier_preidx]

        if (be_time_diff > datetime.timedelta(minutes=1)) and (af_time_diff > datetime.timedelta(minutes=1)):
            # The previous valid row has no neighbour within a minute on
            # either side, so it is the one that gets dropped.
            c_data.iloc[outlier_preidx] = float("nan")
            print("Dropping row " + str(c_data.index[outlier_preidx]) + " because of no data before and after")
            aloneTime += 1
            outlier_preidx = i
            continue

        outlier_pre2idx = outlier_preidx
        outlier_preidx = i

    # -------------Adjust the extreme values of PM2.5 concentration that is detected before--------------

    exno = 0
    print(adjust_value)
    flagged = set(adjust_value)  # O(1) membership tests while scanning for anchors
    for adj_indx in adjust_value:
        if c_data.iloc[adj_indx].isnull().any():
            # The row was dropped by a later check; nothing left to adjust.
            continue

        if adj_indx == len(c_data) - 1:
            c_data.iloc[adj_indx] = float("nan")
            exno += 1
            print("Dropping row " + str(c_data.index[adj_indx]) + " because of last exetreme value")
            break

        preindx = _nearest_reference_row(c_data, adj_indx - 1, -1, flagged)
        nextindx = _nearest_reference_row(c_data, adj_indx + 1, 1, flagged)

        if preindx is None or nextindx is None:
            # Without a trusted reading on both sides there is nothing to
            # interpolate between, so the row is dropped instead of being
            # filled from a blanked-out or extreme neighbour.
            c_data.iloc[adj_indx] = float("nan")
            exno += 1
            print("Dropping row " + str(c_data.index[adj_indx]) + " because of extreme value without valid neighbours")
            continue

        pre_val = c_data["pm2d5"].iloc[preindx]
        next_val = c_data["pm2d5"].iloc[nextindx]

        pre_T = c_data.index[preindx]
        cur_T = c_data.index[adj_indx]
        next_T = c_data.index[nextindx]

        print("Extreme value found: " + str(cur_T) + " " + str(c_data["pm2d5"].iloc[adj_indx]))
        # Linear interpolation in time between the two anchor rows.
        span_seconds = (next_T - pre_T).total_seconds()
        offset_seconds = (cur_T - pre_T).total_seconds()
        adj_value = (((next_val - pre_val) / span_seconds) * offset_seconds) + pre_val
        print("Adjusted value: " + str(adj_value))
        # Single positional write; chained c_data['pm2d5'][...] = ... may
        # assign to a temporary copy instead of the frame.
        c_data.iloc[adj_indx, c_data.columns.get_loc("pm2d5")] = adj_value
        exno += 1

    # -------------Total number of removals or adjustments in each check---------------------------------

    stat_dict["Null"] = nullno
    stat_dict["Zero"] = zerono
    stat_dict["Small_Value"] = smallno
    stat_dict["Invalid_Loc"] = locno
    stat_dict["Time_outlier"] = aloneTime
    stat_dict["Invalid_speed"] = speedno
    stat_dict["Extreme_value"] = exno
    stat_dict["Duplicated"] = deno
    # Report the counters; the return value stays the cleaned frame only.
    print("Cleaning statistics: " + str(stat_dict))

    # ---------------------------------------------------------------------------------------------------
    return c_data


def _fixed_site(lat, lon):
    """Table entry for a device that is always reassigned to one location."""
    return {"axis": None, "cutoffs": (), "sites": ((lat, lon),), "label": None}


def _split_sites(cutoffs, sites, axis="lat", label=None):
    """Table entry for a device whose location depends on the reported coordinate.

    ``sites`` holds one more (lat, lon) pair than ``cutoffs``: site k is used
    when the reading is below cutoff k, and the last site when the reading is
    above the last cutoff.
    """
    return {"axis": axis, "cutoffs": tuple(cutoffs), "sites": tuple(sites), "label": label}


_FOSHAN_FIVE_WAY_CUTOFFS = (23.028, 23.035, 23.045, 23.049)

# city -> device name -> location reassignment rule.
_STATIC_DEVICE_SITES = {
    "Cangzhou": {
        "D03": _fixed_site(38.315787, 116.858654),
        "D01": _split_sites(
            (38.318,),
            ((38.316578, 116.816628), (38.322876, 116.856699)),
        ),
        "D02": _split_sites(
            (38.3155, 38.3175),
            ((38.315219, 116.851226), (38.315811, 116.858624), (38.318135, 116.854457)),
        ),
        "D04": _split_sites(
            (38.320,),
            ((38.312245, 116.867009), (38.341947, 116.892784)),
        ),
        "D05": _split_sites(
            (38.295, 38.3125),
            ((38.291382, 116.874928), (38.310168, 116.857631), (38.315824, 116.858608)),
        ),
    },
    "Foshan": {
        "ETAR7002-056": _split_sites(
            (23.044,),
            ((23.035865, 113.157705), (23.045572, 113.164823)),
        ),
        "ETAR7002-063": _split_sites(
            (23.053,),
            ((23.050539, 113.135121), (23.054445, 113.141845)),
        ),
        "ETAR7002-064": _fixed_site(23.057191, 113.142105),
        "ETAR7002-069": _fixed_site(23.035945, 113.142722),
        "ETAR7002-084": _split_sites(
            (23.053,),
            ((23.050543, 113.135110), (23.054443, 113.141841)),
        ),
        "ETAR7002-102": _fixed_site(23.051252, 113.125344),
        "ETAR7002-103": _fixed_site(23.029907, 113.149094),
        "ETAR7002-110": _fixed_site(23.044381, 113.142363),
        "ETAR7002-125": _split_sites(
            (23.062,),
            ((23.060325, 113.157607), (23.063979, 113.139616)),
        ),
        "ETAR7002-126": _split_sites(
            (23.035,),
            ((23.024730, 113.131095), (23.048606, 113.156814)),
        ),
        "ETAR7002-060": _split_sites(
            _FOSHAN_FIVE_WAY_CUTOFFS,
            (
                (23.024647, 113.130731),
                (23.029957, 113.149148),
                (23.044371, 113.142512),
                (23.048530, 113.156843),
                (23.050547, 113.135107),
            ),
        ),
        "ETAR7002-117": _split_sites(
            _FOSHAN_FIVE_WAY_CUTOFFS,
            (
                (23.024535, 113.130195),
                (23.029941, 113.149138),
                (23.044420, 113.142525),
                (23.048528, 113.156816),
                (23.054515, 113.141646),
            ),
        ),
    },
    "Tianjin": {
        "868323028598404": _fixed_site(39.11817, 117.73345),
        "868323028580022": _fixed_site(39.11817, 117.73345),
        "868323028622949": _fixed_site(39.11427, 117.75207),
        "868323028616990": _fixed_site(39.164307, 117.743876),
        "868323028595350": _fixed_site(39.164307, 117.743876),
        "868323028619473": _fixed_site(39.164307, 117.743876),
        "868323028624655": _fixed_site(39.15797, 117.76360),
        "868323028591797": _fixed_site(39.105499, 117.719206),
        "868323028610456": _fixed_site(39.143889, 117.731185),
        # This device is split on longitude, not latitude, and its error
        # message uses a station label rather than the device name.
        "868323028601950": _split_sites(
            (117.75,),
            ((39.117534, 117.733440), (39.205562, 117.778882)),
            axis="lon",
            label="汉北路D泵站",
        ),
        "868323028589270": _fixed_site(39.184135, 117.771626),
        "868323028618657": _fixed_site(39.184135, 117.771626),
        "868323028616073": _fixed_site(39.184135, 117.771626),
    },
}


def fix_dv_loc(data_row, name, city):
    """
    Replace the reported coordinates of a static device with its known location.

    Devices with several known locations are resolved by comparing the
    reported latitude (longitude for one Tianjin device) against fixed
    cutoffs.  The row is modified in place and also returned.

    :param data_row: a single data row from static device that require location reassignment,
                     with "lat" and "lon" entries, pandas.Series
    :param name: name of csv file, string
    :param city: name of the city: "Cangzhou","Foshan", or "Tianjin", string
    :return:
    data_row: modified data row, pandas.Series
    :raises Exception: if the city is unknown, the device has no reassignment
                       rule, or the reading sits exactly on the last cutoff /
                       is NaN so that no location matches
    """
    devices = _STATIC_DEVICE_SITES.get(city)
    if devices is None:
        raise Exception('Select incorrect city')

    rule = devices.get(name)
    if rule is None:
        raise Exception('Static device data is not processed for location reassignment')

    cutoffs = rule["cutoffs"]
    sites = rule["sites"]
    if not cutoffs:
        chosen = sites[0]
    else:
        reading = data_row[rule["axis"]]
        for cutoff, site in zip(cutoffs, sites):
            if reading < cutoff:
                chosen = site
                break
        else:
            if reading > cutoffs[-1]:
                chosen = sites[-1]
            else:
                # data_row.name is the row's own label; data_row.index would
                # print the column labels instead of identifying the row.
                raise Exception(
                    str(data_row.name) + " not belong to any location in " + (rule["label"] or name))

    data_row['lat'], data_row['lon'] = chosen
    return data_row


In [None]:
# Driver cell: pick a city and a device type, then run cleaning() on every
# CSV file found in ./<city>/<device type>/.
# List of Options
Cities = ["Cangzhou", "Foshan", "Tianjin"]
Type_of_Device = ["Static", "Mobile"]
#---------------------------Only Modify the following section-----------------------------------------
# Choose which folder to process the cleaning algorithm

# Modify * in Cities[*] to choose which city's data to process:
# 0 represents "Cangzhou", 1 represents "Foshan", 2 represents "Tianjin"

# Modify * in Type_of_Device[*] to choose which device type's data to process:
# 0 represents static device, 1 represents mobile device

# NOTE: the two lines below are fill-in placeholders; the cell is not valid
# Python until each * has been replaced by an index.
chosen_city = Cities[*]
chosen_type = Type_of_Device[*]

#-------------------------------------------------------------------------------------------------------
# Process the data cleaning algorithm
directory = "./" + chosen_city + "/" + chosen_type + "/"
for devid in os.listdir(directory):
    # Skip every directory entry whose name does not end in "csv".
    if devid[-3:] != "csv":
        continue
    print("Processing file " + devid)
    # The first CSV column is parsed as dates and used as the index.
    data = pd.read_csv(directory + devid, index_col=[0], parse_dates=[0])
    # The file name without its last four characters (".csv") is passed as the device name.
    # NOTE(review): processed_data is rebound on every iteration and nothing in
    # this cell writes it out - confirm whether saving happens elsewhere.
    processed_data = cleaning(data, devid[:-4], chosen_city, chosen_type)
#---------------------------Execute this cell after finish the modification-----------------------------------------