In [2]:
#Load the pandas library needed for dataframes
import pandas as pd
import os
import numpy as np
import csv
from math import sin, cos, sqrt, atan2, radians

In [7]:
#Load the tab-separated _roads.tcv file. The first line of the file is skipped and no header
#row is used, so the columns are simply numbered. low_memory is switched off to help load the file.
df_roads = pd.read_csv(
    "_roads.tcv",
    sep="\t",
    header=None,
    skiprows=1,
    low_memory=False,
)

In [8]:
#Preview the first rows to check that everything is loaded correctly
df_roads.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4035,4036,4037,4038,4039,4040,4041,4042,4043,4044
0,N1,LRPS,23.706028,90.443333,LRPSa,23.702917,90.450417,LRPSb,23.702778,90.450472,...,92.29825,LRP466c,20.864667,92.298194,LRP467,20.862972,92.298083,LRPE,20.862917,92.298083
1,N101,LRPS,23.454139,91.212861,LRPSa,23.461889,91.212,LRP001,23.462944,91.211806,...,,,,,,,,,,
2,N102,LRPS,23.478972,91.118194,LRPSa,23.481583,91.116777,LRPSb,23.486666,91.113361,...,,,,,,,,,,
3,N103,LRPS,23.957028,91.115528,LRP001,23.961917,91.113611,LRP001a,23.967666,91.111889,...,,,,,,,,,,
4,N104,LRPS,23.009667,91.399416,LRPSa,23.009278,91.39525,LRP001,23.009306,91.389805,...,,,,,,,,,,


In [5]:
#Small df created to test on. 
# df_test = df_roads.iloc[0:5,:]

In [6]:
# df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4035,4036,4037,4038,4039,4040,4041,4042,4043,4044
0,N1,LRPS,23.706028,90.443333,LRPSa,23.702917,90.450417,LRPSb,23.702778,90.450472,...,92.29825,LRP466c,20.864667,92.298194,LRP467,20.862972,92.298083,LRPE,20.862917,92.298083
1,N101,LRPS,23.454139,91.212861,LRPSa,23.461889,91.212,LRP001,23.462944,91.211806,...,,,,,,,,,,
2,N102,LRPS,23.478972,91.118194,LRPSa,23.481583,91.116777,LRPSb,23.486666,91.113361,...,,,,,,,,,,
3,N103,LRPS,23.957028,91.115528,LRP001,23.961917,91.113611,LRP001a,23.967666,91.111889,...,,,,,,,,,,
4,N104,LRPS,23.009667,91.399416,LRPSa,23.009278,91.39525,LRP001,23.009306,91.389805,...,,,,,,,,,,


In [17]:
def _is_strictly_between(first, middle, last):
    """Return True if middle lies strictly between first and last, in either direction."""
    return (first < middle < last) or (first > middle > last)


def replace_outlier(row):
    """Replace outlier coordinates of one road in the global df_roads, in place.

    The row is read as: column 0, then repeating groups of three columns in which the
    second and third cell hold the latitude and longitude of a point (latitudes in
    columns 2, 5, 8, ... and longitudes in columns 3, 6, 9, ...).

    For every three consecutive points a, b and c, the latitude of b is kept only when it
    lies strictly between the latitudes of a and c (ascending or descending). Otherwise it
    is overwritten with the midpoint of a and c. The longitude is handled the same way,
    independently of the latitude. Equal neighbouring values do not count as "in between".
    A value replaced in one step is used as point a/b in the following steps.

    Parameters
    ----------
    row : pandas.Series
        A row as passed by ``df_roads.apply(..., axis=1)``. Only ``row.name`` is used, as the
        positional row index into ``df_roads`` (so ``df_roads`` needs a default 0..n-1 index).

    Returns
    -------
    None
        The function works through its side effect on ``df_roads``.
    """
    row_pos = row.name

    #Count the filled (non-NaN) cells of the row once. Building the row and dropping the NaN
    #values on every step of the loop is expensive, and the count does not change as long as
    #the row has no gaps.
    n_filled = len(df_roads.iloc[row_pos, :].dropna())

    #Set where the first latitude and longitude values are available.
    firstlat = 2
    firstlon = 3

    #Walk over the points as long as the longitude of point c (the furthest cell read) is
    #still inside the filled part of the row. Checking the longitude index with a strict
    #comparison prevents reading past the end of a row whose last point is incomplete.
    while (firstlon + 6) < n_filled:

        #a is the previous point, c the next point and b the point that should fall in between
        lat_a = df_roads.iloc[row_pos, firstlat]
        lon_a = df_roads.iloc[row_pos, firstlon]
        lat_b = df_roads.iloc[row_pos, firstlat + 3]
        lon_b = df_roads.iloc[row_pos, firstlon + 3]
        lat_c = df_roads.iloc[row_pos, firstlat + 6]
        lon_c = df_roads.iloc[row_pos, firstlon + 6]

        #Latitude: replace b by the middle of a and c when it does not fall in between them
        if not _is_strictly_between(lat_a, lat_b, lat_c):
            df_roads.iloc[row_pos, firstlat + 3] = (lat_a + lat_c) / 2

        #Longitude: replace b by the middle of a and c when it does not fall in between them
        if not _is_strictly_between(lon_a, lon_b, lon_c):
            df_roads.iloc[row_pos, firstlon + 3] = (lon_a + lon_c) / 2

        #Check the next point
        firstlat += 3
        firstlon += 3

In [18]:
#Number of times the replace_outlier pass is applied to all rows of df_roads.
#Runtime will be longer if the number of iterations is higher (Warning: this took very long, 15min+!)
number_iterations = 5

In [19]:
#Run the outlier replacement over every row, once per requested iteration.
#replace_outlier changes df_roads in place, so the result of apply is not needed.
for iteration in range(number_iterations):
    _ = df_roads.apply(replace_outlier, axis=1)

In [20]:
#Check if LRP013c for Z1813 is fixed.
#NOTE(review): row 391 is presumably road Z1813 and columns 100-119 the points around LRP013c;
#these positions are hard-coded, so confirm them if the input file changes.
df_roads.iloc[391, 100:120]

100    LRP012e
101    22.1973
102    92.2949
103     LRP013
104    22.1976
105    92.2968
106    LRP013a
107    22.1978
108    92.2971
109    LRP013b
110    22.1978
111    92.2972
112    LRP013c
113    22.1978
114    92.2979
115    LRP013d
116    22.1977
117    92.2986
118    LRP013e
119    22.1976
Name: 391, dtype: object

In [21]:
#Quick fix as there were issues with exporting data values if the dataframe contained NaN values:
#forward-fill the NaN cells along each row (every NaN takes the last filled value to its left).
#DataFrame.ffill is used because the `method` argument of fillna is deprecated in recent pandas versions.
#NOTE(review): the filled frame is only displayed here and not assigned, so df_roads itself
#(and therefore the exported file) still contains the NaN values. Assign the result only if the
#export really should contain the forward-filled values.
df_roads.ffill(axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4035,4036,4037,4038,4039,4040,4041,4042,4043,4044
0,N1,LRPS,23.706,90.4433,LRPSa,23.7029,90.4504,LRPSb,23.7028,90.4505,...,92.2982,LRP466c,20.8647,92.2981,LRP467,20.863,92.2981,LRPE,20.8629,92.2981
1,N101,LRPS,23.4541,91.2129,LRPSa,23.4619,91.213,LRP001,23.4629,91.2132,...,91.2534,91.2534,91.2534,91.2534,91.2534,91.2534,91.2534,91.2534,91.2534,91.2534
2,N102,LRPS,23.479,91.1182,LRPSa,23.4816,91.1168,LRPSb,23.4867,91.1134,...,91.1147,91.1147,91.1147,91.1147,91.1147,91.1147,91.1147,91.1147,91.1147,91.1147
3,N103,LRPS,23.957,91.1155,LRP001,23.9619,91.1136,LRP001a,23.9677,91.1119,...,91.1093,91.1093,91.1093,91.1093,91.1093,91.1093,91.1093,91.1093,91.1093,91.1093
4,N104,LRPS,23.0097,91.3994,LRPSa,23.0096,91.3952,LRP001,23.0096,91.3898,...,91.1014,91.1014,91.1014,91.1014,91.1014,91.1014,91.1014,91.1014,91.1014,91.1014
5,N105,LRPS,23.6904,90.5466,LRP001,23.6984,90.5511,LRP001a,23.6993,90.5515,...,90.3582,90.3582,90.3582,90.3582,90.3582,90.3582,90.3582,90.3582,90.3582,90.3582
6,N106,LRPS,22.3936,91.8216,LRPSa,22.3975,91.82,LRPSb,22.3991,91.8198,...,92.2018,92.2018,92.2018,92.2018,92.2018,92.2018,92.2018,92.2018,92.2018,92.2018
7,N107,LRPS,22.3944,91.8918,LRPSa,22.3922,91.8925,LRPSb,22.3919,91.8931,...,91.9236,91.9236,91.9236,91.9236,91.9236,91.9236,91.9236,91.9236,91.9236,91.9236
8,N108,LRPS,22.1021,92.0741,LRPSa,22.1038,92.0824,LRP001,22.104,92.0836,...,92.2176,92.2176,92.2176,92.2176,92.2176,92.2176,92.2176,92.2176,92.2176,92.2176
9,N109,LRPS,21.4434,92.1015,LRPSa,21.4423,92.1017,LRPSb,21.4419,92.1019,...,92.0953,92.0953,92.0953,92.0953,92.0953,92.0953,92.0953,92.0953,92.0953,92.0953


In [23]:
#Save the updated data as a tab-separated file under a new name, without index or header row.
#Remember to rename the file for the java program!
df_roads.to_csv(
    "_roads_new.tcv",
    sep="\t",
    header=False,
    index=False,
)