In [1]:
#Load the package libraries needed
import pandas as pd
import os
import numpy as np
import csv

from pandas import ExcelWriter
from pandas import ExcelFile

In [2]:
# Load the LRP table from Roads_InfoAboutEachLRP.csv into a DataFrame
# (this data file was chosen because it is the easier one to manipulate).
df_roadsinfo = pd.read_csv(filepath_or_buffer='Roads_InfoAboutEachLRP.csv')

In [3]:
# Preview the first rows to confirm that the CSV was parsed into the expected
# columns (road, chainage, lrp, lat, lon, type, name).
df_roadsinfo.head()

Unnamed: 0,road,chainage,lrp,lat,lon,type,name
0,N1,0.0,LRPS,23.706028,90.443333,Others,Start of Road after Jatrabari Flyover infront...
1,N1,0.814,LRPSa,23.702917,90.450417,Culvert,Box Culvert
2,N1,0.822,LRPSb,23.702778,90.450472,CrossRoad,Intersection with Z1101
3,N1,1.0,LRP001,23.702139,90.451972,KmPost,Km post missing
4,N1,2.0,LRP002,23.697889,90.460583,KmPost,Km post missing


In [4]:
# Add two columns with the absolute change in latitude and longitude between
# each LRP and the previous row of the same road.
# - Grouping by road prevents the first LRP of a road from being compared with
#   the last LRP of the road listed before it.
# - The first row of every road has no predecessor; its missing difference is set to 0.
# NOTE(review): this assumes the rows are already ordered by chainage within each
# road - nothing in this cell sorts them, so confirm against the input file.
df_roadsinfo["lat_diff"] = df_roadsinfo["lat"].groupby(df_roadsinfo["road"]).diff().fillna(0).abs()
df_roadsinfo["lon_diff"] = df_roadsinfo["lon"].groupby(df_roadsinfo["road"]).diff().fillna(0).abs()

df_roadsinfo.head()

Unnamed: 0,road,chainage,lrp,lat,lon,type,name,lat_diff,lon_diff
0,N1,0.0,LRPS,23.706028,90.443333,Others,Start of Road after Jatrabari Flyover infront...,0.0,0.0
1,N1,0.814,LRPSa,23.702917,90.450417,Culvert,Box Culvert,0.003111,0.007084
2,N1,0.822,LRPSb,23.702778,90.450472,CrossRoad,Intersection with Z1101,0.000139,5.6e-05
3,N1,1.0,LRP001,23.702139,90.451972,KmPost,Km post missing,0.000639,0.0015
4,N1,2.0,LRP002,23.697889,90.460583,KmPost,Km post missing,0.00425,0.008611


In [5]:
# Derive one outlier boundary per road for each coordinate difference:
# 1.96 times the standard deviation of that road's lat_diff / lon_diff values.
# Both results are Series indexed by road name.
std_lat, std_lon = (
    df_roadsinfo.groupby(["road"])[column].std().mul(1.96)
    for column in ("lat_diff", "lon_diff")
)

In [6]:
# Inspect the latitude boundaries: one value per road (1.96 * std of that road's lat_diff).
std_lat

road
N1       0.170943
N101     0.008024
N102     0.406995
N103     0.005190
N104     0.008686
N105     0.005488
N106     0.003181
N107     0.003846
N108     0.002942
N109     0.003896
N110     0.001802
N111     0.291729
N112     0.005093
N119     0.003328
N120     0.004822
N123     0.004815
N124     0.009817
N125     0.005681
N126     0.005681
N127     0.001296
N128     0.002804
N129     0.003525
N2       0.522193
N203     0.001580
N204     0.056393
N205     0.005865
N206     0.002118
N207     0.224025
N208     0.008400
N209     0.002571
           ...   
Z8606    0.002070
Z8607    0.460692
Z8611    0.003418
Z8699    0.005600
Z8701    0.005443
Z8702    0.002823
Z8703    0.001148
Z8704    0.003928
Z8705    0.002868
Z8706    0.001403
Z8708    0.004945
Z8709    0.002186
Z8713    0.003417
Z8716    0.002875
Z8717    0.235083
Z8740    0.000873
Z8750    0.001827
Z8803    0.005408
Z8804    0.006576
Z8806    0.011021
Z8810    0.005458
Z8814    0.051855
Z8815    0.002934
Z8905    0.021887
Z8909

In [7]:
 def delete_outliers(row, lat_bounds=None, lon_bounds=None):
     """Label one LRP row as an outlier when its jump exceeds the boundary of its road.

     The function only identifies outliers; it does not remove anything itself.
     It is meant to be used with ``DataFrame.apply(..., axis=1)``.

     Parameters
     ----------
     row : pandas.Series
         A single row providing the fields "road", "lat_diff" and "lon_diff".
     lat_bounds, lon_bounds : mapping with a ``.get`` method, optional
         Per-road boundaries for the latitude / longitude difference, keyed by
         road name. When omitted, the notebook-level ``std_lat`` / ``std_lon``
         Series are used.

     Returns
     -------
     str or None
         "delete" if either difference is larger than its boundary, otherwise None.
     """
     if lat_bounds is None:
         lat_bounds = std_lat
     if lon_bounds is None:
         lon_bounds = std_lon

     # Look up the boundaries that belong to the road of this row.
     boundary_lat = lat_bounds.get(row["road"])
     boundary_lon = lon_bounds.get(row["road"])

     # Read the differences from the row itself rather than from the global frame
     # by position: that lookup only worked with a default 0..n-1 index and a
     # fixed column order.
     # A missing boundary (None) cannot be compared, so such a row is not flagged;
     # a NaN boundary compares as False and is therefore not flagged either.
     if boundary_lat is not None and row["lat_diff"] > boundary_lat:
         return "delete"
     if boundary_lon is not None and row["lon_diff"] > boundary_lon:
         return "delete"
     return None

In [8]:
# Run delete_outliers over the frame one row at a time and store its result
# ("delete" for a flagged row, nothing otherwise) in the new column "delete_c".
df_roadsinfo["delete_c"] = df_roadsinfo.apply(delete_outliers, axis="columns")

In [9]:
# Display the frame to check that the "delete_c" column was added by the apply step.
# NOTE(review): this shows the frame as a whole; filtering on delete_c == "delete"
# would make the flagged rows directly visible.
df_roadsinfo

Unnamed: 0,road,chainage,lrp,lat,lon,type,name,lat_diff,lon_diff,delete_c
0,N1,0.000,LRPS,23.706028,90.443333,Others,Start of Road after Jatrabari Flyover infront...,0.000000,0.000000,
1,N1,0.814,LRPSa,23.702917,90.450417,Culvert,Box Culvert,0.003111,0.007084,
2,N1,0.822,LRPSb,23.702778,90.450472,CrossRoad,Intersection with Z1101,0.000139,0.000056,
3,N1,1.000,LRP001,23.702139,90.451972,KmPost,Km post missing,0.000639,0.001500,
4,N1,2.000,LRP002,23.697889,90.460583,KmPost,Km post missing,0.004250,0.008611,
5,N1,2.130,LRP002a,23.697361,90.461667,Culvert,Box culvert,0.000528,0.001083,
6,N1,3.000,LRP003,23.693833,90.469138,KmPost,Km post missing,0.003528,0.007472,
7,N1,4.000,LRP004,23.693611,90.478777,KmPost,Km post missing,0.000222,0.009639,
8,N1,4.175,LRP004a,23.693805,90.480527,"SideRoad,Right",Road to Narayanganj(R111),0.000194,0.001750,
9,N1,5.000,LRP005,23.694750,90.488500,KmPost,Km post missing,0.000944,0.007973,


In [10]:
# Simple outlier handling: keep only the records that were not labelled "delete".
# NOTE(review): the differences are taken against the previous row, so the LRP
# directly after a misplaced point can exceed the boundary as well and be dropped
# together with it - verify that this is acceptable.
df_roadsinfo = df_roadsinfo.loc[df_roadsinfo["delete_c"].ne("delete")]

In [11]:
# Keep only the seven data columns, which removes the helper columns
# (lat_diff, lon_diff, delete_c) again, then preview the result.
df_roadsinfo = df_roadsinfo.loc[:, ["road", "chainage", "lrp", "lat", "lon", "type", "name"]]

df_roadsinfo.head()

Unnamed: 0,road,chainage,lrp,lat,lon,type,name
0,N1,0.0,LRPS,23.706028,90.443333,Others,Start of Road after Jatrabari Flyover infront...
1,N1,0.814,LRPSa,23.702917,90.450417,Culvert,Box Culvert
2,N1,0.822,LRPSb,23.702778,90.450472,CrossRoad,Intersection with Z1101
3,N1,1.0,LRP001,23.702139,90.451972,KmPost,Km post missing
4,N1,2.0,LRP002,23.697889,90.460583,KmPost,Km post missing


In [14]:
# Write the cleaned data to a CSV file under a new name, without the index column.
# Remember to rename the file for the java program!
df_roadsinfo.to_csv(path_or_buf='Roads_InfoAboutEachLRP_new.csv', index=False)

In [13]:
# Build a smaller frame with only road, lrp, lat and lon, which is easier to use
# as input for the _roads.tcv file.
df_small_roadsinfo = df_roadsinfo.loc[:, ["road", "lrp", "lat", "lon"]]