In [1]:
import numpy as np

import utilities.constants as const
from core.TrajectoryDF import NumPandasTraj as NumTrajDF
from features.spatial_features import SpatialFeatures as spatial
from features.temporal_features import TemporalFeatures as temporal
from utilities.conversions import Conversions as con
from preprocessing.filters import Filters as filters
from utilities.DistanceCalculator import DistanceFormulaLog as calc
from features.helper_functions import Helpers as helpers

import pandas as pd
import time
# Tell NumPy to ignore floating-point "invalid value" conditions instead of
# emitting a warning for them.
np.seterr(invalid='ignore')
# Wall-clock timestamp taken right after the imports. It is not read anywhere
# in this section -- presumably used later to report total run time (confirm).
start = time.time()

In [2]:
# %%time
#
# Read the geolife sample dataset and convert it to NumPandasTraj.
# The four strings are column names of the CSV, passed positionally.
# NOTE(review): presumably in the order latitude, longitude, datetime,
# traj_id (the keyword names used for gulls below) -- confirm.
geolife = pd.read_csv('./data/geolife_sample.csv')
geolife = NumTrajDF(geolife,'lat','lon','datetime','id')
#
# Read the gulls dataset and convert it to NumPandasTraj, naming the
# source columns explicitly by keyword.
gulls = pd.read_csv('./data/gulls.csv')
gulls = NumTrajDF(gulls,
                 latitude='location-lat',
                 longitude='location-long',
                 datetime='timestamp',
                 traj_id='tag-local-identifier',
                 rest_of_columns=[])


# Read the atlantic dataset and clean it up; the conversion to
# NumPandasTraj happens further down in this cell.
atlantic = pd.read_csv('./data/atlantic.csv')
# Normalise the 'Latitude' and 'Longitude' columns.
# NOTE(review): judging by the helper's name this turns direction-suffixed
# coordinates into signed degrees -- confirm against utilities.conversions.
atlantic = con.convert_directions_to_degree_lat_lon(atlantic, 'Latitude',"Longitude")
def convert_to_datetime(row):
    """
    Build a 'YYYY-MM-DD HH:MM:00' string from a row's 'Date' and 'Time' fields.

    Parameters
    ----------
        row: mapping-like
            One record (e.g. the Series handed over by
            DataFrame.apply(axis=1)). row['Date'] is read as an 8-character
            YYYYMMDD value and row['Time'] as an HHMM number whose leading
            zeros may be missing (0, 605, 1800, ...).

    Returns
    -------
        str:
            The combined date and time; the seconds are always '00'.

    Raises
    ------
        ValueError:
            If row['Time'] cannot be converted to an integer (e.g. NaN).
    """
    date_digits = str(row['Date'])
    this_date = '{}-{}-{}'.format(date_digits[0:4], date_digits[4:6], date_digits[6:])

    # Derive hour and minute from the same integer. Slicing str(Time) for the
    # minutes breaks as soon as Time arrives as a float (str(1200.0)[-2:] is
    # '.0'), whereas divmod gives the same result as before for integers.
    hours, minutes = divmod(int(row['Time']), 100)
    this_time = '{:02d}:{:02d}:00'.format(hours, minutes)

    return '{} {}'.format(this_date, this_time)
# Build the 'DateTime' column row by row (axis=1) with convert_to_datetime.
atlantic['DateTime'] = atlantic.apply(convert_to_datetime, axis=1)
# Convert to NumPandasTraj, pointing it at the cleaned coordinate columns,
# the freshly built 'DateTime' column and 'ID' as the trajectory id.
atlantic = NumTrajDF(atlantic,
                         latitude='Latitude',
                         longitude='Longitude',
                         datetime='DateTime',
                         traj_id='ID',
                         rest_of_columns=[])
# Drop the raw 'Time' column.
atlantic = atlantic.drop(columns='Time')
# Add a date column via the temporal-features helper.
# NOTE(review): the saved output shows 'Date' values in YYYY-MM-DD form, so
# this presumably replaces the raw 'Date' column -- confirm.
atlantic = temporal.create_date_column(atlantic)
# Display the first rows of the converted frame.
atlantic.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Date,Event,Status,lat,lon,Maximum Wind,Minimum Pressure,Low Wind NE,Low Wind SE,Low Wind SW,Low Wind NW,Moderate Wind NE,Moderate Wind SE,Moderate Wind SW,Moderate Wind NW,High Wind NE,High Wind SE,High Wind SW,High Wind NW
DateTime,traj_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1851-07-05 12:00:00,AL021851,UNNAMED,1851-07-05,,HU,22.2,-97.6,80,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1851-07-10 12:00:00,AL031851,UNNAMED,1851-07-10,,TS,12.0,-60.0,50,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1853-08-05 12:00:00,AL011853,UNNAMED,1853-08-05,,TS,32.5,-69.0,50,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1853-08-10 12:00:00,AL021853,UNNAMED,1853-08-10,,TS,12.0,-60.0,40,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1853-09-21 12:00:00,AL051853,UNNAMED,1853-09-21,,TS,20.0,-95.0,50,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [3]:
# %%time

# Now, let's create a bounding box of 100 km radius around the
# coordinates (39, 116). The radius is passed as 100000, i.e. the
# argument is presumably in metres -- confirm.

bbox = filters.get_bounding_box_by_radius(39, 116, 100000)
# Display the result. In the saved output it is a 4-tuple that reads as
# (min lat, min lon, max lat, max lon).
bbox

(38.100678394081264, 114.84275815636957, 39.89932160591873, 117.15724184363044)

In [4]:
# Now, let's filter the trajectory data based on date. Only the case
# where both start_date and end_date are supplied is exercised here.

small = filters.filter_by_date(atlantic, start_date='1851-06-25',end_date='2011-01-01')
# Compare the row counts before and after filtering.
print(f"Length of atlantic: {len(atlantic)}")
print(f"Length of small: {len(small)}")

Length of atlantic: 49105
Length of small: 46909


In [5]:
# Now, let's filter the trajectory data based on datetime. Only the case
# where both start_dateTime and end_dateTime are supplied is exercised here.

tiny = filters.filter_by_datetime(atlantic, start_dateTime='1859-09-21 23:00:00' ,
                                  end_dateTime='2011-09-21 23:00:00')
# Compare the row counts before and after filtering.
print(f"Length of atlantic: {len(atlantic)}")
print(f"Length of tiny: {len(tiny)}")
# NOTE(review): this displays the unfiltered `atlantic` frame, not `tiny`.
atlantic.head()

Length of atlantic: 49105
Length of tiny: 46536


Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Date,Event,Status,lat,lon,Maximum Wind,Minimum Pressure,Low Wind NE,Low Wind SE,Low Wind SW,Low Wind NW,Moderate Wind NE,Moderate Wind SE,Moderate Wind SW,Moderate Wind NW,High Wind NE,High Wind SE,High Wind SW,High Wind NW
DateTime,traj_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1851-07-05 12:00:00,AL021851,UNNAMED,1851-07-05,,HU,22.2,-97.6,80,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1851-07-10 12:00:00,AL031851,UNNAMED,1851-07-10,,TS,12.0,-60.0,50,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1853-08-05 12:00:00,AL011853,UNNAMED,1853-08-05,,TS,32.5,-69.0,50,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1853-08-10 12:00:00,AL021853,UNNAMED,1853-08-10,,TS,12.0,-60.0,40,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1853-09-21 12:00:00,AL051853,UNNAMED,1853-09-21,,TS,20.0,-95.0,50,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [6]:
# Now, let's filter the dataframe based on maximum speed.

# Add the speed feature first. The saved output shows that this call adds
# both a Distance_prev_to_curr and a Speed_prev_to_curr column.
# NOTE: `atlantic` is rebound here, so later cells see the extra columns.
atlantic = spatial.create_speed_from_prev_column(atlantic)
# Filter with a maximum speed of 10 (unit presumably m/s -- confirm).
max_speed_filt_df = filters.filter_by_max_speed(atlantic, 10)
print(f"Length of atlantic: {len(atlantic)}")
print(f"Length of speed_filt_df: {len(max_speed_filt_df)}")
# Display `atlantic` (not the filtered frame) to show the new columns.
atlantic.head()

Length of atlantic: 49105
Length of speed_filt_df: 41356


Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Date,Event,Status,lat,lon,Maximum Wind,Minimum Pressure,Low Wind NE,Low Wind SE,...,Moderate Wind NE,Moderate Wind SE,Moderate Wind SW,Moderate Wind NW,High Wind NE,High Wind SE,High Wind SW,High Wind NW,Distance_prev_to_curr,Speed_prev_to_curr
DateTime,traj_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1851-07-05 12:00:00,AL021851,UNNAMED,1851-07-05,,HU,22.2,-97.6,80,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,,
1851-07-10 12:00:00,AL031851,UNNAMED,1851-07-10,,TS,12.0,-60.0,50,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,,
1853-08-05 12:00:00,AL011853,UNNAMED,1853-08-05,,TS,32.5,-69.0,50,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,,
1853-08-10 12:00:00,AL021853,UNNAMED,1853-08-10,,TS,12.0,-60.0,40,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,,
1853-09-21 12:00:00,AL051853,UNNAMED,1853-09-21,,TS,20.0,-95.0,50,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,,


In [7]:
# Now, let's filter the dataframe based on minimum speed.
# The input is the frame already filtered with a maximum speed of 10 in
# the previous cell, so the two filters are applied on top of each other.

min_speed_filt = filters.filter_by_min_speed(max_speed_filt_df, 5)
# "speed_filt_df" in the label refers to max_speed_filt_df.
print(f"Length of speed_filt_df: {len(max_speed_filt_df)}")
print(f"Length of min_speed_filt: {len(min_speed_filt)}")

Length of speed_filt_df: 41356
Length of min_speed_filt: 20485


In [8]:
# Now, let's filter the dataframe based on minimum distance
# between consecutive points. The threshold passed is 125000
# (presumably metres, i.e. 125 km -- confirm).

min_distance_filt = filters.filter_by_min_consecutive_distance(atlantic,
                                                               125000)
# Compare the row counts before and after filtering.
print(f"length of atlantic: {len(atlantic)}")
print(f"length of min_distance_filt: {len(min_distance_filt)}")

length of atlantic: 49105
length of min_distance_filt: 20584


In [9]:
# Now, let's filter the dataframe based on maximum distance
# between consecutive points. The input is min_distance_filt from the
# previous cell, so this filter is applied on top of the minimum-distance
# one. The threshold passed is 500000 (presumably metres -- confirm).

max_distance_filt = filters.filter_by_max_consecutive_distance(min_distance_filt,
                                                               500000)
# Compare the row counts before and after filtering.
print(f"length of min_distance_filt: {len(min_distance_filt)}")
print(f"length of max_distance_filt: {len(max_distance_filt)}")

length of min_distance_filt: 20584
length of max_distance_filt: 20412


In [10]:
# Now, let's filter the data based on maximum speed as
# well as maximum distance between 2 consecutive points.
# Both thresholds are passed to a single library call.

max_dist_speed_filt = \
    filters.filter_by_max_distance_and_speed(atlantic, max_distance=300000, max_speed=5)
print(f"length of atlantic: {len(atlantic)}")
print(f"length of max_dist_speed_filt: {len(max_dist_speed_filt)}")
# Sanity check: print the largest remaining distance and speed so they can
# be compared against the thresholds passed above.
print(max_dist_speed_filt['Distance_prev_to_curr'].max())
print(max_dist_speed_filt['Speed_prev_to_curr'].max())

length of atlantic: 49105
length of max_dist_speed_filt: 20871
107991.62581248698
4.999612306133656


In [11]:

# Now, let's filter the data based on minimum speed as
# well as minimum distance between 2 consecutive points.
# Both thresholds are passed to a single library call.

min_dist_speed_filt = \
    filters.filter_by_min_distance_and_speed(atlantic, min_distance=150000, min_speed=10)
print(f"length of atlantic: {len(atlantic)}")
# The label names the frame that is actually measured (min_dist_speed_filt).
print(f"length of min_dist_speed_filt: {len(min_dist_speed_filt)}")
# Sanity check: print the smallest remaining distance and speed so they can
# be compared against the thresholds passed above.
print(min_dist_speed_filt['Distance_prev_to_curr'].min())
print(min_dist_speed_filt['Speed_prev_to_curr'].min())

length of atlantic: 49105
length of max_dist_speed_filt: 5773
150474.93590679907
10.000128930993524


In [12]:
# Now, let's remove the outliers based on the
# distance between 2 consecutive points.

# Add the distance/speed feature columns to geolife first.
# NOTE: `geolife` is rebound here, so later cells see the extra columns.
geolife = spatial.create_speed_from_prev_column(geolife)
# No threshold is passed; the outlier criterion is decided inside the
# library call.
outlier_df = filters.filter_outliers_by_consecutive_distance(geolife)
print(f"length of geolife: {len(geolife)}")
# Despite its name, outlier_df holds the rows that remain after filtering;
# the number of removed outliers is the difference printed last.
print(f"length of outlier_df: {len(outlier_df)}")
print(f"Number of outliers: {len(geolife) - len(outlier_df)}")

length of geolife: 217653
length of outlier_df: 212126
Number of outliers: 5527


In [13]:
# Now, let's remove the outliers based on the
# speed between 2 consecutive points.
# No threshold is passed; the outlier criterion is decided inside the
# library call.

odf_two = filters.filter_outliers_by_consecutive_speed(geolife)
print(f"length of geolife: {len(geolife)}")
# The label names the frame that is actually measured (odf_two), so it is
# not mistaken for outlier_df from the distance-based cell.
print(f"length of odf_two: {len(odf_two)}")
print(f"Number of outliers: {len(geolife) - len(odf_two)}")

length of geolife: 217653
length of outlier_df: 195280
Number of outliers: 22373


Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,Distance_prev_to_curr,Speed_prev_to_curr
DateTime,traj_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-10-23 16:53:05,1,39.984094,116.319236,,
2008-10-23 16:53:06,1,39.984198,116.319322,13.690153,13.690153
2008-10-23 05:53:11,1,39.984224,116.319402,7.403788,0.000158
2008-10-23 05:53:16,1,39.984211,116.319389,1.821083,0.364217
2008-10-23 05:53:21,1,39.984217,116.319422,2.889671,0.577934
