1. Load Dependencies

In [1]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from math import sqrt
import random
import matplotlib.pyplot as plt


2. Load Data

In [15]:
%%time
# Training split, downloaded from:
# https://drive.google.com/file/d/1-RXYMh61x9asaVU_w5tDHtq0CMmp_Rdn/view?usp=sharing
# Raw string literals keep the Windows backslashes literal; in a normal string
# '\M' and '\T' are invalid escape sequences (SyntaxWarning on Python 3.12+).
file_path_train = r'D:\MiniProject_P3_fitrec\TrainData.csv'
# Test split, downloaded from:
# https://drive.google.com/file/d/1--znLUhuYqoMMqQo1Gnf6WafrzzniIFL/view?usp=sharing
file_path_test = r'D:\MiniProject_P3_fitrec\TestData.csv'

# Read both splits into memory (the recorded run took about a minute)
train_df = pd.read_csv(file_path_train)
test_df = pd.read_csv(file_path_test)

# Quick sanity check: frame dimensions and the first record
print(train_df.shape)
display(train_df.head(1))

(50253, 28)


Unnamed: 0,id,userId,gender,sport,duration,calories,distance,avg_heart_rate,longitude,latitude,...,validate,avg_alti,change_alti,max_alti,min_alti,diff_alti,avg_speed,Cluster,Route,Route_id
0,611012078,2568526,male,run,3158,830.588,10.02,154.914,"[7.099486151710153, 7.0994688011705875, 7.0993...","[43.68301374837756, 43.683006623759866, 43.682...",...,True,87.8552,756.8,139.4,76.0,63.4,11.574343,1,"('run', 1)",9


Wall time: 1min 6s


3. Find workout records that return to the start point at the end

In [16]:
# Distance between two points described by latitude, longitude and altitude
def geodis(lat_0, lon_0, alt_0, lat_1, lon_1, alt_1):
    '''
    Combine the geodesic surface distance with the altitude difference.

    lat_0, lon_0, alt_0: coordinates of the first point
    lat_1, lon_1, alt_1: coordinates of the second point

    The surface distance is taken from geopy's geodesic in kilometres; each
    altitude is divided by 1000 before differencing (presumably metres -> km,
    TODO confirm against the dataset). The two legs are combined with the
    Pythagorean formula and the result is returned.
    '''
    surface_km = geodesic((lat_0, lon_0), (lat_1, lon_1)).km
    climb_km = alt_0/1000 - alt_1/1000
    return sqrt(surface_km**2 + climb_km**2)

# Define function to check if a workout record has returned to start point at the end


def isback(df_row, num_to_check):
    '''
    Flag whether a workout route ends close to where it started.

    df_row: a row of dataframe; reads the attributes sport, latitude, longitude
        and altitude (the last three are stringified lists)
    num_to_check: number of points to check at each end of the route

    1. We take up to num_to_check points at the beginning of the route and the
    same number at the end. The window is capped at half the route length so
    that the two windows never overlap (overlapping windows would compare a
    point with itself and always report a return).

    2. We compute the distance (via geodis) between every head point and every
    tail point.

    3. As soon as one distance is below the threshold (0.02 for 'run', 0.04 for
    any other sport, in the unit returned by geodis) we return 1; otherwise 0.
    Routes too short to form two non-overlapping windows return 0.
    '''

    thres = 0.02 if df_row.sport == 'run' else 0.04

    # Parse every column once instead of once per slice.
    # NOTE(review): eval() executes whatever text is stored in the cell; this is
    # only acceptable for trusted CSVs. Consider ast.literal_eval for untrusted data.
    lat = eval(df_row.latitude)
    lon = eval(df_row.longitude)
    alt = eval(df_row.altitude)

    # Never index past the shortest sequence and never let head and tail overlap
    usable = min(len(lat), len(lon), len(alt))
    window = min(num_to_check, usable // 2)
    if window <= 0:
        return 0

    head = list(zip(lat[:window], lon[:window], alt[:window]))
    tail = list(zip(lat[-window:], lon[-window:], alt[-window:]))

    # Stop at the first head/tail pair that is close enough
    for lat_0, lon_0, alt_0 in head:
        for lat_1, lon_1, alt_1 in tail:
            if geodis(lat_0, lon_0, alt_0, lat_1, lon_1, alt_1) < thres:
                return 1
    return 0

In [17]:
%%time
# Label every workout in both splits, checking 5 points at each end of the route
train_df['isback'] = train_df.apply(isback, axis=1, args=(5,))
test_df['isback'] = test_df.apply(isback, axis=1, args=(5,))

# Class balance of the new flag in the training split
print(train_df['isback'].value_counts())

0    29470
1    20783
Name: isback, dtype: int64
Wall time: 9min 42s


4. Extend workout routes

In [18]:
# Keep only the workouts flagged isback == 1 (route returns to its starting point);
# .copy() gives independent frames so later column assignments do not touch the originals

adjust_train_df = train_df.loc[train_df['isback'] == 1].copy()
adjust_test_df = test_df.loc[test_df['isback'] == 1].copy()

In [19]:
# Define function to create new sequential features

def update_sequence(df_row, max_extend_point, min_extend_point=30, seq_len=499):
    '''
    Extend a workout route by replaying a noisy copy of its first points.

    df_row: a row of dataframe; reads the stringified-list attributes latitude,
        longitude, altitude, heart_rate, derived_speed, derived_distance and
        timestamp
    max_extend_point: exclusive upper bound for the number of points to extend
    min_extend_point: inclusive lower bound for the number of points to extend
    seq_len: number of leading points of every sequence that are used; the
        route must provide at least this many points

    1. We randomly draw the number of points to extend:
    ext_len

    2. We take that many points from the start of the workout:
    lat_head, lon_head, alt_head, distance_head

    3. We add Gaussian noise to the latitude and longitude of those points; the
    standard deviation of each is |mean of that coordinate| / 1e8:
    lat_head_noise, lon_head_noise

    4. We re-calculate the head distances from the noisy coordinates (the last
    head distance is kept as it was)

    5. To keep the overall length unchanged we randomly remove ext_len interior
    points from the first seq_len points of the original sequences (the first
    and last of those points are never removed)

    6. For every removed point, its distance is added to the closest surviving
    previous point and that point's speed is re-calculated as distance divided
    by (timestamp difference / 3600)

    7. Altitude and distance are returned as reduced original + head; the last
    original distance is replaced by the distance from the last original point
    to the first noisy head point

    8. Heart rate and speed are returned as the reduced original only

    Returns a tuple:
    (altitude as str(list), distance as str(list), heart rate as str(list),
    speed as str(list), complete_idx, total distance)
    where complete_idx is the index of the last original point in the reduced
    sequences.

    Raises ValueError when the route has fewer than seq_len points.
    '''

    # Parse every column exactly once.
    # NOTE(review): eval() executes whatever text is stored in the cell; this is
    # only acceptable for trusted CSVs. Consider ast.literal_eval for untrusted data.
    lat_seq = eval(df_row.latitude)
    lon_seq = eval(df_row.longitude)
    alt_seq = eval(df_row.altitude)
    heart_seq = eval(df_row.heart_rate)
    speed_seq = eval(df_row.derived_speed)
    distance_seq = eval(df_row.derived_distance)
    timestamp_seq = eval(df_row.timestamp)

    # Randomly draw the number of points to extend
    ext_len = random.randrange(min_extend_point, max_extend_point)

    # Extract that many points from the start of the workout sequence
    lat_head = lat_seq[:ext_len]
    lon_head = lon_seq[:ext_len]
    alt_head = alt_seq[:ext_len]
    distance_head = distance_seq[:ext_len]

    # Noise scale per coordinate (the longitude scale is derived from the
    # longitudes, not from the latitudes)
    noise_scale_lat = abs(np.mean(lat_head)) / 1e8
    noise_scale_lon = abs(np.mean(lon_head)) / 1e8

    # Add Gaussian noise to latitude and longitude of the extended route
    lat_head_noise = np.add(lat_head, np.random.normal(0, noise_scale_lat, ext_len))
    lon_head_noise = np.add(lon_head, np.random.normal(0, noise_scale_lon, ext_len))

    # Update the head distances based on the noisy coordinates; the final entry
    # has no successor inside the head, so its original value is kept
    dis_tail = distance_head[-1]
    distance_head = np.array([
        geodis(lat_head_noise[idx], lon_head_noise[idx], alt_head[idx],
               lat_head_noise[idx + 1], lon_head_noise[idx + 1], alt_head[idx + 1])
        for idx in range(len(distance_head) - 1)
    ])
    distance_head = np.append(distance_head, dis_tail)

    # Sample indices to drop from the original route;
    # we don't want to touch the head and tail point (0 and seq_len - 1)
    drop_indices = random.sample(range(1, seq_len - 1), ext_len)

    # One row per point, one named column per feature
    feature_names = ['latitude', 'longitude', 'altitude', 'heart_rate',
                     'derived_speed', 'derived_distance', 'timestamp']
    feature_seqs = [lat_seq, lon_seq, alt_seq, heart_seq,
                    speed_seq, distance_seq, timestamp_seq]
    tmp_df = pd.DataFrame(data=[seq[:seq_len] for seq in feature_seqs],
                          index=feature_names).T

    # Without seq_len rows the neighbour search below could run forever
    if len(tmp_df) < seq_len:
        raise ValueError(
            'workout has %d points, at least %d are required' % (len(tmp_df), seq_len))

    # Adjust distance and speed due to dropped points
    for idx in drop_indices:

        # Closest surviving row before idx (earlier neighbours may be gone already)
        prev_idx = idx - 1
        while prev_idx not in tmp_df.index:
            prev_idx -= 1

        # Closest surviving row after idx
        next_idx = idx + 1
        while next_idx not in tmp_df.index:
            next_idx += 1

        # idx point will be deleted, we add its distance to the previous point
        tmp_df.loc[prev_idx, 'derived_distance'] += tmp_df.loc[idx, 'derived_distance']

        # Re-calculate speed of the previous point from its new distance
        # (timestamps presumably in seconds, hence / 3600 -- TODO confirm)
        elapsed_hours = (tmp_df.loc[next_idx, 'timestamp'] -
                         tmp_df.loc[prev_idx, 'timestamp']) / 3600
        tmp_df.loc[prev_idx, 'derived_speed'] = \
            tmp_df.loc[prev_idx, 'derived_distance'] / elapsed_hours

        # Drop row at idx point
        tmp_df = tmp_df.drop(index=idx)

    # Get reduced feature arrays
    tmp_lat = tmp_df['latitude'].to_numpy()
    tmp_lon = tmp_df['longitude'].to_numpy()
    tmp_alt = tmp_df['altitude'].to_numpy()
    tmp_heart = tmp_df['heart_rate'].to_numpy()
    tmp_speed = tmp_df['derived_speed'].to_numpy()
    # Explicit copy: the array is modified below and to_numpy() may hand back a
    # read-only view when pandas Copy-on-Write is active
    tmp_distance = tmp_df['derived_distance'].to_numpy(copy=True)

    # Store idx where original workout completes
    complete_idx = tmp_lat.shape[0] - 1

    # Distance between last point of original route and first point of extended route
    tmp_distance[-1] = geodis(tmp_lat[-1], tmp_lon[-1], tmp_alt[-1],
                              lat_head_noise[0], lon_head_noise[0], alt_head[0])

    # Extend altitude sequence
    tmp_alt = np.append(tmp_alt, alt_head)

    # Extend distance sequence
    tmp_distance = np.append(tmp_distance, distance_head)

    # Total distance
    tmp_distance_sum = np.sum(tmp_distance)

    # .tolist() yields plain Python numbers, so the strings stay '[1.0, 2.0, ...]'
    # even on NumPy 2, where str(list(array)) would embed 'np.float64(...)'
    return (str(tmp_alt.tolist()), str(tmp_distance.tolist()), str(tmp_heart.tolist()),
            str(tmp_speed.tolist()), complete_idx, tmp_distance_sum)

In [None]:
%%time

# Build the extended sequences for every returning workout (up to 100 extra points);
# each row yields a 6-tuple, which is transposed into one tuple per output column
train_sequences = adjust_train_df.apply(update_sequence, axis=1, args=(100,))
(altitude_col, distance_col, heart_rate_col,
 speed_col, complete_idx_col, distance_sum_col) = zip(*train_sequences)

adjust_train_df['altitude_adjusted'] = altitude_col
adjust_train_df['distance_adjusted'] = distance_col
adjust_train_df['heart_rate_adjusted'] = heart_rate_col
adjust_train_df['speed_adjusted'] = speed_col
adjust_train_df['complete_idx'] = complete_idx_col
adjust_train_df['distance_adjusted_sum'] = distance_sum_col