# Final Clean

In [56]:
import pandas as pd
import datetime
import numpy as np
import itertools
from geopy.distance import vincenty
from ipykernel import kernelapp as app
%load_ext Cython
import numba
import gc
from haversine import haversine
global df

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [57]:
df = pd.read_hdf("cleaning_store.h5", key="table_name", where='Week_Day == 6')

In [58]:
categorical_columns = df[['Journey_Pattern_ID','Vehicle_Journey_ID', 'Bus_Operator',
                          'Block_ID', 'Vehicle_ID', 'Stop_ID']].columns

In [59]:
# Convert data type to category for these columns. categorical columns have better performance than object dtype
for column in categorical_columns:
    df[column] = df[column].astype('category')

In [60]:
false_categorical_columns = df[['Week_Day', 'At_Stop' ]].columns

In [61]:
# These columns are converted to int32 although they are categorical in order to have compatibility with hdf5
for column in false_categorical_columns:
    df[column] = df[column].astype('int32')

In [62]:
gc.collect()

843

In [63]:
df.reset_index(drop=True,inplace=True)

In [64]:
df.head()

Unnamed: 0,Timestamp,Journey_Pattern_ID,Time_Frame,Vehicle_Journey_ID,Bus_Operator,Longitude,Latitude,Delay_seconds,Block_ID,Vehicle_ID,Stop_ID,At_Stop,Week_Day
0,2012-11-11 06:35:04,410001,2012-11-11,13421,SL,-6.25546,53.349075,-2,41001,33425,288,0,6
1,2012-11-11 06:35:20,410001,2012-11-11,13421,SL,-6.25516,53.349129,-2,41001,33425,288,0,6
2,2012-11-11 06:35:41,410001,2012-11-11,13421,SL,-6.253968,53.350094,-19,41001,33425,288,0,6
3,2012-11-11 06:36:03,410001,2012-11-11,13421,SL,-6.25415,53.350376,-19,41001,33425,1171,1,6
4,2012-11-11 06:36:23,410001,2012-11-11,13421,SL,-6.25415,53.350376,10,41001,33425,1171,1,6


In [65]:
df.dtypes

Timestamp             datetime64[ns]
Journey_Pattern_ID          category
Time_Frame            datetime64[ns]
Vehicle_Journey_ID          category
Bus_Operator                category
Longitude                    float64
Latitude                     float64
Delay_seconds                  int32
Block_ID                    category
Vehicle_ID                  category
Stop_ID                     category
At_Stop                        int32
Week_Day                       int32
dtype: object

#### Note About The Following Cell:
This is the way we are sorting the data. First the Timeframe is the most important since it holds a unique Vehicle Journey ID (or it should) for every journey each day. 
Ideally we would then sort by VehicleJourneyId but the reality is that the data is messy. In order to avoid two buses at opposite sides of the city causing issues with the dataset we will now sort by the vehicle ID.
Next the vehicle journey ID makes sense since we can then start to sort the entire dataset into individual journeys along a certain route.
Lastly timestamp is obvious.

We'll use this cell several times to keep the dataframe sorted and the index correct, as many of the loops in this notebook require this.

You should deduce from this that there will be several cases of a VehicleJourneyId being repeated. cleaning up this is part of the challenge.

## Remove General Noise

In [66]:
""" Remove every VehicleJourneyId which is equal to or below 5 rows in length. Realistically any journey (even the short ones)
should be at least 5 rows of data in length, anything else is just noise. """


df = df.groupby(['Time_Frame', 'Vehicle_ID', 'Vehicle_Journey_ID'], as_index=False, group_keys=False, sort=False).filter(lambda x: len(x) > 5)


## Remove Extra JourneyPatternId's From VehicleJourneyId's

In [67]:
# Create group object to work with 
gb = df.groupby(['Time_Frame', 'Vehicle_ID', 'Vehicle_Journey_ID'], as_index=False, group_keys=False, sort=False)

In [68]:
@numba.jit(nogil=True)
def delete_outlier_journeypatternid(group):
    """ Takes a pandas group object and iterates removing the least occuring JourneyPatterId in each.
    If there are more than two or the occurances of the two are equal, it ignores it. """
    
    grouped_values = group["Journey_Pattern_ID"].value_counts()
        
    # If there's two Journey Pattern ID's
    if len(grouped_values) == 2:
                
        # If the two journey pattern ID's occupy the same space then do nothing (likely it's a 'noise' journey)
        if grouped_values[0] != grouped_values[1]:
            
            real_id = grouped_values.index.tolist()[0] 
            group = group[group.Journey_Pattern_ID == real_id]
            
    return group

In [69]:
# Apply the mapping function to the dataset
%timeit df = gb.apply(delete_outlier_journeypatternid)
# 120 secs
# to 79 seconds with jit numba


1 loop, best of 3: 39.8 s per loop


In [70]:
# Create group object to work with 
gb = df.groupby(['Time_Frame', 'Vehicle_ID', 'Vehicle_Journey_ID'], as_index=False, group_keys=False, sort=False)

In [71]:
# Delete outliers (journey's with 3 id's and other noise inc. nulls)
df = gb.filter(lambda x: len(x["Journey_Pattern_ID"].unique()) == 1)

# Clean blockID

In [72]:
gb = df.groupby(['Time_Frame', 'Vehicle_ID', 'Vehicle_Journey_ID'], as_index=False, group_keys=False, sort=False)

In [73]:
@numba.jit(nogil=True)
def delete_outlier_block_id(group):
    """ Takes a pandas group object and iterates removing the least occuring Block ID in each.
    If there are more than two or the occurances of the two are equal, it ignores it. """
    
    # If there's two Journey Pattern ID's
    if len(group["Block_ID"].value_counts()) == 2:
        real_id = group["Block_ID"].value_counts().index.tolist()[0]
        group = group[group.Block_ID == real_id]
           
    return group

In [74]:
# Apply the mapping function to the dataset
%timeit df = gb.apply(delete_outlier_block_id)

# For debugging
df.shape

1 loop, best of 3: 39.9 s per loop


(2587877, 13)

## Remove Vehicle Journey ID's With Two Occurances Each Day

These can either be...
* Two buses completing the same route together with a stopover.
* Incomplete journey's.
* General noise in the data.

In any case they occupy less than 1% of the data and change week to week, so they can be dropped without overall loss of data integrity.

In [75]:
# Filter out these outliers

gb = df.groupby(["Time_Frame", "Vehicle_Journey_ID"], as_index=False, group_keys=False, sort=False)

df = gb.filter(lambda x: len(x["Vehicle_ID"].unique()) == 1)

# Clean Up Start Of Journeys

In [76]:
# Organise the Data
df.sort_values(['Time_Frame', 'Vehicle_ID', 'Vehicle_Journey_ID', 'Timestamp'], ascending=True, inplace=True)

# Clean up index
df.reset_index(drop=True,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [77]:
df.dtypes

Timestamp             datetime64[ns]
Journey_Pattern_ID          category
Time_Frame            datetime64[ns]
Vehicle_Journey_ID          category
Bus_Operator                category
Longitude                    float64
Latitude                     float64
Delay_seconds                  int32
Block_ID                    category
Vehicle_ID                  category
Stop_ID                     category
At_Stop                        int32
Week_Day                       int32
dtype: object

In [78]:
df['Vehicle_Journey_ID'] = df['Vehicle_Journey_ID'].astype('object')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [79]:
@numba.jit(nogil=True)
def GPS_clean(df):
    
    """ If the GPS coordinates remain the same for the first few rows of data, delete them. """
    
    last_bus_id = df.loc[0, "Vehicle_Journey_ID"]
    last_lat = df.loc[0, "Latitude"]
    last_long = df.loc[0, "Longitude"]
    new_start = True
    
    for row in itertools.islice(df.itertuples(),1,None):

        # For every iteration
        current_bus_id = row[4]
        current_lat = row[7]
        current_long = row[6]

        # If it's a different vehicle journey id
        if last_bus_id != current_bus_id:
            last_bus_id = row[4]
            last_lat = row[7]
            last_long = row[6]
            new_start = True
            continue

        # If it's the same journey
        if new_start:
            if current_lat == last_lat:
                if current_long == last_long:
                    # Flag
                    df.set_value(row[0] - 1, "Vehicle_Journey_ID", 0)
                    last_bus_id = row[4]
                    last_lat = row[7]
                    last_long = row[6]
                    continue

        # If it's the same journey but it's moved
        if last_bus_id == current_bus_id:
            if current_lat != last_lat or current_long != last_long:
                new_start = False

In [80]:
%timeit GPS_clean(df)

1 loop, best of 3: 32.2 s per loop


In [81]:
#iterrows, original loop: 400 seconds
#numba just in time compiler +  itertuples + skipping first row: 25 seconds

In [82]:
# Filter Out Rows Flagged
df = df[df.Vehicle_Journey_ID != 0]

In [83]:
df['Vehicle_Journey_ID'] = df['Vehicle_Journey_ID'].astype('object')

In [84]:
# Organise the Data
df.sort_values(['Time_Frame', 'Vehicle_ID', 'Vehicle_Journey_ID', 'Timestamp'], ascending=True, inplace=True)
# Clean up index
df.reset_index(drop=True,inplace=True)

In [85]:
df.shape

(2482231, 13)

## Add Distance Feature

Before dropping duplicate StopId's we must first measure the distance on each route. This requires that we use all rows of data.

This will also make any VehicleJourneyId's which miss a stop along their journey useful data in the model.

In [86]:
@numba.jit(nogil=True)
def get_distance(lat1, long1, lat2, long2):
    """ Get distance between two geo coordinates in km """
    
    stop1 = (lat1, long1)
    stop2 = (lat2, long2)
    
    return haversine(stop1,stop2)

In [87]:
df.dtypes

Timestamp             datetime64[ns]
Journey_Pattern_ID          category
Time_Frame            datetime64[ns]
Vehicle_Journey_ID            object
Bus_Operator                category
Longitude                    float64
Latitude                     float64
Delay_seconds                  int32
Block_ID                    category
Vehicle_ID                  category
Stop_ID                     category
At_Stop                        int32
Week_Day                       int32
dtype: object

In [88]:
@numba.jit(nogil=True)
def add_distance_todf(df):

    # List to hold feature
    df["Distance"] = np.nan
    
    # Set up values on first iteration
    last_id = df.loc[0, "Vehicle_Journey_ID"]
    last_lat = df.loc[0, "Latitude"]
    last_long = df.loc[0, "Longitude"]
    last_distance = 0
    
    #set very first distance
    df.set_value(0, "Distance", 0)

    for row in itertools.islice(df.itertuples(),1,None):
        
        current_id = row[4]
        current_lat = row[7]
        current_long = row[6]
        current_distance = get_distance(current_lat, current_long, last_lat, last_long)

        # If it's a new Journey ID
        if current_id != last_id:
            last_lat = row[7]
            last_long = row[6]
            last_id = row[4]
            last_distance = 0

            df.set_value(row[0], "Distance", 0)
            continue

        # If it's not a new Journey ID
        current_distance = get_distance(current_lat, current_long, last_lat, last_long)
        
        last_distance += current_distance
        
        df.set_value(row[0], "Distance", last_distance)
        last_lat = row[7]
        last_long = row[6]
        last_id = row[4]
        

In [89]:
%timeit add_distance_todf(df)

1 loop, best of 3: 1min 59s per loop


In [90]:
#original code: > 5 minutes, I stopped the loop and proceeded optimizing
# jit numba + itertuples + skip first row: 246 seconds

In [91]:
df.head()

Unnamed: 0,Timestamp,Journey_Pattern_ID,Time_Frame,Vehicle_Journey_ID,Bus_Operator,Longitude,Latitude,Delay_seconds,Block_ID,Vehicle_ID,Stop_ID,At_Stop,Week_Day,Distance
0,2012-11-11 10:55:24,2380001,2012-11-11,14326,HN,-6.3782,53.419167,20,238002,24587,7073,1,6,0.0
1,2012-11-11 10:55:44,2380001,2012-11-11,14326,HN,-6.378497,53.419022,39,238002,24587,7073,0,6,0.025442
2,2012-11-11 10:55:46,2380001,2012-11-11,14326,HN,-6.378459,53.419052,39,238002,24587,7073,0,6,0.029622
3,2012-11-11 10:56:23,2380001,2012-11-11,14326,HN,-6.381803,53.418823,64,238002,24587,7073,0,6,0.252679
4,2012-11-11 10:56:43,2380001,2012-11-11,14326,HN,-6.384019,53.419331,64,238002,24587,7073,0,6,0.410018


## Remove Stop ID Duplicates

Now we can filter the dataframe a little by removing dupicate stopID's. Although it would be better to train on every row it might be too much to compute. 

This will also help us later in making the database stop distances. There is not enough information to map the exact distance to each stop, so we will have to take some kind of average of the AtStop == 0 columns and subtract a little to get a rough distance to each stop in a Journey Pattern ID.

Because this section keeps the first occurance of each StopId, this should be a very accurate way to estimate the distance to each stop on each route.

In [92]:
""" This will delete all duplicates AFTER the first example of each is found. So when the bus arrives at the stop, 
all subsequent rows at that stop will be deleted. """

df.drop_duplicates(["Time_Frame", 'Vehicle_ID', "Vehicle_Journey_ID", "Stop_ID"],inplace=True)

In [93]:
# Organise the Data
df.sort_values(["Time_Frame", 'Vehicle_ID', "Vehicle_Journey_ID", 'Timestamp'], ascending=True, inplace=True)

# Clean up index
df.reset_index(drop=True,inplace=True)

In [94]:
# Check size of dataframe
df.shape

(747672, 14)

In [95]:
df.dtypes

Timestamp             datetime64[ns]
Journey_Pattern_ID          category
Time_Frame            datetime64[ns]
Vehicle_Journey_ID            object
Bus_Operator                category
Longitude                    float64
Latitude                     float64
Delay_seconds                  int32
Block_ID                    category
Vehicle_ID                  category
Stop_ID                     category
At_Stop                        int32
Week_Day                       int32
Distance                     float64
dtype: object

## Add Time Taken Feature

In [96]:
@numba.jit(nogil=True)
def time_taken_feature(df):

    # List to hold feature
    df["TravelTime"] = np.nan
    
    
    # Set up values on first iteration
    last_id = df.loc[0, "Vehicle_Journey_ID"]
    start_time = df.loc[0, "Timestamp"]
    
    #set very first distance
    df.set_value(0, "TravelTime", 0.0)

    for row in itertools.islice(df.itertuples(),1,None):

        current_time = row[1]
        current_id = row[4]

        # If it's a new Journey ID
        if current_id != last_id:
            last_id = row[4]        
            start_time = row[1]        
            df.set_value(row[0], "TravelTime", 0.0)
            continue
        
        df.set_value(row[0], "TravelTime", abs((current_time - start_time).total_seconds()) )  
        last_time = row[1]
        last_id = row[4]

In [97]:
%timeit time_taken_feature(df)

1 loop, best of 3: 26.7 s per loop


## Add Time Category Feature

In [98]:
df["TimeCategory"] = pd.DatetimeIndex(df['Timestamp']).round('30min')  

In [99]:
df['Time_hour'] = df['Timestamp'].values.astype('<M8[h]')

In [100]:
COLTYPES = {
    
    "Time_hour" : 'str',
    "Rain" : 'float32',
    "Temp" : 'float32',
    "Windspeed" : 'float32'
           }

In [101]:
df2 = pd.read_csv('WeatherData_cleaned.csv', dtype=COLTYPES, parse_dates=[0])

In [102]:
df = pd.merge(df,df2, how='inner', on='Time_hour')

In [103]:
df.drop('Time_hour', axis=1, inplace=True)

In [104]:
hdf_path = 'cleaned_store.h5'

In [105]:
# Convert data type category to object before writing the hdf file, reason is to avoid conflicts
for column in categorical_columns:
    df[column] = df[column].astype('object')

In [106]:
df.to_hdf(hdf_path, 'table_name', mode='a', format='table', append=True, data_columns=True,complevel=9, complib='blosc')

In [107]:
df = pd.read_hdf("cleaned_store.h5", key="table_name", where='Week_Day == 6')

In [108]:
df.head()

Unnamed: 0,Timestamp,Journey_Pattern_ID,Time_Frame,Vehicle_Journey_ID,Bus_Operator,Longitude,Latitude,Delay_seconds,Block_ID,Vehicle_ID,Stop_ID,At_Stop,Week_Day,Distance,TravelTime,TimeCategory,Rain,Temp,windSpeed
0,2012-11-11 10:55:24,2380001,2012-11-11,14326,HN,-6.3782,53.419167,20,238002,24587,7073,1,6,0.0,0.0,2012-11-11 11:00:00,0.0,4.9,5.67
1,2012-11-11 10:57:22,2380001,2012-11-11,14326,HN,-6.38474,53.420254,106,238002,24587,7097,1,6,0.523641,118.0,2012-11-11 11:00:00,0.0,4.9,5.67
2,2012-11-11 10:59:42,2380001,2012-11-11,14326,HN,-6.393175,53.420818,201,238002,24587,7099,0,6,1.123999,258.0,2012-11-11 11:00:00,0.0,4.9,5.67
3,2012-11-11 10:00:45,1220001,2012-11-11,14241,PO,-6.265253,53.334057,44,12203,33096,1285,0,6,8.605123,1761.0,2012-11-11 10:00:00,0.0,4.9,5.67
4,2012-11-11 10:03:43,1220001,2012-11-11,14241,PO,-6.265603,53.332573,128,12203,33096,1286,1,6,9.069071,1939.0,2012-11-11 10:00:00,0.0,4.9,5.67


In [109]:
df.shape

(747672, 19)

In [110]:
df.tail()

Unnamed: 0,Timestamp,Journey_Pattern_ID,Time_Frame,Vehicle_Journey_ID,Bus_Operator,Longitude,Latitude,Delay_seconds,Block_ID,Vehicle_ID,Stop_ID,At_Stop,Week_Day,Distance,TravelTime,TimeCategory,Rain,Temp,windSpeed
747667,2013-01-28 00:01:46,420001,2013-01-27,16787,CF,-6.131581,53.435257,-345,42008,43070,3615,1,6,19.711021,1898.0,2013-01-28 00:00:00,0.0,2.6,11.3
747668,2013-01-28 00:05:06,420001,2013-01-27,16787,CF,-6.134617,53.422699,-174,42008,43070,3616,1,6,21.699932,2098.0,2013-01-28 00:00:00,0.0,2.6,11.3
747669,2013-01-28 01:00:10,651003,2013-01-27,16846,RD,-6.263841,53.343029,-485,65006,38006,1357,0,6,30.195194,2506.0,2013-01-28 01:00:00,0.0,2.7,9.41
747670,2013-01-28 01:01:31,651003,2013-01-27,16846,RD,-6.261706,53.344307,-424,65006,38006,1358,0,6,30.459973,2587.0,2013-01-28 01:00:00,0.0,2.7,9.41
747671,2013-01-28 01:19:06,651003,2013-01-27,16846,RD,-6.246666,53.344067,448,65006,38006,5189,1,6,31.925939,3642.0,2013-01-28 01:30:00,0.0,2.7,9.41
