# Final Clean

In [1]:
import pandas as pd
import datetime
import numpy as np
import itertools
from geopy.distance import vincenty
from ipykernel import kernelapp as app
%load_ext Cython
import numba
import gc

In [2]:
# Load only the rows with Week_Day == 0 from the "table_name" key of the HDF5 store.
# NOTE(review): the `where=` filter presumably needs the store to be in table format with
# Week_Day written as a data column — confirm against the notebook that created it.
df = pd.read_hdf("cleaning_store.h5", key="table_name", where='Week_Day == 0')

In [3]:
gc.collect()

200

In [4]:
# Renumber the rows 0..n-1 and discard the index that came out of the store
df = df.reset_index(drop=True)

In [5]:
df.head()

Unnamed: 0,Timestamp,Journey_Pattern_ID,Time_Frame,Vehicle_Journey_ID,Bus_Operator,Longitude,Latitude,Delay_seconds,Block_ID,Vehicle_ID,Stop_ID,Week_Day
0,2012-11-12 06:00:04,00271003,2012-11-12,4846,RD,-6.398682,53.288082,8,27002,33319,2629,0
1,2012-11-12 06:00:04,00151001,2012-11-12,5781,RD,-6.328866,53.271633,2,15001,33488,6282,0
2,2012-11-12 06:00:04,046A0007,2012-11-12,7484,D1,-6.252283,53.341732,0,831238,33349,767,0
3,2012-11-12 06:00:04,00070001,2012-11-12,6931,D1,-6.25845,53.357067,2,7017,43002,4962,0
4,2012-11-12 06:00:06,039A1001,2012-11-12,3662,PO,-6.391812,53.39415,49,39003,36058,4747,0


In [6]:
df.dtypes

Timestamp             datetime64[ns]
Journey_Pattern_ID            object
Time_Frame            datetime64[ns]
Vehicle_Journey_ID            object
Bus_Operator                  object
Longitude                    float64
Latitude                     float64
Delay_seconds                  int32
Block_ID                      object
Vehicle_ID                    object
Stop_ID                       object
Week_Day                       int64
dtype: object

#### Note About The Following Cell:
This is the way we are sorting the data. First the Timeframe is the most important since it holds a unique Vehicle Journey ID (or it should) for every journey each day. 
Ideally we would then sort by VehicleJourneyId but the reality is that the data is messy. In order to avoid two buses at opposite sides of the city causing issues with the dataset we will now sort by the vehicle ID.
Next the vehicle journey ID makes sense since we can then start to sort the entire dataset into individual journeys along a certain route.
Lastly timestamp is obvious.

We'll use this cell several times to keep the dataframe sorted and the index correct, as many of the loops in this notebook require this.

You should deduce from this that there will be several cases of a VehicleJourneyId being repeated. Cleaning this up is part of the challenge.

In [7]:
# Organise the Data
#df.sort_values(['Time_Frame', 'Vehicle_ID', 'Vehicle_Journey_ID', 'Timestamp'], ascending=True, inplace=True)

## Remove General Noise

In [8]:
# Remove every journey (Time_Frame, Vehicle_ID, Vehicle_Journey_ID) that is 5 rows long or
# shorter. Realistically even a short journey should produce more rows than that; anything
# smaller is treated as noise.
df = (
    df
    .groupby(['Time_Frame', 'Vehicle_ID', 'Vehicle_Journey_ID'], as_index=False, group_keys=False)
    .filter(lambda journey: len(journey) > 5)
)

In [9]:
# Put the rows in canonical order: day, then vehicle, then journey, then time
df = df.sort_values(by=['Time_Frame', 'Vehicle_ID', 'Vehicle_Journey_ID', 'Timestamp'])

## Remove Extra JourneyPatternId's From VehicleJourneyId's

In [10]:
# One group per journey: (day, vehicle, journey id)
gb = df.groupby(
    by=['Time_Frame', 'Vehicle_ID', 'Vehicle_Journey_ID'],
    group_keys=False,
    as_index=False,
)

In [11]:
def delete_outlier_journeypatternid(group):
    """Drop the minority Journey_Pattern_ID from a journey that has exactly two of them.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one journey; must contain a ``Journey_Pattern_ID`` column.

    Returns
    -------
    pandas.DataFrame
        ``group`` restricted to its most frequent Journey_Pattern_ID when exactly two
        distinct IDs occur with different counts. In every other case (one ID, three or
        more IDs, or two IDs with equal counts) ``group`` is returned unchanged.
    """
    # Deliberately not decorated with @numba.jit: Numba cannot type pandas objects, so the
    # decorator either falls back to plain interpretation or raises a compilation error,
    # depending on the Numba version.
    pattern_counts = group["Journey_Pattern_ID"].value_counts()

    # Only the two-ID case is handled here
    if len(pattern_counts) != 2:
        return group

    # value_counts() is ordered by descending count, so position 0 is the dominant ID.
    # Use .iloc: the index holds the ID values, so [0] / [1] would be label lookups.
    if pattern_counts.iloc[0] == pattern_counts.iloc[1]:
        # Both IDs occupy the same number of rows: likely a 'noise' journey, leave it alone
        return group

    dominant_id = pattern_counts.index[0]
    return group[group["Journey_Pattern_ID"] == dominant_id]

In [12]:
# Run delete_outlier_journeypatternid on every journey group and reassemble the frame
df = gb.apply(delete_outlier_journeypatternid)
# Author's timing notes: about 120 seconds originally,
# down to about 79 seconds with the numba jit decorator.


In [13]:
# Re-group the cleaned frame by journey: (day, vehicle, journey id)
gb = df.groupby(
    by=['Time_Frame', 'Vehicle_ID', 'Vehicle_Journey_ID'],
    group_keys=False,
    as_index=False,
)

In [14]:
# Keep only the journeys that are left with exactly one distinct Journey_Pattern_ID value.
# This removes journeys with three IDs and other noise, including nulls.
df = gb.filter(
    lambda journey: len(journey["Journey_Pattern_ID"].unique()) == 1
)

## Remove Vehicle Journey IDs With Two Occurrences Each Day

These can either be...
* Two buses completing the same route together with a stopover.
* Incomplete journeys.
* General noise in the data.

In any case they occupy less than 1% of the data and change week to week, so they can be dropped without overall loss of data integrity.

In [15]:
# Within one day a journey id should belong to a single vehicle: keep only the
# (Time_Frame, Vehicle_Journey_ID) groups that contain exactly one distinct Vehicle_ID.
gb = df.groupby(by=["Time_Frame", "Vehicle_Journey_ID"], group_keys=False, as_index=False)
df = gb.filter(
    lambda day_journey: len(day_journey["Vehicle_ID"].unique()) == 1
)

# Clean Up Start Of Journeys

In [16]:
# Put the rows in canonical order (day, vehicle, journey, time) and renumber the index
# 0..n-1; the row-by-row loops that follow depend on that ordering and index.
#
# The result is rebound instead of using inplace=True: df comes out of a groupby filter,
# and mutating it in place is what emitted "A value is trying to be set on a copy of a
# slice from a DataFrame" when this cell was run.
df = (
    df
    .sort_values(['Time_Frame', 'Vehicle_ID', 'Vehicle_Journey_ID', 'Timestamp'], ascending=True)
    .reset_index(drop=True)
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [17]:
df.dtypes

Timestamp             datetime64[ns]
Journey_Pattern_ID            object
Time_Frame            datetime64[ns]
Vehicle_Journey_ID            object
Bus_Operator                  object
Longitude                    float64
Latitude                     float64
Delay_seconds                  int32
Block_ID                      object
Vehicle_ID                    object
Stop_ID                       object
Week_Day                       int64
dtype: object

In [18]:
def GPS_clean(df):
    """Flag the rows at the start of each journey where the bus has not moved yet.

    Rows are walked in order. A new journey starts whenever ``Vehicle_Journey_ID`` differs
    from the previous row. While a journey is still at its start and a row has exactly the
    same Latitude and Longitude as the row before it, the *previous* row is flagged by
    setting its ``Vehicle_Journey_ID`` to 0. As soon as the coordinates change the journey
    is considered under way and nothing more is flagged until the next journey begins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Vehicle_Journey_ID``, ``Latitude`` and ``Longitude`` and already be
        sorted so that the rows of one journey are contiguous and in time order.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place. An empty frame is returned untouched.
    """
    # Deliberately not decorated with @numba.jit: Numba cannot type pandas objects, so the
    # decorator cannot speed this up and fails outright on versions that require nopython.
    if len(df) == 0:
        return df

    # Read the columns by name (not by tuple position) so column order does not matter
    journey_ids = df["Vehicle_Journey_ID"].tolist()
    latitudes = df["Latitude"].tolist()
    longitudes = df["Longitude"].tolist()

    last_bus_id = journey_ids[0]
    last_lat = latitudes[0]
    last_long = longitudes[0]
    new_start = True
    stationary_positions = []

    for position in range(1, len(journey_ids)):
        current_bus_id = journey_ids[position]
        current_lat = latitudes[position]
        current_long = longitudes[position]

        # A different journey id: restart the "still at the start" tracking
        if current_bus_id != last_bus_id:
            last_bus_id = current_bus_id
            last_lat = current_lat
            last_long = current_long
            new_start = True
            continue

        if new_start:
            if current_lat == last_lat and current_long == last_long:
                # Still parked at the start: the previous row is redundant
                stationary_positions.append(position - 1)
            else:
                # The bus has moved; the journey has properly started
                new_start = False

    # Write all flags in one positional assignment (replaces the removed DataFrame.set_value
    # and does not depend on the index labels being 0..n-1)
    if stationary_positions:
        id_column = df.columns.get_loc("Vehicle_Journey_ID")
        df.iloc[stationary_positions, id_column] = 0

    return df

In [19]:
# Flag the stationary rows at the start of each journey (their Vehicle_Journey_ID becomes 0)
df = GPS_clean(df)

In [20]:
#iterrows, original loop: 400 seconds
#numba just in time compiler +  itertuples + skipping first row: 25 seconds

In [21]:
# Filter Out Rows Flagged
# Discard the rows whose Vehicle_Journey_ID was set to 0 as a flag
df = df.loc[df["Vehicle_Journey_ID"] != 0]

In [22]:
# Canonical order (day, vehicle, journey, time), then a fresh 0..n-1 index
df = (
    df
    .sort_values(by=['Time_Frame', 'Vehicle_ID', 'Vehicle_Journey_ID', 'Timestamp'])
    .reset_index(drop=True)
)

In [23]:
df.shape

(2484496, 12)

## Add Distance Feature

Before dropping duplicate StopId's we must first measure the distance on each route. This requires that we use all rows of data.

This will also make any VehicleJourneyId's which miss a stop along their journey useful data in the model.

In [24]:
def get_distance(lat1, long1, lat2, long2):
    """Return the distance in metres between two (latitude, longitude) points.

    Parameters
    ----------
    lat1, long1 : float
        Latitude and longitude of the first point.
    lat2, long2 : float
        Latitude and longitude of the second point.

    Returns
    -------
    float
        The ``.meters`` value of geopy's ``vincenty`` distance between the two points.
    """
    # Deliberately not decorated with @numba.jit: the body is a call into geopy, which
    # Numba cannot compile, so the decorator adds overhead at best and an error at worst.
    #
    # NOTE(review): geopy removed `vincenty` in version 2.0 in favour of `geodesic`;
    # confirm the pinned geopy version before upgrading the environment.
    return vincenty((lat1, long1), (lat2, long2)).meters

In [25]:
df.dtypes

Timestamp             datetime64[ns]
Journey_Pattern_ID            object
Time_Frame            datetime64[ns]
Vehicle_Journey_ID            object
Bus_Operator                  object
Longitude                    float64
Latitude                     float64
Delay_seconds                  int32
Block_ID                      object
Vehicle_ID                    object
Stop_ID                       object
Week_Day                       int64
dtype: object

In [None]:
def add_distance_todf(df, distance_fn=None):
    """Add a ``Distance`` column holding the cumulative distance travelled in each journey.

    Rows are walked in order. A new journey starts whenever ``Vehicle_Journey_ID`` differs
    from the previous row; its first row gets 0. Every following row of the same journey
    gets the previous row's value plus the distance between the two rows' coordinates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Vehicle_Journey_ID``, ``Latitude`` and ``Longitude`` and already be
        sorted so that the rows of one journey are contiguous and in time order.
    distance_fn : callable, optional
        ``distance_fn(lat, long, previous_lat, previous_long)`` returning a number.
        Defaults to this notebook's ``get_distance``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the float ``Distance`` column added in place.
    """
    # Deliberately not decorated with @numba.jit: Numba cannot type pandas objects.
    if distance_fn is None:
        distance_fn = get_distance

    # Read the columns by name (not by tuple position) so column order does not matter
    journey_ids = df["Vehicle_Journey_ID"].tolist()
    latitudes = df["Latitude"].tolist()
    longitudes = df["Longitude"].tolist()

    cumulative = []
    running_total = 0.0
    last_id = last_lat = last_long = None

    for position in range(len(journey_ids)):
        current_id = journey_ids[position]
        current_lat = latitudes[position]
        current_long = longitudes[position]

        if position == 0 or current_id != last_id:
            # First row of a journey: the distance restarts at zero
            running_total = 0.0
        else:
            # The distance is evaluated once per row, and only inside a journey
            running_total = running_total + distance_fn(current_lat, current_long, last_lat, last_long)

        cumulative.append(running_total)
        last_id = current_id
        last_lat = current_lat
        last_long = current_long

    # One column assignment replaces the per-row DataFrame.set_value calls (removed in
    # pandas 1.0); an array is assigned so the values are placed by position
    df["Distance"] = np.asarray(cumulative, dtype="float64")
    return df

In [None]:
# Add the per-journey cumulative "Distance" column
df = add_distance_todf(df)

In [None]:
#original code: > 5 minutes, I stopped the loop and proceeded optimizing
# jit numba + itertuples + skip first row: 246 seconds

In [None]:
df.head()

## Remove Stop ID Duplicates

Now we can shrink the dataframe a little by removing duplicate Stop IDs. Although it would be better to train on every row, that might be too much to compute.

This will also help us later in making the database stop distances. There is not enough information to map the exact distance to each stop, so we will have to take some kind of average of the AtStop == 0 columns and subtract a little to get a rough distance to each stop in a Journey Pattern ID.

Because this section keeps the first occurrence of each StopId, this should be a very accurate way to estimate the distance to each stop on each route.

In [None]:
# Keep only the first row for each Stop_ID within a journey: once a stop has been seen,
# every later row of that journey with the same Stop_ID is deleted.
df = df.drop_duplicates(
    subset=["Time_Frame", 'Vehicle_ID', "Vehicle_Journey_ID", "Stop_ID"],
    keep="first",
)

In [None]:
# Canonical order (day, vehicle, journey, time), then a fresh 0..n-1 index
df = (
    df
    .sort_values(by=["Time_Frame", 'Vehicle_ID', "Vehicle_Journey_ID", 'Timestamp'])
    .reset_index(drop=True)
)

In [None]:
# Check size of dataframe
df.shape

In [None]:
df.dtypes

## Add Time Taken Feature

In [None]:
def time_taken_feature(df):
    """Add a ``TravelTime`` column: seconds elapsed since the first row of each journey.

    A new journey starts whenever ``Vehicle_Journey_ID`` differs from the previous row.
    The first row of a journey gets 0.0; every other row gets the absolute number of
    seconds between its ``Timestamp`` and the ``Timestamp`` of that first row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Vehicle_Journey_ID`` and a datetime ``Timestamp`` column, and
        already be sorted so that the rows of one journey are contiguous.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the float ``TravelTime`` column added in place.
    """
    # Deliberately not decorated with @numba.jit: Numba cannot type pandas objects. The
    # row-by-row loop (and the removed DataFrame.set_value) is replaced by array operations.
    row_count = len(df)
    if row_count == 0:
        df["TravelTime"] = np.array([], dtype="float64")
        return df

    # True on every row whose journey id differs from the row before it
    journey_ids = df["Vehicle_Journey_ID"]
    is_start = np.array(journey_ids.ne(journey_ids.shift()), dtype=bool)
    is_start[0] = True

    # For each row, the position of the first row of its journey: a running maximum over
    # "own position if this row starts a journey, else 0"
    positions = np.arange(row_count)
    start_positions = np.maximum.accumulate(np.where(is_start, positions, 0))

    timestamps = df["Timestamp"].reset_index(drop=True)
    start_times = timestamps.iloc[start_positions].reset_index(drop=True)
    elapsed = (timestamps - start_times).dt.total_seconds().abs()

    travel_time = np.array(elapsed, dtype="float64")
    travel_time[is_start] = 0.0  # a journey's first row is always exactly zero

    df["TravelTime"] = travel_time
    return df

In [None]:
# Add the per-journey "TravelTime" column (seconds since the journey's first row)
df = time_taken_feature(df)

## Add Time Category Feature

In [None]:
# Bucket every Timestamp to the nearest 30 minutes
df["TimeCategory"] = df["Timestamp"].dt.round("30min")

# Final Clean (added clean for Block ID's)
## Section 1: Remove Extra Block ID's From VehicleJourneyId's

In [None]:
# Note df size for later
df.shape

In [None]:
df.dtypes

In [None]:
# Check Issue of how many Journey's have two Block ID's

def journeyID_two_blockID(df):
    """Report how many journeys carry more than one Block_ID.

    Journeys are the (Time_Frame, Vehicle_ID, Vehicle_Journey_ID) groups. The number of
    journeys with more than one distinct non-null Block_ID is printed together with the
    total number of journeys. Previously the count was computed and then thrown away.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Time_Frame``, ``Vehicle_ID``, ``Vehicle_Journey_ID`` and ``Block_ID``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, unchanged, so existing ``df = journeyID_two_blockID(df)`` calls
        keep working.
    """
    # Deliberately not decorated with @numba.jit: Numba cannot type pandas objects.
    journeys = df.groupby(["Time_Frame", 'Vehicle_ID', "Vehicle_Journey_ID"])

    # Distinct non-null Block_ID values per journey (same as len(value_counts()) per group)
    block_ids_per_journey = journeys["Block_ID"].nunique()
    count = int((block_ids_per_journey > 1).sum())

    print("Journeys with more than one Block_ID: {} of {}".format(count, len(block_ids_per_journey)))
    return df

        

In [None]:
# Run the Block_ID check; the function returns the frame it was given, so df is unchanged
df = journeyID_two_blockID(df)

## Section 2: If there are not many of them, delete them and skip Section 3

In [None]:
# One group per journey: (day, vehicle, journey id)
gb = df.groupby(
    by=["Time_Frame", 'Vehicle_ID', "Vehicle_Journey_ID"],
    group_keys=False,
    as_index=False,
)

In [None]:
# Keep only the journeys with exactly one distinct Block_ID value; journeys with several
# Block IDs and other noise, including nulls, are removed.
df = gb.filter(
    lambda journey: len(journey["Block_ID"].unique()) == 1
)

## Now Delete All Journeys With 1 Row Of Data

In [None]:
# One group per journey: (day, vehicle, journey id)
gb = df.groupby(
    by=["Time_Frame", 'Vehicle_ID', "Vehicle_Journey_ID"],
    group_keys=False,
    as_index=False,
)

In [None]:
# Remove every journey (Time_Frame, Vehicle_ID, Vehicle_Journey_ID) that consists of a
# single row; only journeys with more than one row are kept.
gb = df.groupby(["Time_Frame", 'Vehicle_ID', "Vehicle_Journey_ID"], as_index=False, group_keys=False)

df = gb.filter(lambda journey: len(journey) > 1)

In [None]:
# Truncate every Timestamp to hour resolution; Time_hour is the key used by the merge below
df['Time_hour'] = df['Timestamp'].values.astype('<M8[h]')

In [None]:
# Column dtypes requested when reading the weather CSV
COLTYPES = dict(
    Time_hour='str',
    Rain='float32',
    Temp='float32',
    Windspeed='float32',
)

In [None]:
# Read the cleaned weather data with the dtypes from COLTYPES, parsing the first column as dates.
# NOTE(review): the first column is presumably Time_hour, the merge key — confirm against the CSV.
df2 = pd.read_csv('WeatherData_cleaned.csv', dtype=COLTYPES, parse_dates=[0])

In [None]:
# Attach the weather columns by Time_hour. This is an inner join, so bus rows whose
# Time_hour has no match in the weather data are dropped.
df = pd.merge(df,df2, how='inner', on='Time_hour')

In [None]:
# Remove the helper join column now that the merge is done.
# axis=1 selects columns; the earlier `index=1` was passed alongside the label, which
# DataFrame.drop rejects instead of dropping the column.
df = df.drop('Time_hour', axis=1)

In [None]:
# HDF5 store that receives the cleaned data
hdf_path = 'cleaned_store.h5'

In [None]:
# Write the cleaned frame to the 'table_name' key in table format, blosc-compressed at level 9.
# NOTE(review): mode='a' with append=True adds to whatever is already stored under the key,
# so re-running this cell against the same store presumably writes the rows a second time.
df.to_hdf(hdf_path, 'table_name', mode='a', format='table', append=True, data_columns=True,complevel=9, complib='blosc')