In [1]:
import pandas as pd
import datetime
from geopy.distance import vincenty

In [2]:
# Read the bus observations. The CSV supplies its own header row (see the
# column names in the preview below), so the manual column assignment is
# kept only for reference and stays disabled.
df = pd.read_csv("January_Bus_8_to_10.csv", low_memory = False)
# df.columns = ["TimeStamp", "LineId", "JourneyPatternId", "TimeFrame", "VehicleJourneyId", "Congestion", "Long", "Lat", "Delay", "BlockId", "StopId"]

In [3]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,TimeStamp,LineId,JourneyPatternId,TimeFrame,VehicleJourneyId,Congestion,Long,Lat,Delay,BlockId,StopId,weekday
0,2012-11-08 00:00:07,102,1021001,2012-11-07,435,0,-6.212934,53.409649,-443,102001,4381,3
1,2012-11-08 00:02:37,123,1231001,2012-11-07,5191,0,-6.3252,53.324451,294,123012,6056,3
2,2012-11-08 00:00:03,13,130007,2012-11-07,4169,0,-6.389081,53.325191,538,13009,4663,3
3,2012-11-08 00:03:42,13,130007,2012-11-07,4169,0,-6.401314,53.317501,546,13009,4667,3
4,2012-11-08 00:05:43,13,130007,2012-11-07,4169,0,-6.412405,53.324871,491,13009,2136,3


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(387172, 12)

In [5]:
# Parse the timestamp strings into real datetimes.
df["TimeStamp"] = pd.to_datetime(df["TimeStamp"], format="%Y-%m-%d %H:%M:%S")

# Identifier-like columns are stored as categoricals rather than numbers.
categorical_columns = [
    "LineId",
    "JourneyPatternId",
    "TimeFrame",
    "VehicleJourneyId",
    "Congestion",
    "BlockId",
    "StopId",
]
for column_name in categorical_columns:
    df[column_name] = df[column_name].astype("category")

In [6]:
# Verify the conversions: TimeStamp should be datetime64 and the ID columns categorical.
df.dtypes

TimeStamp           datetime64[ns]
LineId                    category
JourneyPatternId          category
TimeFrame                 category
VehicleJourneyId          category
Congestion                category
Long                       float64
Lat                        float64
Delay                        int64
BlockId                   category
StopId                    category
weekday                      int64
dtype: object

In [7]:
# Order the rows by TimeFrame, line, journey pattern and vehicle journey,
# with the observations inside each group in chronological order.
journey_sort_keys = ['TimeFrame', 'LineId', 'JourneyPatternId', 'VehicleJourneyId', 'TimeStamp']
df = df.sort_values(by=journey_sort_keys, ascending=True)

In [8]:
# Replace the shuffled post-sort index with a fresh 0..n-1 index.
# drop=True discards the old index outright instead of materialising it as
# an 'index' column that then has to be deleted (and which would raise a
# KeyError if the old index happened to carry a name).
df = df.reset_index(drop=True)

In [9]:
# Recompute the weekday column from TimeStamp (pandas dayofweek: Monday=0 ... Sunday=6),
# overwriting the weekday values that were loaded from the CSV.
df['weekday'] = df['TimeStamp'].dt.dayofweek

In [10]:
# Count the entries in the JourneyPatternId value-count table,
# i.e. how many distinct journey patterns are present.
df["JourneyPatternId"].value_counts().count()

461

In [11]:
# Inspect the first ten rows after sorting and re-indexing.
df.head(10)

Unnamed: 0,TimeStamp,LineId,JourneyPatternId,TimeFrame,VehicleJourneyId,Congestion,Long,Lat,Delay,BlockId,StopId,weekday
0,2012-11-08 00:01:38,4,40001,2012-11-07,5331,0,-6.153467,53.288651,453,4001,7226,3
1,2012-11-08 00:00:07,4,41002,2012-11-07,5416,0,-6.265067,53.388851,-23,4012,281,3
2,2012-11-08 00:00:09,7,70001,2012-11-07,6963,0,-6.135499,53.292816,-113,7014,2040,3
3,2012-11-08 00:03:30,7,70001,2012-11-07,6963,0,-6.14274,53.292503,0,7014,2043,3
4,2012-11-08 00:08:50,7,70001,2012-11-07,6963,0,-6.139555,53.273537,-6,7014,3210,3
5,2012-11-08 00:09:48,7,70001,2012-11-07,6963,0,-6.138042,53.271317,-24,7014,3211,3
6,2012-11-08 00:12:10,7,70001,2012-11-07,6963,0,-6.139968,53.269024,55,7014,3212,3
7,2012-11-08 00:14:25,7,70001,2012-11-07,6963,0,-6.137609,53.26207,22,7014,3215,3
8,2012-11-08 00:01:38,7,70001,2012-11-07,6966,0,-6.129867,53.248032,-83,7009,4982,3
9,2012-11-08 00:00:09,7,71001,2012-11-07,6855,0,-6.233802,53.330879,2527,7001,487,3


## Now Add The Distance, Destination Stop, Time Taken

In [12]:
def get_distance(lat1, long1, lat2, long2):
    """Return the distance in metres between two (latitude, longitude) points.

    The points are handed to geopy's ``vincenty`` as ``(lat, long)`` tuples and
    the result is read back through its ``.meters`` attribute.

    NOTE(review): ``vincenty`` was deprecated in geopy 1.13 and removed in 2.0
    in favour of ``geodesic`` - confirm the pinned geopy version.
    """
    origin, destination = (lat1, long1), (lat2, long2)
    separation = vincenty(origin, destination)
    return separation.meters

In [14]:
# Per-row features, built in df row order:
#   travel_time - seconds elapsed since the first observation of the journey
#   distance    - cumulative metres travelled since that first observation
# A "journey" is a run of consecutive rows sharing the same VehicleJourneyId,
# so df must already be sorted with each journey contiguous and chronological.
# NOTE(review): only VehicleJourneyId is compared; two adjacent groups that
# share an id but differ in TimeFrame/LineId/JourneyPatternId would be merged.
travel_time = list()
distance = list()

# Sentinel that never equals a real id, so the very first row takes the
# "new journey" branch without depending on the index label being 0.
last_id = None

# Walk the four needed columns by name. This avoids positional lookups such as
# row[7] on a label-indexed Series (deprecated in recent pandas) and the cost
# of building a Series object for every row via iterrows().
for current_time, current_id, current_lat, current_long in zip(
        df["TimeStamp"], df["VehicleJourneyId"], df["Lat"], df["Long"]):

    # First observation of a journey: reset the reference point and totals.
    if current_id != last_id:
        last_id = current_id
        last_lat = current_lat
        last_long = current_long

        start_time = current_time
        last_distance = 0

        travel_time.append(0.0)
        distance.append(0)
        continue

    # abs() keeps the elapsed time non-negative regardless of operand order.
    travel_time.append(abs((start_time - current_time).total_seconds()))

    # Distance of this leg only; computed once per row, and only for rows
    # that actually continue a journey.
    current_distance = get_distance(current_lat, current_long, last_lat, last_long)

    distance.append(last_distance + current_distance)

    # The current point becomes the reference for the next leg.
    last_distance = distance[-1]
    last_lat = current_lat
    last_long = current_long

In [15]:
# Sanity check: there should be one travel-time entry per row of df.
len(travel_time)

387172

In [16]:
# Sanity check: there should be one distance entry per row of df.
len(distance)

387172

In [17]:
# Row count of df, to compare against the two list lengths.
df.shape

(387172, 12)

In [20]:
# Attach the elapsed seconds per row; the list was built in df row order.
df["TravelTime"] = travel_time

In [21]:
# Attach the cumulative metres per row; the list was built in df row order.
df["Distance"] = distance

In [23]:
# Re-number the rows 0..n-1. drop=True discards the old index outright
# instead of creating an 'index' column that then has to be deleted.
df = df.reset_index(drop=True)

In [1]:
# Preview the first rows with the new columns rather than rendering the
# whole frame.
df.head()

NameError: name 'df' is not defined

In [27]:
# Apply the journey ordering again: TimeFrame, line, journey pattern,
# vehicle journey, then chronological within each group.
ordering_columns = ['TimeFrame', 'LineId', 'JourneyPatternId', 'VehicleJourneyId', 'TimeStamp']
df = df.sort_values(by=ordering_columns, ascending=True)

In [30]:
# Spot-check: pull every row whose VehicleJourneyId is 146 so the computed
# TravelTime and Distance values can be inspected by eye.
test = df[df.VehicleJourneyId == 146]

In [31]:
# Display the spot-check rows.
test

Unnamed: 0,TimeStamp,LineId,JourneyPatternId,TimeFrame,VehicleJourneyId,Congestion,Long,Lat,Delay,BlockId,StopId,weekday,TravelTime,Distance
79203,2012-11-08 17:17:10,41,410001,2012-11-08,146,0,-6.25415,53.350376,185,41002,1171,3,0.0,0.0
79204,2012-11-08 17:19:48,41,410001,2012-11-08,146,0,-6.256622,53.353806,208,41002,1173,3,158.0,415.713536
79205,2012-11-08 17:21:10,41,410001,2012-11-08,146,0,-6.258642,53.35635,186,41002,1174,3,240.0,729.166802
79206,2012-11-08 17:24:28,41,410001,2012-11-08,146,0,-6.260883,53.360165,193,41002,15,3,438.0,1179.203514
79207,2012-11-08 17:27:47,41,410001,2012-11-08,146,0,-6.258309,53.362984,272,41002,17,3,637.0,1536.688322
79208,2012-11-08 17:36:21,41,410001,2012-11-08,146,0,-6.249843,53.376286,369,41002,203,3,1151.0,3120.734564
79209,2012-11-08 17:37:20,41,410001,2012-11-08,146,0,-6.246595,53.379551,369,41002,204,3,1210.0,3543.536997
79210,2012-11-08 17:38:22,41,410001,2012-11-08,146,0,-6.245199,53.381317,369,41002,205,3,1272.0,3760.929708
79211,2012-11-08 17:40:04,41,410001,2012-11-08,146,0,-6.244052,53.383217,419,41002,1620,3,1374.0,3985.74015
79212,2012-11-08 17:41:20,41,410001,2012-11-08,146,0,-6.244926,53.387997,378,41002,220,3,1450.0,4520.892265


In [None]:
# Persist the enriched frame. to_csv writes the index by default, so the
# file gains an unnamed leading column holding the row labels.
df.to_csv("3_days_cleaned_sprint_2.csv")