## Reading in Bus Data

In [1]:
import pandas as pd
import os, sys


# Load the per-trip table for bus line 66.
# A notebook has no real `__file__`; the previous os.path.abspath("__file__")
# resolved the *string* "__file__" against the working directory, so its
# dirname was simply the current working directory. Ask for that directly.
file_dir = os.getcwd()
# The CSV is looked up under 'bus_lines' in the parent of the working directory.
parent_dir = os.path.dirname(file_dir)
newPath = os.path.join(parent_dir, 'bus_lines/bus_66_trips.csv')

# index_col=False tells pandas not to promote the first CSV column to the index.
dftrips = pd.read_csv(newPath, delimiter=',', index_col=False)


In [2]:
# Show the final rows of the trips table (tail() returns five rows by default).
dftrips.tail()

Unnamed: 0,DayOfService,TripID,LineID,RouteID,Direction,PlannedTime_Dep,PlannedTime_Arr,ActualTime_Dep,ActualTime_Arr
11555,2017-06-30,5116922,66,66_11,1,42585,38700,43195.0,38696.0
11556,2017-06-30,5116923,66,66_13,2,46991,43500,47769.0,43386.0
11557,2017-06-30,5116930,66,66_11,1,76631,72900,76452.0,72894.0
11558,2017-06-30,5116931,66,66_13,2,80863,78300,81322.0,78260.0
11559,2017-06-30,5117541,66,66_13,2,61658,57900,62376.0,57881.0


In [3]:
# Load the leave-times table for bus line 66 (bus_66_lt.csv).
# os.getcwd() replaces os.path.abspath("__file__"), which resolved the literal
# string "__file__" against the working directory and therefore produced the
# same directory in a roundabout way.
file_dir = os.getcwd()
# The CSV is looked up under 'bus_lines' in the parent of the working directory.
parent_dir = os.path.dirname(file_dir)
newPath = os.path.join(parent_dir, 'bus_lines/bus_66_lt.csv')

# index_col=False tells pandas not to promote the first CSV column to the index.
dfleavetimes = pd.read_csv(newPath, delimiter=',', index_col=False)

In [4]:
# Show the opening rows of the leave-times table (head() returns five rows by default).
dfleavetimes.head()

Unnamed: 0,DayOfService,TripID,ProgrNumber,StopPointID,PlannedTime_Arr,PlannedTime_Dep,ActualTime_Arr,ActualTime_Dep,Duration
0,2017-01-03,4084264,19,2201,65540,65540,64669,64669,64669
1,2017-01-03,4084264,24,2215,65870,65870,65054,65054,385
2,2017-01-03,4084264,29,2219,66160,66160,65354,65364,310
3,2017-01-03,4084264,34,3887,66429,66429,65631,65631,277
4,2017-01-03,4084264,39,3955,66721,66721,65886,65886,255


In [5]:
# Print column dtypes, non-null counts and memory usage of the leave-times table.
dfleavetimes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349291 entries, 0 to 349290
Data columns (total 9 columns):
DayOfService       349291 non-null object
TripID             349291 non-null int64
ProgrNumber        349291 non-null int64
StopPointID        349291 non-null int64
PlannedTime_Arr    349291 non-null int64
PlannedTime_Dep    349291 non-null int64
ActualTime_Arr     349291 non-null int64
ActualTime_Dep     349291 non-null int64
Duration           349291 non-null int64
dtypes: int64(8), object(1)
memory usage: 24.0+ MB


### Merging dataframes

trips dataframe is joined onto leavetimes dataframe where tripid and dayofservice match. They are merged using an left join. 

In [6]:
# Left-join trip attributes onto each leave-times row. Columns that exist in
# both tables (apart from the join keys) get a _leave or _trips suffix.
join_keys = ['TripID', 'DayOfService']
df = dfleavetimes.merge(dftrips, how='left', on=join_keys, suffixes=('_leave', '_trips'))

In [7]:
# Label, then display, the (rows, columns) shape of the trips table.
print("Trips shape:")
dftrips.shape

Trips shape:


(11560, 9)

In [8]:
# Label, then display, the (rows, columns) shape of the leave-times table.
print("Leavetimes shape:")
dfleavetimes.shape

Leavetimes shape:


(349291, 9)

In [9]:
# Shape of the merged frame, for comparison against the two input shapes.
df.shape

(349291, 16)

## Reading in Weather Data

In [10]:
# Load the cleaned 2017 weather table.
# The location is an absolute path. os.path.join drops every earlier component
# when a later one is absolute, so the previous join onto parent_dir had no
# effect; the path is now stated directly.
# NOTE(review): machine-specific absolute path -- consider a configurable data directory.
newPath = '/home/student/data/weather/2017weatherClean.csv'

dfW17 = pd.read_csv(newPath, delimiter=',')

In [11]:
# Keep only the weather fields of interest; every other column is dropped.
weather_fields = [
    'dt',
    'temp',
    'pressure',
    'humidity',
    'wind_speed',
    'weather_description',
    'clouds_all',
    'wind_deg',
]
dfW17 = dfW17.loc[:, weather_fields]

In [12]:
#We want to look at only one direction
# .copy() makes the filtered rows an independent frame, so adding columns to it
# later does not operate on a slice of the merged data (SettingWithCopyWarning).
df = df.loc[df['Direction'] == 1].copy()

In [13]:
# Build one text key per row by concatenating the service date and the trip id,
# then collect the distinct keys.
day_text = df['DayOfService'].astype(str)
trip_text = df['TripID'].astype(str)
df['comb'] = day_text + trip_text
u = df['comb'].unique()

In [14]:
# Number of distinct combined DayOfService+TripID keys.
len(u)

3656

In [15]:
import numpy as np
trip_66 = df['TripID'].unique()
date_66 = df['DayOfService'].unique()

# Columns describing one stop-to-stop segment, before weather fields are attached.
segment_columns = ['dt','dayofweek','month','day','arrive_time','start_point','end_point','duration']

segment_frames = []  # one weather-enriched frame per successfully processed trip
skipped_trips = []   # (key, error) for every trip that could not be processed

# Each distinct 'comb' value identifies one DayOfService/TripID journey.
# A single groupby pass replaces re-filtering the whole frame once per key;
# sort=False yields groups in order of first appearance, which is the same
# order as df['comb'].unique().
for date_trip, trip_rows in df.groupby('comb', sort=False):
    day_trip = trip_rows.sort_values(by=['ProgrNumber']).reset_index(drop=True)
    try:
        service_day = pd.to_datetime(day_trip['DayOfService'])
        arrive_time = day_trip['ActualTime_Arr_leave']
        start_point = day_trip['StopPointID']

        # Service date as epoch seconds plus the recorded arrival time, cast to
        # int64 so it can be matched against the weather 'dt' key.
        # NOTE(review): the date is taken as midnight with no timezone handling --
        # confirm this matches the convention of the weather 'dt' values.
        unixtime = pd.DatetimeIndex(day_trip['DayOfService']).astype(np.int64)/1000000000 + arrive_time

        a = pd.DataFrame(
            {
                'dt': unixtime.astype(np.int64),
                'dayofweek': service_day.dt.dayofweek,
                'month': service_day.dt.month,
                'day': service_day.dt.day,
                'arrive_time': arrive_time,
                'start_point': start_point,
                # The next stop in sequence, and the time taken to reach it.
                'end_point': start_point.shift(-1),
                'duration': arrive_time.shift(-1) - arrive_time,
            },
            columns=segment_columns,
        )
        # The last stop of a trip has no successor, so it does not start a segment.
        a = a.iloc[:-1]

        # Attach the weather row whose 'dt' is nearest to each segment start.
        # merge_asof requires both frames to be sorted by the key.
        r = pd.merge_asof(a, dfW17, on="dt", direction='nearest')
    except Exception as exc:
        # Best-effort: a trip that cannot be processed is skipped, as before, but
        # it is now recorded instead of vanishing silently. Catching Exception
        # (not a bare except) keeps the loop interruptible with Ctrl-C.
        skipped_trips.append((date_trip, repr(exc)))
        continue
    segment_frames.append(r)

if segment_frames:
    # Concatenate once instead of growing the frame inside the loop. Frames are
    # reversed because each trip used to be prepended, and columns are ordered
    # alphabetically to match the column order of the recorded output.
    result = pd.concat(segment_frames[::-1]).sort_index(axis=1)
else:
    result = pd.DataFrame(columns=segment_columns)

if skipped_trips:
    print("Skipped {} of {} trips; see skipped_trips for details".format(
        len(skipped_trips), len(skipped_trips) + len(segment_frames)))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [16]:
# (rows, columns) of the assembled result table.
result.shape

(180589, 15)

In [17]:
# Preview the first ten rows of the result table.
result.head(10)

Unnamed: 0,arrive_time,clouds_all,day,dayofweek,dt,duration,end_point,humidity,month,pressure,start_point,temp,weather_description,wind_deg,wind_speed
0,72894,75.0,30,4,1498853694,64.0,493.0,76.0,6,1016.0,7391,286.72,broken clouds,320.0,6.0
1,72958,75.0,30,4,1498853758,55.0,494.0,76.0,6,1016.0,493,286.72,broken clouds,320.0,6.0
2,73013,75.0,30,4,1498853813,97.0,495.0,76.0,6,1016.0,494,286.72,broken clouds,320.0,6.0
3,73110,75.0,30,4,1498853910,124.0,400.0,76.0,6,1016.0,495,286.72,broken clouds,320.0,6.0
4,73234,75.0,30,4,1498854034,26.0,346.0,76.0,6,1016.0,400,286.72,broken clouds,320.0,6.0
5,73260,75.0,30,4,1498854060,210.0,317.0,76.0,6,1016.0,346,286.72,broken clouds,320.0,6.0
6,73470,75.0,30,4,1498854270,123.0,312.0,76.0,6,1016.0,317,286.72,broken clouds,320.0,6.0
7,73593,75.0,30,4,1498854393,81.0,1444.0,76.0,6,1016.0,312,286.72,broken clouds,320.0,6.0
8,73674,75.0,30,4,1498854474,99.0,1445.0,76.0,6,1016.0,1444,286.72,broken clouds,320.0,6.0
9,73773,75.0,30,4,1498854573,183.0,7078.0,76.0,6,1016.0,1445,286.72,broken clouds,320.0,6.0


In [18]:
# Persist the per-segment result table. The previous call wrote `df` (the
# direction-filtered merge of leave times and trips), which does not match the
# 'result_of_...' file name.
result.to_csv('result_of_66-dir1.csv')

In [19]:
#converting weather description into binary to use in regression model.
features = [ 'arrive_time','clouds_all','day','dayofweek','end_point','humidity',
           'month','pressure','start_point','temp','wind_deg','wind_speed']
# One indicator column per distinct weather_description value.
binary_weather = pd.get_dummies(result['weather_description'])
# The exported frame keeps every result column and appends the indicator columns.
# A former `df = result[features]` assignment was removed: it was overwritten
# immediately by this line and never took effect.
# NOTE(review): if only `features` plus the indicators were intended, the
# 'duration' column would also need to be kept -- confirm before narrowing.
df = pd.concat([result, binary_weather], axis=1)


In [20]:
# Write the frame with the added weather indicator columns to CSV
# (pandas also writes the DataFrame index as the first column by default).
df.to_csv('result_of_66-dir1Model.csv')

In [21]:
#getting seconds to hh:mm
#df['arrive_time'] = pd.to_datetime(df['arrive_time'], unit='m')

In [22]:
#removing date
#df['arrive_time'] = pd.Series([val.time() for val in df['arrive_time']])