# Script to get the volume data by VehClass
Created by: Apoorba

Date: March 27, 2019

Vehicle class:
0: Unknown
1: <20 ft.
2: 20-40 ft.
3: >40 ft.

In [1]:
import boto3 #Need to read and write files to aws s3 bucket
import pandas as pd
from io import BytesIO as StringIO 
import numpy as np

    Note: For Python 2.7 we import BytesIO under the name StringIO.
    This is not needed for Python 3.x;
    there, use `from io import StringIO` instead.

## Read the 5 min aggregate data 

The data is obtained from "Vol_dat.csv" file. We got this file from "Vol-Lamarie-Cheyenne-V1" script. 

In [2]:
# Fetch the raw 5-minute aggregate CSV from the S3 bucket and load it into a DataFrame.
client = boto3.client('s3')
obj = client.get_object(
    Key="abibeka/Data/Feb2018HiveVolDat_Ln_Cls.csv",
    Bucket='prod-sdc-tti-911061262852-us-east-1-bucket')
# The 'Body' entry of the get_object response is handed straight to read_csv.
dat = pd.read_csv(obj['Body'])
dat.head()

Unnamed: 0,t1.controller,t1.lane,t1.vehclass,t1.time5m,t1.speedmph_avg,t1.nrec,t2.direction,t2.milepost,t2.2015_adt
0,384,1,1,2018-02-01 00:05:00,,1,I,325.8,13890
1,384,1,2,2018-02-01 00:05:00,,1,I,325.8,13890
2,384,1,3,2018-02-01 00:05:00,63.550001,4,I,325.8,13890
3,384,3,3,2018-02-01 00:05:00,65.703334,3,I,325.8,13890
4,384,4,1,2018-02-01 00:05:00,65.650002,1,I,325.8,13890


## Process 5 min aggregate data
Remove the table prefix ("t1." / "t2.") from the column names, and retain only the relevant columns.

In [3]:
# Preview the rename mapping: every column name -> the second '.'-separated piece of that name.
dict((col, col.split('.')[1]) for col in dat.columns)

{'t1.controller': 'controller',
 't1.lane': 'lane',
 't1.nrec': 'nrec',
 't1.speedmph_avg': 'speedmph_avg',
 't1.time5m': 'time5m',
 't1.vehclass': 'vehclass',
 't2.2015_adt': '2015_adt',
 't2.direction': 'direction',
 't2.milepost': 'milepost'}

In [4]:
# Map every column name to the second '.'-separated piece of that name
# (e.g. 't1.controller' -> 'controller').
Rename1 = dict((col, col.split('.')[1]) for col in dat.columns)
# Columns kept for the rest of the notebook.
keep_cols = ['controller', 'time5m', 'nrec', 'speedmph_avg',
             'lane', 'vehclass', 'direction', 'milepost']
dat = (
    dat.rename(index=str, columns=Rename1)[keep_cols]
       # Friendlier names for the record count and the average speed.
       .rename(index=str, columns={'nrec': 'Vol', 'speedmph_avg': 'AvgSpd'})
)
dat.head()

Unnamed: 0,controller,time5m,Vol,AvgSpd,lane,vehclass,direction,milepost
0,384,2018-02-01 00:05:00,1,,1,1,I,325.8
1,384,2018-02-01 00:05:00,1,,1,2,I,325.8
2,384,2018-02-01 00:05:00,4,63.550001,1,3,I,325.8
3,384,2018-02-01 00:05:00,3,65.703334,3,3,I,325.8
4,384,2018-02-01 00:05:00,1,65.650002,4,1,I,325.8


In [5]:
# One row per (controller, time5m); one column per (measure, vehclass).
# Rows sharing a key are combined: Vol is summed, AvgSpd is averaged.
# NOTE(review): AvgSpd is a plain mean of the input rows' averages, not weighted by Vol -
# confirm that is intended.
dat_e = dat.pivot_table(
    index=['controller', 'time5m'],
    columns=['vehclass'],
    values=['Vol', 'AvgSpd'],
    aggfunc={'Vol': np.sum, 'AvgSpd': np.mean})

In [6]:
# Move the pivot's (controller, time5m) index back into ordinary columns.
# reset_index() returns a new frame, so dat_e is left untouched and no prior copy() is needed.
dat_e1 = dat_e.reset_index()
# Replace the numeric vehicle-class labels with readable names.
dat_e1 = dat_e1.rename(columns={0: "Class0", 1: "Class1", 2: "Class2", 3: "Class3"})

In [7]:
# Show the first five rows of the pivoted frame.
dat_e1.head(n=5)

Unnamed: 0_level_0,controller,time5m,AvgSpd,AvgSpd,AvgSpd,AvgSpd,Vol,Vol,Vol,Vol
vehclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Class0,Class1,Class2,Class3,Class0,Class1,Class2,Class3
0,384,2018-02-01 00:05:00,,65.650002,60.34,62.452963,,2.0,2.0,16.0
1,384,2018-02-01 00:10:00,,,63.280001,60.666666,,,2.0,7.0
2,384,2018-02-01 00:15:00,,70.397497,65.567501,60.1805,,3.0,3.0,11.0
3,384,2018-02-01 00:20:00,,,64.876668,62.1025,,,4.0,13.0
4,384,2018-02-01 00:25:00,,68.420002,,59.529167,,2.0,,8.0


## Create continuous date indices
Fill in the 5-minute intervals that have no observations, i.e. where the volume was 0.

In [8]:
# Sensors (controllers) present in the data.
dev = dat_e1['controller'].unique()
# Parse the 5-minute stamp into a real datetime column.
dat_e1['DateTime'] = pd.to_datetime(dat_e1['time5m'], format='%Y-%m-%d %H:%M:%S')
# Only the column is renamed; the row index is replaced by set_index() right below, so no
# index mapper is passed (a positional mapper combined with columns= raises TypeError on
# pandas >= 1.0).
dat_e1 = dat_e1.rename(columns={'controller': 'deviceId'})
# Index by sensor and timestamp.
dat_e1 = dat_e1.set_index(['deviceId', 'DateTime'])
# Complete 5-minute grid for February 2018. The stamps in the data start at 00:05:00
# (presumably marking the end of each interval), so the grid runs from 2018-02-01 00:05:00
# through 2018-03-01 00:00:00. Stopping at '2018-02-28' would end the grid at midnight at the
# start of Feb 28, and the reindex below would silently drop that whole day.
# Explicit end points are used instead of the `closed` keyword, which newer pandas removed.
index = pd.date_range('2018-02-01 00:05:00', '2018-03-01 00:00:00', freq='5min')
# Every (sensor, timestamp) combination.
multi_index = pd.MultiIndex.from_product([dev, index], names=['deviceId', 'DateTime'])
# Reindex: combinations with no observations become rows of NaN.
dat_e1 = dat_e1.reindex(multi_index)

In [9]:
#index.head(20)

In [10]:
# Collapse each two-level column label into one string, e.g. ('Vol', 'Class3') -> 'VolClass3'.
flat_names = []
for col in dat_e1.columns.values:
    flat_names.append(''.join(col).strip())
dat_e1.columns = flat_names

## Reset the index and merge data with index table

In [11]:
# Lookup table: one row per distinct (controller, milepost, direction) combination.
sensor_cols = ['controller', 'milepost', 'direction']
AB_SS = dat[sensor_cols].drop_duplicates()
AB_SS

Unnamed: 0,controller,milepost,direction
0,384,325.8,I
54828,385,326.9,B
122887,386,334.5,B
189898,387,336.1,B
256906,388,336.5,B
322027,389,338.1,B
386036,390,340.5,B
440839,391,343.8,B
509104,396,317.68,B
575744,1839,356.7,B


In [12]:
# Attach milepost / direction to every row by joining with the sensor lookup table AB_SS.
# The frame is flattened for the merge and then put back on its original index levels.
index_names = dat_e1.index.names
dat_e1 = (
    dat_e1.reset_index()
          .merge(AB_SS, how="left", left_on=['deviceId'], right_on="controller")
          .set_index(index_names)
)
# Drop columns that are no longer needed.
dat_e1 = dat_e1.drop(columns=['time5m', 'controller'])
# Replace NaN with 0 in the rows that had no observations.
# NOTE(review): this also turns missing AvgSpd* values into 0.0 - confirm consumers expect that.
dat_e1 = dat_e1.fillna(0)
# Volumes are counts: cast them to int.
for vol_col in ['VolClass0', 'VolClass1', 'VolClass2', 'VolClass3']:
    dat_e1[vol_col] = dat_e1[vol_col].astype(int)
# Rename columns.
dat_e1 = dat_e1.rename(columns={'milepost': "Milepost", "direction": "Direction"})

In [13]:
# Turn the deviceId / DateTime index levels back into regular columns, then preview.
dat_e1 = dat_e1.reset_index(drop=False)
dat_e1.head(n=5)

Unnamed: 0,deviceId,DateTime,AvgSpdClass0,AvgSpdClass1,AvgSpdClass2,AvgSpdClass3,VolClass0,VolClass1,VolClass2,VolClass3,Milepost,Direction
0,384,2018-02-01 00:05:00,0.0,65.650002,60.34,62.452963,0,2,2,16,325.8,I
1,384,2018-02-01 00:10:00,0.0,0.0,63.280001,60.666666,0,0,2,7,325.8,I
2,384,2018-02-01 00:15:00,0.0,70.397497,65.567501,60.1805,0,3,3,11,325.8,I
3,384,2018-02-01 00:20:00,0.0,0.0,64.876668,62.1025,0,0,4,13,325.8,I
4,384,2018-02-01 00:25:00,0.0,68.420002,0.0,59.529167,0,2,0,8,325.8,I


In [14]:
# Show the first five rows of the processed frame.
dat_e1.head(n=5)

Unnamed: 0,deviceId,DateTime,AvgSpdClass0,AvgSpdClass1,AvgSpdClass2,AvgSpdClass3,VolClass0,VolClass1,VolClass2,VolClass3,Milepost,Direction
0,384,2018-02-01 00:05:00,0.0,65.650002,60.34,62.452963,0,2,2,16,325.8,I
1,384,2018-02-01 00:10:00,0.0,0.0,63.280001,60.666666,0,0,2,7,325.8,I
2,384,2018-02-01 00:15:00,0.0,70.397497,65.567501,60.1805,0,3,3,11,325.8,I
3,384,2018-02-01 00:20:00,0.0,0.0,64.876668,62.1025,0,0,4,13,325.8,I
4,384,2018-02-01 00:25:00,0.0,68.420002,0.0,59.529167,0,2,0,8,325.8,I


In [15]:
# Total vehicles per row = sum of the four per-class volumes.
total_veh = dat_e1['VolClass0'] + dat_e1['VolClass1'] + dat_e1['VolClass2'] + dat_e1['VolClass3']
dat_e1['TotVeh'] = total_veh
# Share of class-3 vehicles (the >40 ft. class) in percent, rounded to two decimals.
hgv_share = dat_e1['VolClass3'] * 100 / dat_e1['TotVeh']
dat_e1['PerHGV'] = hgv_share.round(2)
# No combined AvgSpd is computed on purpose: averaging the four per-class speed columns
# would be skewed by the classes that had no observations (missing values).


  result = com._values_from_object(self).round(decimals)


## Write the final output in Z drive

In [16]:
#dat_e1.to_csv('Z:/Apoorb/Data/Feb2018ProcessVolDatClass.csv')

## Write the final output to a folder in bucket

In [17]:
# Serialise the processed frame to CSV in an in-memory buffer.
# index=True also writes the row index as an extra, unnamed first column.
csv_buffer = StringIO()
dat_e1.to_csv(csv_buffer, sep=",", index=True)
# Upload the buffer contents as an object in the bucket.
s3 = boto3.resource('s3')
out_object = s3.Object(
    'prod-sdc-tti-911061262852-us-east-1-bucket',
    'abibeka/Data/Feb2018ProcessVolDatClass.csv')
out_object.put(Body=csv_buffer.getvalue())

{u'ETag': '"a45a22bf893e038c16af8c037d0c7d20"',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
   'date': 'Thu, 25 Apr 2019 19:21:15 GMT',
   'etag': '"a45a22bf893e038c16af8c037d0c7d20"',
   'server': 'AmazonS3',
   'x-amz-id-2': 'ZPIDYPANSVRF0Ci/OQj43PlAq1dIzHlj2cNxEUcoIqOOIoHEtVd2t5cKXn0b0BC4sBXVQ5MWXLs=',
   'x-amz-request-id': 'D9743690213A35CF'},
  'HTTPStatusCode': 200,
  'HostId': 'ZPIDYPANSVRF0Ci/OQj43PlAq1dIzHlj2cNxEUcoIqOOIoHEtVd2t5cKXn0b0BC4sBXVQ5MWXLs=',
  'RequestId': 'D9743690213A35CF',
  'RetryAttempts': 0}}

## Read the file created above

In [18]:
# Read the uploaded CSV back from the bucket as a quick check of what was written.
client = boto3.client('s3')
obj = client.get_object(
    Key='abibeka/Data/Feb2018ProcessVolDatClass.csv',
    Bucket='prod-sdc-tti-911061262852-us-east-1-bucket')
dat_test = pd.read_csv(obj['Body'])
dat_test.head()

Unnamed: 0.1,Unnamed: 0,deviceId,DateTime,AvgSpdClass0,AvgSpdClass1,AvgSpdClass2,AvgSpdClass3,VolClass0,VolClass1,VolClass2,VolClass3,Milepost,Direction,TotVeh,PerHGV
0,0,384,2018-02-01 00:05:00,0.0,65.650002,60.34,62.452963,0,2,2,16,325.8,I,20,80.0
1,1,384,2018-02-01 00:10:00,0.0,0.0,63.280001,60.666666,0,0,2,7,325.8,I,9,77.78
2,2,384,2018-02-01 00:15:00,0.0,70.397497,65.567501,60.1805,0,3,3,11,325.8,I,17,64.71
3,3,384,2018-02-01 00:20:00,0.0,0.0,64.876668,62.1025,0,0,4,13,325.8,I,17,76.47
4,4,384,2018-02-01 00:25:00,0.0,68.420002,0.0,59.529167,0,2,0,8,325.8,I,10,80.0
