# Script to get the volume data by sensor lane
Created by: Apoorba

Date: March 27, 2019

In [1]:
import boto3 #Need to read and write files to aws s3 bucket
import pandas as pd
from io import BytesIO as StringIO 
import numpy as np

    Note: For Python 2.7 we import BytesIO under the name StringIO.
    This aliasing is not needed for Python 3.x.
    There, just use `from io import StringIO`.

## Read the 5 min aggregate data 

The data is read from the "Vol_dat_Ln_Cls.csv" file, which we got from the "Vol-Lamarie-Cheyenne-V1" script. 

In [2]:
# Load the 5-minute volume records (data I got from a report) from the S3 bucket.
client = boto3.client('s3')
obj = client.get_object(
    Bucket='prod-sdc-tti-911061262852-us-east-1-bucket',
    Key="abibeka/Vol_dat_Ln_Cls.csv"
)
# The 'Body' entry of the response is handed straight to pandas for parsing.
dat = pd.read_csv(obj['Body'])
dat.head()

Unnamed: 0,t1.controller,t1.lane,t1.vehclass,t1.time5m,t1.nrec,t2.direction,t2.milepost,t2.2015_adt
0,384,1,1,2018-02-01 00:05:00,1,I,325.8,13890
1,384,1,2,2018-02-01 00:05:00,1,I,325.8,13890
2,384,1,3,2018-02-01 00:05:00,4,I,325.8,13890
3,384,3,3,2018-02-01 00:05:00,3,I,325.8,13890
4,384,4,1,2018-02-01 00:05:00,1,I,325.8,13890


## Read the index file for the speed sensors.
The index file for the speed sensors is "SpeedSen_MP_314_360.csv". We got it from one of the WyDOT reports.

In [3]:
# Load the speed-sensor index table (device id, site name, milepost, location)
# from the uploaded_files folder of the S3 bucket.
client = boto3.client('s3')
obj = client.get_object(
    Bucket='prod-sdc-tti-911061262852-us-east-1-bucket',
    Key="abibeka/uploaded_files/SpeedSen_MP_314_360.csv"
)
# The 'Body' entry of the response is handed straight to pandas for parsing.
AB_SS = pd.read_csv(obj['Body'])
AB_SS.head()

Unnamed: 0,DEVICEID,SITENAME,MP,Sensor_Loc
0,396,Laramie East,317.68,EB
1,3911,Telephone Canyon,320.7,WB
2,395,Summit,322.05,WB
3,2146,Summit,322.6,WB
4,2147,Summit,323.85,EB


## Process 5 min aggregate data
Remove the table prefix ("t1." / "t2.") from each column name. Also retain only the relevant columns.

In [4]:
# Strip the table prefix ("t1." / "t2.") from every column name.
# Splitting on the first dot only and taking the last piece leaves a name without
# a dot unchanged, so the cell no longer raises IndexError on an un-prefixed column
# and can safely be re-run after the columns have already been renamed.
Rename1 = {col: col.split('.', 1)[-1] for col in dat.columns}
# Only the column labels are renamed; the row index is left untouched
# (the previous index=str argument converted every row label to a string).
dat = dat.rename(columns=Rename1)
# Keep only the columns needed to build the volume-by-lane table.
dat = dat[['controller', 'time5m', 'nrec', 'lane', 'vehclass']]
dat.head()

Unnamed: 0,controller,time5m,nrec,lane,vehclass
0,384,2018-02-01 00:05:00,1,1,1
1,384,2018-02-01 00:05:00,1,1,2
2,384,2018-02-01 00:05:00,4,1,3
3,384,2018-02-01 00:05:00,3,3,3
4,384,2018-02-01 00:05:00,1,4,1


In [5]:
# Sum nrec over all rows that share a (controller, time5m, lane) combination and
# spread the lanes into columns; combinations with no rows are filled with 0.
dat_e = dat.pivot_table(
    values='nrec',
    index=['controller', 'time5m'],
    columns=['lane'],
    aggfunc=np.sum,
    fill_value=0
)

In [6]:
# Turn the pivot table into a flat frame with one named column per lane.
# The earlier form `dat_e1=dat_e.columns.name=None` was a chained assignment: it
# bound dat_e1 to None and cleared the columns name on dat_e itself, throwing away
# the copy made on the line before. rename_axis returns a new frame instead, so
# dat_e is left exactly as the pivot produced it.
dat_e1 = (
    dat_e
    .rename_axis(None, axis="columns")  # drop the "lane" label from the column axis
    .reset_index()                      # controller / time5m become ordinary columns
    .rename(columns={1: "Lane1", 2: "Lane2", 3: "Lane3", 4: "Lane4"})
)

In [7]:
# Preview the first five rows of the flattened by-lane table.
dat_e1.head(n=5)

Unnamed: 0,controller,time5m,Lane1,Lane2,Lane3,Lane4
0,384,2018-02-01 00:05:00,6,0,3,11
1,384,2018-02-01 00:10:00,6,0,0,3
2,384,2018-02-01 00:15:00,6,1,1,9
3,384,2018-02-01 00:20:00,9,3,0,5
4,384,2018-02-01 00:25:00,3,1,0,6


## Create continuous date indices
Basically, fill in the observations for the intervals where the volume was 0 (those intervals are missing from the data).

In [8]:
# Controllers present in the data; each one is paired with every time stamp below.
dev = dat_e1['controller'].unique()
# Parse the time5m strings into datetimes
dat_e1['DateTime'] = pd.to_datetime(dat_e1['time5m'], format='%Y-%m-%d %H:%M:%S')
# Index the rows by controller and time stamp
dat_e1 = dat_e1.set_index(['controller', 'DateTime'])
# Create the time interval index without missing values.
# The end points are written out explicitly instead of using closed='right':
# that keyword was removed from date_range in pandas 2.0, while this form yields
# the same stamps (2018-02-01 00:05 through 2018-02-28 00:00) on every version.
# NOTE(review): the grid stops at 2018-02-28 00:00, so any records later on
# Feb 28 would be dropped by the reindex below -- confirm this is intended.
index = pd.date_range('2018-02-01 00:05:00', '2018-02-28 00:00:00', freq='5min')
# Create a multiindex with every (controller, time stamp) combination
multi_index = pd.MultiIndex.from_product([dev, index], names=['deviceId', 'DateTime'])
# Reindex. Combinations that had no record now hold NaN.
dat_e1 = dat_e1.reindex(multi_index)

In [9]:
# Preview the first five rows after reindexing onto the full time grid.
dat_e1.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,time5m,Lane1,Lane2,Lane3,Lane4
deviceId,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
384,2018-02-01 00:05:00,2018-02-01 00:05:00,6.0,0.0,3.0,11.0
384,2018-02-01 00:10:00,2018-02-01 00:10:00,6.0,0.0,0.0,3.0
384,2018-02-01 00:15:00,2018-02-01 00:15:00,6.0,1.0,1.0,9.0
384,2018-02-01 00:20:00,2018-02-01 00:20:00,9.0,3.0,0.0,5.0
384,2018-02-01 00:25:00,2018-02-01 00:25:00,3.0,1.0,0.0,6.0


## Reset the index and merge data with index table

In [10]:
# Add information about the speed sensor by merging with the index table.
# The frame is flattened to a single index for the merge and then the original
# (deviceId, DateTime) multi index is restored.
index_names = list(dat_e1.index.names)
lane_cols = ['Lane1', 'Lane2', 'Lane3', 'Lane4']
dat_e1 = (
    dat_e1.reset_index()
    .merge(AB_SS, how="left", left_on=['deviceId'], right_on="DEVICEID")
    .set_index(index_names)
    # Drop unnecessary columns
    .drop(columns=['time5m', 'DEVICEID'])
)
# Replace NaN with 0 for the time stamps that had no record, and restore the
# integer dtype. Only the lane counts are filled: a frame-wide fillna(0) would
# also write 0 into SITENAME / MP / Sensor_Loc for any controller that has no
# match in AB_SS, which would look like real metadata.
dat_e1[lane_cols] = dat_e1[lane_cols].fillna(0).astype(int)
# Rename columns
dat_e1 = dat_e1.rename(columns={'MP': "Milepost", "Sensor_Loc": "Direction"})

In [11]:
# Preview the first five rows of the merged table.
dat_e1.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Lane1,Lane2,Lane3,Lane4,SITENAME,Milepost,Direction
deviceId,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
384,2018-02-01 00:05:00,6,0,3,11,Summit East,325.8,WB
384,2018-02-01 00:10:00,6,0,0,3,Summit East,325.8,WB
384,2018-02-01 00:15:00,6,1,1,9,Summit East,325.8,WB
384,2018-02-01 00:20:00,9,3,0,5,Summit East,325.8,WB
384,2018-02-01 00:25:00,3,1,0,6,Summit East,325.8,WB


In [12]:
# Preview the first five rows for the controller with deviceId 384.
dat_e1.loc[384].head(n=5)

Unnamed: 0_level_0,Lane1,Lane2,Lane3,Lane4,SITENAME,Milepost,Direction
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-02-01 00:05:00,6,0,3,11,Summit East,325.8,WB
2018-02-01 00:10:00,6,0,0,3,Summit East,325.8,WB
2018-02-01 00:15:00,6,1,1,9,Summit East,325.8,WB
2018-02-01 00:20:00,9,3,0,5,Summit East,325.8,WB
2018-02-01 00:25:00,3,1,0,6,Summit East,325.8,WB


In [13]:
#Can unstack the second index level (DateTime) into columns to reshape the data
#dat_e1.unstack(1).head()

## Write the final output to the Z drive

In [14]:
# Save the processed table as a CSV file on the Z drive.
local_csv_path = 'Z:/Apoorb/Process_dat_Ln.csv'
dat_e1.to_csv(local_csv_path)

## Write the final output to a folder in bucket

In [15]:
# Write the table as CSV into an in-memory buffer (StringIO is the name bound in
# the import cell), then upload the buffer contents to the S3 bucket.
csv_buffer = StringIO()
dat_e1.to_csv(csv_buffer, sep=",", index=True)
s3 = boto3.resource('s3')
target = s3.Object(
    'prod-sdc-tti-911061262852-us-east-1-bucket',
    'abibeka/Data/ProcessVolDat_Ln.csv'
)
# Last expression of the cell, so the put() response is displayed as the output.
target.put(Body=csv_buffer.getvalue())

{u'ETag': '"bfeb4c6e04d8d36c8f869b2a335d5aa1"',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
   'date': 'Wed, 27 Mar 2019 22:29:34 GMT',
   'etag': '"bfeb4c6e04d8d36c8f869b2a335d5aa1"',
   'server': 'AmazonS3',
   'x-amz-id-2': 'Dq8d2UH0F82Cz7jRJkgMTKoh2FnCrKir9jPeNyMnWdtUp9zEzT7DpzqawgngiIEzVKlJpEwMt0E=',
   'x-amz-request-id': '0882E399B1AEAE6A'},
  'HTTPStatusCode': 200,
  'HostId': 'Dq8d2UH0F82Cz7jRJkgMTKoh2FnCrKir9jPeNyMnWdtUp9zEzT7DpzqawgngiIEzVKlJpEwMt0E=',
  'RequestId': '0882E399B1AEAE6A',
  'RetryAttempts': 0}}

## Read the file created above

In [16]:
# Read the uploaded CSV back from the S3 bucket to check the result.
client = boto3.client('s3')
obj = client.get_object(
    Bucket='prod-sdc-tti-911061262852-us-east-1-bucket',
    Key='abibeka/Data/ProcessVolDat_Ln.csv'
)
# The 'Body' entry of the response is handed straight to pandas for parsing.
dat_test = pd.read_csv(obj['Body'])
dat_test.head()

Unnamed: 0,deviceId,DateTime,Lane1,Lane2,Lane3,Lane4,SITENAME,Milepost,Direction
0,384,2018-02-01 00:05:00,6,0,3,11,Summit East,325.8,WB
1,384,2018-02-01 00:10:00,6,0,0,3,Summit East,325.8,WB
2,384,2018-02-01 00:15:00,6,1,1,9,Summit East,325.8,WB
3,384,2018-02-01 00:20:00,9,3,0,5,Summit East,325.8,WB
4,384,2018-02-01 00:25:00,3,1,0,6,Summit East,325.8,WB
