# Merge the Volume and Speed Data with the Weather Data

    Created by: Apoorba Bibeka
    Date: 04/12/2019

In [1]:
import boto3 #Need to read and write files to aws s3 bucket
import pandas as pd
from io import BytesIO as StringIO 
import numpy as np

## Load the RWIS ATMOS Weather Data

In [2]:
# Fetch the Feb 2018 RWIS ATMOS weather CSV from S3 and parse it with pandas.
client = boto3.client('s3')
obj = client.get_object(
    Bucket='prod-sdc-tti-911061262852-us-east-1-bucket',
    Key='abibeka/Data/Feb2018HiveRWIS_AtosDat.csv',
)
# The 'Body' entry of the response is passed to pandas as the file object.
RWISdat = pd.read_csv(obj['Body'])
# Preview the first rows.
RWISdat.head()

Unnamed: 0,rwisid,avgairtemp,avgwindspd,avgwindgust,avgwinddir,setwinddir,setprecpints,avgprecprt,avgprecpaccm,avgvisft,setprecptyp,time5m
0,1354,16.5,11.0,17.0,250.0,"[""W"",""W""]","[""None"",""None""]",0.0,-99999.0,6560.0,"[""None"",""None""]",2018-02-01 00:00:00
1,1354,16.0,9.0,12.0,254.0,"[""W"",""W""]","[""None"",""None""]",0.0,-99999.0,6560.0,"[""None"",""None""]",2018-02-01 00:05:00
2,1354,15.3,6.0,10.0,260.0,"[""W"",""W""]","[""None"",""None""]",0.0,-99999.0,6560.0,"[""None"",""None""]",2018-02-01 00:10:00
3,1354,14.7,6.0,9.0,261.0,"[""W"",""W""]","[""None"",""None""]",0.0,-99999.0,6560.0,"[""None"",""None""]",2018-02-01 00:15:00
4,1354,14.5,7.0,9.0,256.0,"[""W"",""W""]","[""Slight"",""Slight""]",0.0,-99999.0,6560.0,"[""Snow"",""Snow""]",2018-02-01 00:20:00


## Load RWIS Surface Data

In [3]:
# Fetch the Feb 2018 RWIS surface CSV from S3 and parse it with pandas.
client = boto3.client('s3')
obj = client.get_object(
    Bucket='prod-sdc-tti-911061262852-us-east-1-bucket',
    Key='abibeka/Data/Feb2018HiveRWIS_SurfDat.csv',
)
# The 'Body' entry of the response is passed to pandas as the file object.
Surfdat = pd.read_csv(obj['Body'])
# Preview the first rows.
Surfdat.head()

Unnamed: 0,rwisid,avgchempct,setsurfstat,time5m
0,1354,-99999.0,"[""Trace Moisture"",""Trace Moisture""]",2018-02-01 00:00:00
1,1354,-99999.0,"[""Trace Moisture"",""Trace Moisture""]",2018-02-01 00:05:00
2,1354,-99999.0,"[""Trace Moisture"",""Trace Moisture""]",2018-02-01 00:10:00
3,1354,-99999.0,"[""Trace Moisture"",""Trace Moisture""]",2018-02-01 00:15:00
4,1354,-99999.0,"[""Trace Moisture"",""Trace Moisture""]",2018-02-01 00:20:00


In [4]:
# Combine the weather and surface readings per station and 5-minute timestamp.
# how='outer' keeps rows that appear in only one of the two tables.
RWISdat1 = RWISdat.merge(Surfdat, how='outer', on=['rwisid', 'time5m'])

## Load the Volume Data

In [5]:
# Fetch the processed Feb 2018 volume/speed-by-class CSV from S3.
client = boto3.client('s3')
obj = client.get_object(
    Bucket='prod-sdc-tti-911061262852-us-east-1-bucket',
    Key='abibeka/Data/Feb2018ProcessVolDatClass.csv',
)
# Parse the CSV and discard its 'Unnamed: 0' column in one step.
VolDat = pd.read_csv(obj['Body']).drop(columns='Unnamed: 0')
# Preview the first rows.
VolDat.head()

Unnamed: 0,deviceId,DateTime,AvgSpdClass0,AvgSpdClass1,AvgSpdClass2,AvgSpdClass3,VolClass0,VolClass1,VolClass2,VolClass3,Milepost,Direction,TotVeh,PerHGV
0,384,2018-02-01 00:05:00,0.0,65.650002,60.34,62.452963,0,2,2,16,325.8,I,20,80.0
1,384,2018-02-01 00:10:00,0.0,0.0,63.280001,60.666666,0,0,2,7,325.8,I,9,77.78
2,384,2018-02-01 00:15:00,0.0,70.397497,65.567501,60.1805,0,3,3,11,325.8,I,17,64.71
3,384,2018-02-01 00:20:00,0.0,0.0,64.876668,62.1025,0,0,4,13,325.8,I,17,76.47
4,384,2018-02-01 00:25:00,0.0,68.420002,0.0,59.529167,0,2,0,8,325.8,I,10,80.0


In [6]:
# Show the column labels of the volume table.
VolDat.columns

Index([u'deviceId', u'DateTime', u'AvgSpdClass0', u'AvgSpdClass1',
       u'AvgSpdClass2', u'AvgSpdClass3', u'VolClass0', u'VolClass1',
       u'VolClass2', u'VolClass3', u'Milepost', u'Direction', u'TotVeh',
       u'PerHGV'],
      dtype='object')

## Load the mapping table between speed sensor stations and their backup weather stations

In [7]:
# Fetch the speed-sensor -> backup RWIS station lookup table from S3.
# NOTE(review): SecKeyMap is not referenced again in the visible part of this
# notebook -- confirm whether the backup mapping is still needed.
client = boto3.client('s3')
obj = client.get_object(
    Bucket='prod-sdc-tti-911061262852-us-east-1-bucket',
    Key='abibeka/Data/SecndRWIS_SpdID_Map.csv',
)
SecKeyMap = pd.read_csv(obj['Body'])
# Preview the first rows.
SecKeyMap.head()

Unnamed: 0,SecStat,backup_rwis
0,383,36
1,393,3422
2,394,36
3,395,36
4,396,36


## Load the mapping table between speed sensor stations and their primary (nearest) weather stations

In [8]:
# Fetch the speed-sensor -> primary (nearest) RWIS station lookup table from S3.
client = boto3.client('s3')
obj = client.get_object(
    Bucket='prod-sdc-tti-911061262852-us-east-1-bucket',
    Key='abibeka/Data/PrimRWIS_SpdID_Map.csv',
)
PrimKeyMap = pd.read_csv(obj['Body'])
# Preview the first rows.
PrimKeyMap.head()

Unnamed: 0,PrimStat,nearest_rwis
0,383,1354
1,393,36
2,394,1354
3,395,1354
4,396,1354


In [9]:
# Put the weather data on a gap-free (station, 5-minute timestamp) grid so that
# intervals missing from the source appear as NaN rows.

# Parse the 5-minute timestamp strings into datetimes.
RWISdat1['DateTime'] = pd.to_datetime(RWISdat1['time5m'], format='%Y-%m-%d %H:%M:%S')
# Station ids present in the data (read before the column is renamed below).
dev = RWISdat1.rwisid.unique().tolist()
# Rename the station column only. The former positional `str` mapper merely
# stringified the row index, which set_index() below discards anyway, and
# pandas 1.0+ raises TypeError when a mapper is combined with `columns=`.
RWISdat1 = RWISdat1.rename(columns={'rwisid': 'RWISid'})
# Set index
RWISdat1 = RWISdat1.set_index(['RWISid', 'DateTime'])
# Create time interval index without missing values.
# closed='right' leaves out 2018-02-01 00:00:00 and keeps the end point.
# NOTE(review): the end point is 2018-02-28 00:00:00, so readings later on
# Feb 28 are not part of the grid -- confirm this is intended.
index = pd.date_range('2018-02-01', '2018-02-28', closed='right', freq='5min')
# Create a multiindex holding every (station, timestamp) combination.
multi_index = pd.MultiIndex.from_product([dev, index], names=['RWISid', 'DateTime'])
# Reindex. Combinations absent from the data now hold NaN.
RWISdat1 = RWISdat1.reindex(multi_index)
# Turn RWISid / DateTime back into ordinary columns.
RWISdat1 = RWISdat1.reset_index()

In [10]:
# Preview the first rows of the reindexed weather table.
RWISdat1.head()

Unnamed: 0,RWISid,DateTime,avgairtemp,avgwindspd,avgwindgust,avgwinddir,setwinddir,setprecpints,avgprecprt,avgprecpaccm,avgvisft,setprecptyp,time5m,avgchempct,setsurfstat
0,1354,2018-02-01 00:05:00,16.0,9.0,12.0,254.0,"[""W"",""W""]","[""None"",""None""]",0.0,-99999.0,6560.0,"[""None"",""None""]",2018-02-01 00:05:00,-99999.0,"[""Trace Moisture"",""Trace Moisture""]"
1,1354,2018-02-01 00:10:00,15.3,6.0,10.0,260.0,"[""W"",""W""]","[""None"",""None""]",0.0,-99999.0,6560.0,"[""None"",""None""]",2018-02-01 00:10:00,-99999.0,"[""Trace Moisture"",""Trace Moisture""]"
2,1354,2018-02-01 00:15:00,14.7,6.0,9.0,261.0,"[""W"",""W""]","[""None"",""None""]",0.0,-99999.0,6560.0,"[""None"",""None""]",2018-02-01 00:15:00,-99999.0,"[""Trace Moisture"",""Trace Moisture""]"
3,1354,2018-02-01 00:20:00,14.5,7.0,9.0,256.0,"[""W"",""W""]","[""Slight"",""Slight""]",0.0,-99999.0,6560.0,"[""Snow"",""Snow""]",2018-02-01 00:20:00,-99999.0,"[""Trace Moisture"",""Trace Moisture""]"
4,1354,2018-02-01 00:25:00,14.4,6.0,9.0,250.0,"[""W"",""W""]","[""Slight"",""Slight""]",0.0,-99999.0,6000.0,"[""Snow"",""Snow""]",2018-02-01 00:25:00,-99999.0,"[""Trace Moisture"",""Trace Moisture""]"


In [11]:
# Give both tables a string-typed 'time5m' column derived from 'DateTime'.
# It is used as a merge key between the volume and weather tables further down.
for frame in (VolDat, RWISdat1):
    frame['time5m'] = frame['DateTime'].astype(str)
# Show the (rows, columns) size of the weather table.
RWISdat1.shape

(38880, 15)

In [12]:
# Attach the primary weather-station id to every volume row.
# how='left' keeps all volume rows, including sensors without a mapping entry.
FinDat1 = VolDat.merge(PrimKeyMap, how='left', left_on='deviceId', right_on='PrimStat')

In [13]:
# Join each volume row to the weather row of its primary station at the same
# 5-minute timestamp. how='inner' keeps only rows matched on both sides.
FinDat2 = FinDat1.merge(
    RWISdat1,
    how='inner',
    left_on=["nearest_rwis", "time5m"],
    right_on=["RWISid", "time5m"],
)

In [14]:
# Show the (rows, columns) size of the merged table.
FinDat2.shape

(38880, 31)

In [15]:
# Both merge inputs had a 'DateTime' column, so the merge produced
# 'DateTime_x' / 'DateTime_y'; discard the right-hand copy.
FinDat2 = FinDat2.drop(labels='DateTime_y', axis=1)
# NOTE(review): this displays the shape of FinDat1, not FinDat2 -- confirm
# which table was meant.
FinDat1.shape

(38880, 17)

In [16]:
# Restore the plain 'DateTime' name for the left-hand timestamp column.
# The earlier `index=str` argument only converted the row index labels to
# strings; set_index() on the next line replaces that index, so the
# conversion was wasted work and is left out.
FinDat2 = FinDat2.rename(columns={'DateTime_x': 'DateTime'})
# Index the table by milepost and timestamp, then sort on that index.
FinDat2 = FinDat2.set_index(['Milepost', 'DateTime'])
FinDat2 = FinDat2.sort_index()
# Preview the first rows.
FinDat2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,deviceId,AvgSpdClass0,AvgSpdClass1,AvgSpdClass2,AvgSpdClass3,VolClass0,VolClass1,VolClass2,VolClass3,Direction,...,avgwindgust,avgwinddir,setwinddir,setprecpints,avgprecprt,avgprecpaccm,avgvisft,setprecptyp,avgchempct,setsurfstat
Milepost,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
318.5,2018-02-01 00:05:00,3654,0.0,0.0,56.379999,60.975999,0,0,2,5,D,...,12.0,254.0,"[""W"",""W""]","[""None"",""None""]",0.0,-99999.0,6560.0,"[""None"",""None""]",-99999.0,"[""Trace Moisture"",""Trace Moisture""]"
318.5,2018-02-01 00:10:00,3654,0.0,57.177501,0.0,60.294286,0,4,0,7,D,...,10.0,260.0,"[""W"",""W""]","[""None"",""None""]",0.0,-99999.0,6560.0,"[""None"",""None""]",-99999.0,"[""Trace Moisture"",""Trace Moisture""]"
318.5,2018-02-01 00:15:00,3654,0.0,0.0,0.0,60.965,0,0,0,8,D,...,9.0,261.0,"[""W"",""W""]","[""None"",""None""]",0.0,-99999.0,6560.0,"[""None"",""None""]",-99999.0,"[""Trace Moisture"",""Trace Moisture""]"
318.5,2018-02-01 00:20:00,3654,0.0,65.204998,64.59,62.877502,0,4,2,3,D,...,9.0,256.0,"[""W"",""W""]","[""Slight"",""Slight""]",0.0,-99999.0,6560.0,"[""Snow"",""Snow""]",-99999.0,"[""Trace Moisture"",""Trace Moisture""]"
318.5,2018-02-01 00:25:00,3654,0.0,0.0,0.0,63.527501,0,0,0,4,D,...,9.0,250.0,"[""W"",""W""]","[""Slight"",""Slight""]",0.0,-99999.0,6000.0,"[""Snow"",""Snow""]",-99999.0,"[""Trace Moisture"",""Trace Moisture""]"


In [18]:
# Lower-case weather column names -> the CamelCase names used in the output.
R_Nm = {
    'avgairtemp': 'AvgAirTemp',
    'avgwindspd': 'AvgWindSpd',
    'avgwindgust': 'AvgWindGust',
    'avgwinddir': 'AvgWindDir',
    'setwinddir': 'ListWindDir',
    'setprecpints': 'ListPrecpInts',
    'avgprecprt': 'AvgPrecpRt',
    'avgprecpaccm': 'AvgPrecpAccm',
    'avgvisft': 'AvgVisFt',
    'setprecptyp': 'ListPrecpTyp',
    'avgchempct': 'AvgChemPct',
    'setsurfstat': 'ListSurfStat',
}
# Remove the 'time5m' helper column, then apply the renames.
# index=str additionally passes the existing index labels through str().
FinDat2 = FinDat2.drop(columns=['time5m']).rename(index=str, columns=R_Nm)

In [None]:
# Write the merged table to a CSV file on the Z: drive.
# NOTE(review): the path is hard-coded to one machine's drive mapping.
FinDat2.to_csv('Z:/apoorb/MergeRwisVolDat.csv')

In [None]:
# Reload the merged table from the local CSV. No index_col is given, so the
# frame gets a default integer index and every CSV field becomes a column.
FinDat2=pd.read_csv('Z:/apoorb/MergeRwisVolDat.csv')

In [None]:
# Serialise the merged table to CSV in memory and upload it to S3.
# Note: `StringIO` is this file's import alias for io.BytesIO.
csv_buffer = StringIO()
# index=True writes the current index as the leading CSV column(s).
FinDat2.to_csv(csv_buffer, sep=",", index=True)
s3 = boto3.resource('s3')
s3.Object(
    'prod-sdc-tti-911061262852-us-east-1-bucket',
    'abibeka/Data/MergeRwisVolDat.csv',
).put(Body=csv_buffer.getvalue())

In [None]:
# Read the uploaded merged CSV back from S3.
client = boto3.client('s3')
obj = client.get_object(
    Bucket='prod-sdc-tti-911061262852-us-east-1-bucket',
    Key='abibeka/Data/MergeRwisVolDat.csv',
)
# Parse the CSV and discard its 'Unnamed: 0' column in one step.
# NOTE(review): that column is only present when the uploaded frame had an
# unnamed index -- confirm the upload cell was run on the re-read table.
FinDat2 = pd.read_csv(obj['Body']).drop(columns='Unnamed: 0')
# Preview the first 200 rows.
FinDat2.head(200)

In [None]:
# Print the column labels of the reloaded merged table.
print(FinDat2.columns)

In [None]:
# Preview the first rows of the merged table.
FinDat2.head()