# Get 5 min weather attributes for Feb 2018 from RWIS_atmos and RWIS_surface

    Created by: Apoorba Bibeka
    Date: April 11, 2019

In [1]:
import getpass
import boto3
import pandas as pd
from impala.dbapi import connect
from impala.util import as_pandas
from io import BytesIO as StringIO
import numpy as np

## Connect to the database

In [2]:
username = "abibeka@securedatacommons.com"
password1 = getpass.getpass()
conn = connect(host="172.18.1.20",auth_mechanism='PLAIN',port=10000,user=username, password=password1)

········


In [3]:
cursor = conn.cursor()

In [4]:
# wydot_rwis_atmos has bad naming convention. local is a keyword in HIVE
#but is also used as a variable name. Thus use `` for accessing local.

In [5]:
cursor = conn.cursor()
cursor.execute('''select deviceid from wydot_speed_sensors_index
               where milepost between 314 and 360''')
# Store the index data tanle
SSindex = as_pandas(cursor)
SSindexL=SSindex.deviceid.tolist()
print(SSindexL)

[383L, 393L, 394L, 395L, 396L, 384L, 385L, 1839L, 386L, 2146L, 2147L, 2178L, 2191L, 387L, 2202L, 2213L, 2246L, 2263L, 2274L, 2289L, 2298L, 2310L, 2319L, 388L, 389L, 3654L, 390L, 3911L, 391L]


## Find the corresponding primary and secondary weather stations for speed sensor stations

In [6]:
q1='''SELECT deviceid, nearest_rwis, rwis, backup_rwis
        from wydot_speed_sensors_index'''
cursor.execute(q1)
dat=as_pandas(cursor)
dat1=dat[dat['deviceid'].isin(SSindexL)].copy()
rwis_sta=dat1.nearest_rwis.tolist()+dat1.backup_rwis.tolist()
rwis_sta = [int(x.strip('R00')) for x in rwis_sta]
print(rwis_sta,len(np.unique(rwis_sta)))

([1354, 36, 1354, 1354, 1354, 1354, 1354, 3451, 3422, 1354, 1354, 36, 3422, 343, 3422, 3422, 1366, 1366, 3443, 3443, 3443, 3443, 3451, 343, 343, 1354, 1366, 1354, 1366, 36, 3422, 36, 36, 36, 36, 36, 3443, 343, 36, 36, 3422, 343, 3422, 343, 343, 343, 3443, 1366, 1366, 3451, 3451, 3443, 3422, 1366, 36, 343, 36, 3443], 7)


In [7]:
dat1

Unnamed: 0,deviceid,nearest_rwis,rwis,backup_rwis
1,383,R001354,WY28,R000360
2,393,R000360,KVDW,R003422
3,394,R001354,WY28,R000360
4,395,R001354,WY28,R000360
5,396,R001354,WY28,R000360
12,384,R001354,WY28,R000360
23,385,R001354,WY28,R000360
33,1839,R003451,KVDW,R003443
34,386,R003422,KVDW,R003430
41,2146,R001354,WY28,R000360


## Remove the timezone ('-6:00' or '-7:00') from the end.
## Remove 'T' from between the date and time.

Figure out what to do with precip_type

In [14]:
timefilter="FROM_UNIXTIME(CEILING(UNIX_TIMESTAMP(regexp_replace(regexp_replace(`local`,'-0[67]:00$',''),'T',' '))/300)*300)"
start_date='2018-02-01'
end_date='2018-02-28'
q1='''Select deviceid AS rwisid, AVG(airtemp) AS AvgAirTemp, AVG(windspeed_avg) AS AvgWindSpd, 
    AVG(windspeed_gust) AS AvgWindGust, AVG(winddir_avg) AS AvgWindDir, 
    COLLECT_SET(precip_intensity) AS SetPrecpInts, AVG(precip_rate) AS AvgPrecpRt,
    AVG(precip_accumulation) AS AvgPrecpAccm, AVG(visibilityft) AS AvgVisFt,
    COLLECT_SET(precip_type) AS SetPrecpTyp, 
    {} AS Time5M
    FROM wydot_rwis_atmos 
    WHERE deviceid IN {} AND {} BETWEEN '{}' AND '{}'
    GROUP BY deviceid, {}'''.format(timefilter,tuple(rwis_sta),timefilter,start_date, end_date,timefilter)
cursor.execute(q1)
Xdat=as_pandas(cursor)

In [7]:
#Xdat.head(5)
#print(dat.shape)

In [16]:
dat2=dat1.loc[:,['deviceid','backup_rwis']].copy()
dat2.backup_rwis=dat2.backup_rwis.str.strip('^R00')
dat2=dat2.rename(index=str,columns={'deviceid':'SecStat'})

dat3=dat1.loc[:,['deviceid','nearest_rwis']].copy()
dat3.nearest_rwis=dat3.nearest_rwis.str.strip('^R00')
dat3=dat3.rename(index=str,columns={'deviceid':'PrimStat'})


In [17]:
dat3

Unnamed: 0,PrimStat,nearest_rwis
1,383,1354
2,393,36
3,394,1354
4,395,1354
5,396,1354
12,384,1354
23,385,1354
33,1839,3451
34,386,3422
41,2146,1354


In [18]:
dat2

Unnamed: 0,SecStat,backup_rwis
1,383,36
2,393,3422
3,394,36
4,395,36
5,396,36
12,384,36
23,385,36
33,1839,3443
34,386,343
41,2146,36


In [20]:
Xdat.rwisid=Xdat.rwisid.astype(int)
dat3.nearest_rwis=dat3.nearest_rwis.astype(int)
#finDat = pd.merge(dat3,Xdat,left_on='nearest_rwis',right_on='RWISid',how='left')
#finDat.shape

In [21]:
csv_buffer=StringIO()
Xdat.to_csv(csv_buffer,sep=",",index=False)
s3=boto3.resource('s3')
s3.Object('prod-sdc-tti-911061262852-us-east-1-bucket','abibeka/Data/Feb2018HiveRWIS_AtosDat.csv').put(Body=csv_buffer.getvalue())

{u'ETag': '"11377f9488b64fd3462770e71fdd0ae9"',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
   'date': 'Thu, 11 Apr 2019 20:46:08 GMT',
   'etag': '"11377f9488b64fd3462770e71fdd0ae9"',
   'server': 'AmazonS3',
   'x-amz-id-2': 'F6EjDLCLviBUi88wGFqThaJPdu/q80dhuKeFDgsob8J4v+mwHfq+AI1BIxpLtx4z8PR6VOaa1ZY=',
   'x-amz-request-id': '66DD2917A5368C8D'},
  'HTTPStatusCode': 200,
  'HostId': 'F6EjDLCLviBUi88wGFqThaJPdu/q80dhuKeFDgsob8J4v+mwHfq+AI1BIxpLtx4z8PR6VOaa1ZY=',
  'RequestId': '66DD2917A5368C8D',
  'RetryAttempts': 0}}

In [22]:
csv_buffer=StringIO()
dat3.to_csv(csv_buffer,sep=",",index=False)
s3=boto3.resource('s3')
s3.Object('prod-sdc-tti-911061262852-us-east-1-bucket','abibeka/Data/PrimRWIS_SpdID_Map.csv').put(Body=csv_buffer.getvalue())

{u'ETag': '"27c39bcc539e2f96ca15b43767808e1d"',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
   'date': 'Thu, 11 Apr 2019 20:49:43 GMT',
   'etag': '"27c39bcc539e2f96ca15b43767808e1d"',
   'server': 'AmazonS3',
   'x-amz-id-2': 'aBG/cHo8fx+UgdGYDi9JQQLIADYnRad4DVzNElGiraYFyWs3ix3YRltCh/jHfi+C/4kRgj2xg5A=',
   'x-amz-request-id': '11EBBAB3A65A1EB5'},
  'HTTPStatusCode': 200,
  'HostId': 'aBG/cHo8fx+UgdGYDi9JQQLIADYnRad4DVzNElGiraYFyWs3ix3YRltCh/jHfi+C/4kRgj2xg5A=',
  'RequestId': '11EBBAB3A65A1EB5',
  'RetryAttempts': 0}}

In [23]:
csv_buffer=StringIO()
dat2.to_csv(csv_buffer,sep=",",index=False)
s3=boto3.resource('s3')
s3.Object('prod-sdc-tti-911061262852-us-east-1-bucket','abibeka/Data/SecndRWIS_SpdID_Map.csv').put(Body=csv_buffer.getvalue())

{u'ETag': '"e218d8d8c3b54472e4c66113cbba9e6b"',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
   'date': 'Thu, 11 Apr 2019 20:49:44 GMT',
   'etag': '"e218d8d8c3b54472e4c66113cbba9e6b"',
   'server': 'AmazonS3',
   'x-amz-id-2': 'GPOJubvRg+4ADu8L+tAghFmTrQKd8qbj+XMUpBevfNltPZAwH3G5jpR2Z4tt10nuztFSvHN+1G8=',
   'x-amz-request-id': 'E0A145ED2F96906F'},
  'HTTPStatusCode': 200,
  'HostId': 'GPOJubvRg+4ADu8L+tAghFmTrQKd8qbj+XMUpBevfNltPZAwH3G5jpR2Z4tt10nuztFSvHN+1G8=',
  'RequestId': 'E0A145ED2F96906F',
  'RetryAttempts': 0}}

## Understand the data 

In [None]:
start_date='2018-02-01'
end_date='2018-02-28'
q1='''SELECT deviceid, precip_intensity, precip_type, winddir_avg, `local` as DateTime, sensorid 
    FROM wydot_rwis_atmos 
    LIMIT 10'''
cursor.execute(q1)
volDebug=as_pandas(cursor)
print(volDebug)

In [None]:
q1='''describe wydot_rwis_atmos'''
cursor.execute(q1)
print(cursor.fetchall())

# RWIS Surface Data

In [9]:
timefilter="FROM_UNIXTIME(CEILING(UNIX_TIMESTAMP(regexp_replace(regexp_replace(`local`,'-0[67]:00$',''),'T',' '))/300)*300)"
start_date='2018-02-01'
end_date='2018-02-28'
q1='''Select deviceid AS rwisid, AVG(chem_pct) AS AvgChemPct,
    COLLECT_SET(surface_status) AS SetSurfStat, 
    {} AS Time5M
    FROM wydot_rwis_surface 
    WHERE deviceid IN {} AND {} BETWEEN '{}' AND '{}'
    GROUP BY deviceid, {}'''.format(timefilter,tuple(rwis_sta),timefilter,start_date, end_date,timefilter)
cursor.execute(q1)
SurfDat=as_pandas(cursor)

In [10]:
SurfDat.head()

Unnamed: 0,rwisid,avgchempct,setsurfstat,time5m
0,1354,-99999.0,"[""Trace Moisture""]",2018-02-01 00:00:00
1,1354,-99999.0,"[""Trace Moisture""]",2018-02-01 00:05:00
2,1354,-99999.0,"[""Trace Moisture""]",2018-02-01 00:10:00
3,1354,-99999.0,"[""Trace Moisture""]",2018-02-01 00:15:00
4,1354,-99999.0,"[""Trace Moisture""]",2018-02-01 00:20:00


In [11]:
csv_buffer=StringIO()
SurfDat.to_csv(csv_buffer,sep=",",index=False)
s3=boto3.resource('s3')
s3.Object('prod-sdc-tti-911061262852-us-east-1-bucket','abibeka/Data/Feb2018HiveRWIS_SurfDat.csv').put(Body=csv_buffer.getvalue())

{u'ETag': '"801ebead160696a3af03e18137756f5b"',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
   'date': 'Fri, 12 Apr 2019 19:21:57 GMT',
   'etag': '"801ebead160696a3af03e18137756f5b"',
   'server': 'AmazonS3',
   'x-amz-id-2': 'P86z7PYdAOrIvISFgbTcBftWTaRk7Ax2PPJhFY8nsJ0xb9gMuGBJfbFyIDg22QuaqeGwnSDoPbw=',
   'x-amz-request-id': 'ED3B878D45C2A8A7'},
  'HTTPStatusCode': 200,
  'HostId': 'P86z7PYdAOrIvISFgbTcBftWTaRk7Ax2PPJhFY8nsJ0xb9gMuGBJfbFyIDg22QuaqeGwnSDoPbw=',
  'RequestId': 'ED3B878D45C2A8A7',
  'RetryAttempts': 0}}