# Load your AWS Credentials.
You don't need to do this if an instance role is attached to the machines running the Bodo cluster nodes.
In that case, simply add a policy that allows the attached instance role to access the S3 bucket containing your data.

In [1]:
%%px
import os
# AWS settings for the engines; the two key values are intentionally left
# blank here and must be filled in when no instance role provides access.
os.environ.update(
    {
        "AWS_DEFAULT_REGION": "us-east-2",
        "AWS_ACCESS_KEY_ID": "",
        "AWS_SECRET_ACCESS_KEY": "",
    }
)

Starting 192 engines with <class 'ipyparallel.cluster.launcher.MPIEngineSetLauncher'>


  0%|          | 0/192 [00:00<?, ?engine/s]

# Load our timeseries sensor data
We will load the timeseries sensor data from s3. The data is in the following format.

|Note| ts| pt | pt_value |
|---| --- | --- | --- |
|Explanation| Timestamp when the value was measured | Tag for which the value was measured | Measured value |
|Type| timestamp/date | string | float |
|Sample| 2021-10-22 10:00:00 | HYZARDOP | .517 |

The data is stored in S3 in Parquet format, partitioned by the year, month and day columns.

As part of the load, we will keep only the rows for year=2021 and then drop the day, month and year columns, leaving just `ts`, `pt` and `pt_value`.

In [3]:
%%px

import bodo
import pandas as pd
import time
import datetime

# S3 prefix of the partitioned parquet sensor dataset.
input_path = 's3://bodo-customer-poc-data/koch/sync-rg12m/'

@bodo.jit(cache=True)
def load_data(path):
    """Read the sensor dataset and keep only the 2021 measurements.

    Parameters
    ----------
    path : str
        Location of the parquet dataset to read.

    Returns
    -------
    DataFrame
        Rows whose ``year`` equals 2021, restricted to the columns
        ``ts``, ``pt`` and ``pt_value``.
    """
    # NOTE(review): _bodo_read_as_dict requests dictionary encoding for all
    # three columns, but the notebook describes ts as a timestamp and
    # pt_value as a float -- confirm whether only 'pt' should be listed.
    df = pd.read_parquet(path, _bodo_read_as_dict=['ts', 'pt', 'pt_value'])
    # Row filter on the year column, applied before the column selection
    # below drops that column.
    df = df[df['year'] == 2021]
    df = df[['ts', 'pt', 'pt_value']]
    return df

raw = load_data(input_path)



%px:   0%|          | 0/192 [00:00<?, ?tasks/s]

# Let's quickly inspect the raw data below

In [4]:
%%px --targets 0
# Bare expression: displays `raw` as held by the engine selected with --targets 0.
raw

Unnamed: 0,ts,pt,pt_value
0,2021-10-01 00:00:00,41PJSO2KRV,-2.2333
1,2021-10-01 00:00:00,6SK1WJ6936,-2.2333
2,2021-10-01 00:00:00,L06YQDPV68,-2.2333
3,2021-10-01 00:00:00,R5J2TAZO3Y,-2.2333
4,2021-10-01 00:00:00,AR5IL5GUI8,-2.2333
...,...,...,...
34499995,2021-10-01 11:29:00,BPGH6NQ7SK,1.3453
34499996,2021-10-01 11:29:00,RZFH2BZS6B,1.3453
34499997,2021-10-01 11:29:00,5JFJEJHQZ1,1.3453
34499998,2021-10-01 11:29:00,9LGD7KT3SI,1.3453


# Bodo can handle a large number of columns.

Let's get a count of the unique 'pt' values in our dataset. You will see that there are 50000 unique 'pt' values.
This will result in an output dataframe with 50000 columns after a pivot.
This capability to easily handle such wide datasets is unique to Bodo.

In [5]:
%%px
@bodo.jit
def get_unique_tags(df):
    """Return the distinct values of the ``pt`` column, printing their count."""
    unique_tags = df["pt"].unique()
    print(len(unique_tags))
    return unique_tags


tags = get_unique_tags(raw)

%px:   0%|          | 0/192 [00:00<?, ?tasks/s]

[stdout:0] 50000


# A Quick look at the tags.

In [6]:
%%px --targets 0
# Bare expression: displays the `tags` array held by the engine selected with --targets 0.
tags

Out[0:6]:
array(['E2OSQXN6PM', '3S943O7ZDT', 'F0675HVTBI', ..., 'GFJDJ988FL',
       'IB6NI0L8X7', 'JLKKAJET8G'], dtype=object)

# Now let's do a quick pivot.
You will see that Bodo conforms to the pandas syntax for pivot, so it is very easy to implement.

In [7]:
%%px
@bodo.jit(cache=True)
def pivot_data(df):
    """Pivot the long-format readings into one column per ``pt`` tag.

    The result is indexed by ``ts``; each cell holds the minimum
    ``pt_value`` for that (ts, pt) pair.
    """
    return df.pivot_table(
        index="ts",
        columns="pt",
        values="pt_value",
        aggfunc="min",
    )


pivot_df = pivot_data(raw)

%px:   0%|          | 0/192 [00:00<?, ?tasks/s]

# Let's inspect our pivoted dataset.

Notice that each rank has exactly 50000 columns in its dataframe; Bodo ensures that the dataframe schema is consistent across ranks.

In [8]:
%%px --targets 0,1,2,3
# Show the leading rows of the pivoted frame on each engine selected with --targets.
pivot_df.head()

pt,APO45KXO61,TCNOFXHCWZ,HMF1LZAIDY,MCRG0TRLJC,EJ9KN9CIT0,NGF4UBP8D8,55F8GDTKUX,44XGLJ9H0P,UTI26AT530,RADU66HNOQ,...,P0JOCZW8JX,0VVRDNT7DO,WR2F9U0QOG,H8PYGRNMP4,WOLIWJP5S4,A7WDMJNQT7,S7Q76WEZ8Y,NKNX4ZVQ1D,RBH7PZQEY2,JLKKAJET8G
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-01 05:23:00,0.8533,0.8533,0.8533,0.8533,0.8533,0.8533,0.8533,0.8533,0.8533,0.8533,...,0.8533,0.8533,0.8533,0.8533,0.8533,0.8533,0.8533,0.8533,0.8533,0.8533
2021-10-01 09:11:00,-0.4069,-0.4069,-0.4069,-0.4069,-0.4069,-0.4069,-0.4069,-0.4069,-0.4069,-0.4069,...,-0.4069,-0.4069,-0.4069,-0.4069,-0.4069,-0.4069,-0.4069,-0.4069,-0.4069,-0.4069
2021-10-01 15:33:00,-1.3875,-1.3875,-1.3875,-1.3875,-1.3875,-1.3875,-1.3875,-1.3875,-1.3875,-1.3875,...,-1.3875,-1.3875,-1.3875,-1.3875,-1.3875,-1.3875,-1.3875,-1.3875,-1.3875,-1.3875
2021-10-01 15:52:00,-1.0309,-1.0309,-1.0309,-1.0309,-1.0309,-1.0309,-1.0309,-1.0309,-1.0309,-1.0309,...,-1.0309,-1.0309,-1.0309,-1.0309,-1.0309,-1.0309,-1.0309,-1.0309,-1.0309,-1.0309
2021-10-01 17:15:00,-0.9066,-0.9066,-0.9066,-0.9066,-0.9066,-0.9066,-0.9066,-0.9066,-0.9066,-0.9066,...,-0.9066,-0.9066,-0.9066,-0.9066,-0.9066,-0.9066,-0.9066,-0.9066,-0.9066,-0.9066


pt,APO45KXO61,TCNOFXHCWZ,HMF1LZAIDY,MCRG0TRLJC,EJ9KN9CIT0,NGF4UBP8D8,55F8GDTKUX,44XGLJ9H0P,UTI26AT530,RADU66HNOQ,...,P0JOCZW8JX,0VVRDNT7DO,WR2F9U0QOG,H8PYGRNMP4,WOLIWJP5S4,A7WDMJNQT7,S7Q76WEZ8Y,NKNX4ZVQ1D,RBH7PZQEY2,JLKKAJET8G
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-01 00:49:00,-1.2645,-1.2645,-1.2645,-1.2645,-1.2645,-1.2645,-1.2645,-1.2645,-1.2645,-1.2645,...,-1.2645,-1.2645,-1.2645,-1.2645,-1.2645,-1.2645,-1.2645,-1.2645,-1.2645,-1.2645
2021-10-01 03:15:00,1.3912,1.3912,1.3912,1.3912,1.3912,1.3912,1.3912,1.3912,1.3912,1.3912,...,1.3912,1.3912,1.3912,1.3912,1.3912,1.3912,1.3912,1.3912,1.3912,1.3912
2021-10-01 03:24:00,-0.4181,-0.4181,-0.4181,-0.4181,-0.4181,-0.4181,-0.4181,-0.4181,-0.4181,-0.4181,...,-0.4181,-0.4181,-0.4181,-0.4181,-0.4181,-0.4181,-0.4181,-0.4181,-0.4181,-0.4181
2021-10-01 04:30:00,-2.9319,-2.9319,-2.9319,-2.9319,-2.9319,-2.9319,-2.9319,-2.9319,-2.9319,-2.9319,...,-2.9319,-2.9319,-2.9319,-2.9319,-2.9319,-2.9319,-2.9319,-2.9319,-2.9319,-2.9319
2021-10-01 06:40:00,-1.1698,-1.1698,-1.1698,-1.1698,-1.1698,-1.1698,-1.1698,-1.1698,-1.1698,-1.1698,...,-1.1698,-1.1698,-1.1698,-1.1698,-1.1698,-1.1698,-1.1698,-1.1698,-1.1698,-1.1698


pt,APO45KXO61,TCNOFXHCWZ,HMF1LZAIDY,MCRG0TRLJC,EJ9KN9CIT0,NGF4UBP8D8,55F8GDTKUX,44XGLJ9H0P,UTI26AT530,RADU66HNOQ,...,P0JOCZW8JX,0VVRDNT7DO,WR2F9U0QOG,H8PYGRNMP4,WOLIWJP5S4,A7WDMJNQT7,S7Q76WEZ8Y,NKNX4ZVQ1D,RBH7PZQEY2,JLKKAJET8G
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-01 03:59:00,0.2597,0.2597,0.2597,0.2597,0.2597,0.2597,0.2597,0.2597,0.2597,0.2597,...,0.2597,0.2597,0.2597,0.2597,0.2597,0.2597,0.2597,0.2597,0.2597,0.2597
2021-10-01 07:48:00,-1.8462,-1.8462,-1.8462,-1.8462,-1.8462,-1.8462,-1.8462,-1.8462,-1.8462,-1.8462,...,-1.8462,-1.8462,-1.8462,-1.8462,-1.8462,-1.8462,-1.8462,-1.8462,-1.8462,-1.8462
2021-10-01 09:47:00,0.995,0.995,0.995,0.995,0.995,0.995,0.995,0.995,0.995,0.995,...,0.995,0.995,0.995,0.995,0.995,0.995,0.995,0.995,0.995,0.995
2021-10-01 17:55:00,2.9058,2.9058,2.9058,2.9058,2.9058,2.9058,2.9058,2.9058,2.9058,2.9058,...,2.9058,2.9058,2.9058,2.9058,2.9058,2.9058,2.9058,2.9058,2.9058,2.9058
2021-10-01 21:36:00,-0.2454,-0.2454,-0.2454,-0.2454,-0.2454,-0.2454,-0.2454,-0.2454,-0.2454,-0.2454,...,-0.2454,-0.2454,-0.2454,-0.2454,-0.2454,-0.2454,-0.2454,-0.2454,-0.2454,-0.2454


pt,APO45KXO61,TCNOFXHCWZ,HMF1LZAIDY,MCRG0TRLJC,EJ9KN9CIT0,NGF4UBP8D8,55F8GDTKUX,44XGLJ9H0P,UTI26AT530,RADU66HNOQ,...,P0JOCZW8JX,0VVRDNT7DO,WR2F9U0QOG,H8PYGRNMP4,WOLIWJP5S4,A7WDMJNQT7,S7Q76WEZ8Y,NKNX4ZVQ1D,RBH7PZQEY2,JLKKAJET8G
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-01 02:04:00,-0.9337,-0.9337,-0.9337,-0.9337,-0.9337,-0.9337,-0.9337,-0.9337,-0.9337,-0.9337,...,-0.9337,-0.9337,-0.9337,-0.9337,-0.9337,-0.9337,-0.9337,-0.9337,-0.9337,-0.9337
2021-10-01 06:35:00,-0.9026,-0.9026,-0.9026,-0.9026,-0.9026,-0.9026,-0.9026,-0.9026,-0.9026,-0.9026,...,-0.9026,-0.9026,-0.9026,-0.9026,-0.9026,-0.9026,-0.9026,-0.9026,-0.9026,-0.9026
2021-10-01 08:38:00,-1.6759,-1.6759,-1.6759,-1.6759,-1.6759,-1.6759,-1.6759,-1.6759,-1.6759,-1.6759,...,-1.6759,-1.6759,-1.6759,-1.6759,-1.6759,-1.6759,-1.6759,-1.6759,-1.6759,-1.6759
2021-10-01 09:54:00,-2.8747,-2.8747,-2.8747,-2.8747,-2.8747,-2.8747,-2.8747,-2.8747,-2.8747,-2.8747,...,-2.8747,-2.8747,-2.8747,-2.8747,-2.8747,-2.8747,-2.8747,-2.8747,-2.8747,-2.8747
2021-10-01 16:45:00,-0.697,-0.697,-0.697,-0.697,-0.697,-0.697,-0.697,-0.697,-0.697,-0.697,...,-0.697,-0.697,-0.697,-0.697,-0.697,-0.697,-0.697,-0.697,-0.697,-0.697


# Conclusion
Bodo makes it super easy to work with wide datasets and perform complex operations like pivot.

Now you have a pivoted dataset that can be used for other processes such as ML, anomaly detection, market signals (stock data), etc.