# ----------------------------------------------------------------------------------------------------------
# AWS DATA INGESTION
# ----------------------------------------------------------------------------------------------------------

___

<a> <img src='img\architecture_v2.png' width="1000" /></a>
___

### Import libraries
Make sure you install each of module / library 

* pip install 'module name' on your command prompt

In [1]:
import boto3
import io 
import pandas as pd
import json
import time
import datetime
import numpy as np
import s3fs
import awswrangler
from datetime import datetime
from datetime import timedelta


## 1. Excel / Flat Files Data Sources

### Below is the architecture plan:

___

<a> <img src='img\xl_ingest_pic_v2.png' width="1000" /></a>
___

In [47]:
#define bucket address where your data is placed to be read
bucket_name = 'mst-lab-data'
object_key = 'input/user_007/rawtest2.xlsx'
# s3 = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret) #not secured way consider using AWS CLI
s3 = boto3.client('s3') #to call library which enable we talk to aws environment (S3)
obj = s3.get_object(Bucket=bucket_name, Key=object_key) #
data = obj['Body'].read()
df = pd.read_excel(io.BytesIO(data), encoding='utf-8')
df.head()

Unnamed: 0,date,site,prod
0,2019-01-01,x,200
1,2019-01-02,x,100


### How to generate timestamp for current event
It'll be usefull for us if we want to get the current time's event

In [48]:
dateTimeObj = datetime.now()
print(dateTimeObj)

2019-08-28 09:32:38.883170


### Change it into string so we can use it as the name of data if we want

In [49]:
yr= str(dateTimeObj.year)
mo= str(dateTimeObj.month)
day= str(dateTimeObj.day)
hr= str(dateTimeObj.hour)
mn= str(dateTimeObj.minute)
sc= str(dateTimeObj.second)
up_filename=yr+mo+day+hr+mn+sc+'.csv'
up_filename

'201982893238.csv'

### Add the uploadername and the timestamp they upload

In [50]:
df['timestamp']=dateTimeObj
df['useropluad']=object_key
df

Unnamed: 0,date,site,prod,timestamp,useropluad
0,2019-01-01,x,200,2019-08-28 09:32:38.883170,input/user_007/rawtest2.xlsx
1,2019-01-02,x,100,2019-08-28 09:32:38.883170,input/user_007/rawtest2.xlsx


### Upload the dataframe into S3 bucket using boto3

In [51]:
# define target bucket to load the data
target_bucket='mst-lab-data'
target_object='output/'+up_filename
csv_buffer = io.StringIO()
df.to_csv(csv_buffer, index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(target_bucket,target_object ).put(Body=csv_buffer.getvalue());

### Upload the dataframe into S3 bucket using s3fs library

In [17]:
bytes_to_write = df.to_csv(None,index=False ).encode()
fs = s3fs.S3FileSystem()
with fs.open('s3://blueprint-group-rawdata/subgroup_1-rawdata/robert_raw1_csv/'+up_filename, 'wb') as f:
    f.write(bytes_to_write)

## Check blank columns and blank rows

Re-read the data

In [52]:
#define bucket address where your data is placed to be read
bucket_name = 'mst-lab-data'
object_key = 'input/user_007/rawtest3.xlsx'
# s3 = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret) #not secured way consider using AWS CLI
s3 = boto3.client('s3') #to call library which enable we talk to aws environment (S3)
obj = s3.get_object(Bucket=bucket_name, Key=object_key) #
data = obj['Body'].read()
datarawcheck = pd.read_excel(io.BytesIO(data), encoding='utf-8')
datarawcheck.head()

Unnamed: 0,date,site,Unnamed: 2,prod
0,2019-01-01,x,,200.0
1,2019-01-02,x,,100.0
2,NaT,,,
3,2019-01-03,x,,400.0


Blank column checking

In [88]:
datarawcheck.columns = range(datarawcheck.shape[1])
cols_skip=[]
# pd.isna(datarawcheck[2])
for i in range(len(datarawcheck.columns)):
        if (((pd.isna(datarawcheck[i])).nunique()) == 1 ) & ((((pd.isna(datarawcheck[i])).unique())[0])==True):
            cols_skip.append(i)
        else:
            break
cols_skip


[]

Get non blank columns

In [89]:
cols = [i for i in range(len(datarawcheck.columns)) if i not in cols_skip]
cols

[0, 1, 2, 3]

Blank row checking

In [90]:
rows_skip=[]
for i in range(len(datarawcheck.iloc[i])):
    if (((pd.isna(datarawcheck.iloc[i])).nunique()) == 1 ) & ((((pd.isna(datarawcheck.iloc[i])).unique())[0])==True):
        rows_skip.append(i)
    else:
        break
rows_skip

[]

In [91]:
rows=len(rows_skip)
rows

0

### add date and time features using strftime ( http://strftime.org/ )

In [93]:
df['Year']= pd.to_datetime(df['date']).dt.strftime("%Y")
df['Month']= pd.to_datetime(df['date']).dt.strftime("%m")
# df['Month']=pd.to_numeric(df['Month'])
df['Day']= pd.to_datetime(df['date']).dt.strftime("%d")

# df['Day']=pd.to_numeric(df['Day'])
df.head()

Unnamed: 0,date,site,prod,timestamp,useropluad,Year,Month,Day
0,2019-01-01,x,200,2019-08-28 09:32:38.883170,input/user_007/rawtest2.xlsx,2019,1,1
1,2019-01-02,x,100,2019-08-28 09:32:38.883170,input/user_007/rawtest2.xlsx,2019,1,2


## 2. Sharepoint List Data Sources

___

<a> <img src='img\sharepoint_ingest_pic_v2.png' width="1000" /></a>
___

## 3. SQL DB Data Sources

___

<a> <img src='img\sqldb_ingest_pic_v2.png' width="1000" /></a>
___