# ----------------------------------------------------------------------------------------------------------
# AWS DATA INGESTION - Lambda Perspective
# ----------------------------------------------------------------------------------------------------------

### Import libraries
Make sure you install each module / library imported below.

* Run `pip install <module name>` in your command prompt

In [3]:
import boto3
import io 
import pandas as pd
import json
import time
import datetime
import numpy as np
import s3fs
import awswrangler
from datetime import datetime
from datetime import timedelta


___

<a> <img src='img\xl_ingest_pic_v2.png' width="1000" /></a>
___

### Event JSON Structure
https://docs.aws.amazon.com/en_us/AmazonS3/latest/dev/notification-content-structure.html

## Lambda Handler Function

In [1]:
import json
from datetime import datetime
from rawprod_to_csv import readxls_rawprod_convertcsv

# import ptvsd

# ptvsd.enable_attach(address=('0.0.0.0',5890),redirect_output=True)
# ptvsd.wait_for_attach()

def lambda_handler(event, context):
    """Entry point invoked by the S3 upload notification.

    Reads the first record of the S3 event, and when the uploaded object is
    the expected workbook, hands it to ``readxls_rawprod_convertcsv``.

    Args:
        event: S3 event notification dict (``event["Records"][0]["s3"]`` is
            read). A falsy event is ignored.
        context: Lambda context object; not used.

    Returns:
        ``None`` when the event is empty, the key does not match, or the
        conversion call returns normally. When the conversion raises, a dict
        with ``statusCode`` and a JSON ``body`` holding the error text.
    """
    # Imported here because this module's header only imports `datetime`
    # from the datetime package; without this, `timedelta` is undefined.
    from datetime import timedelta
    from urllib.parse import unquote_plus

    if not event:
        return None

    # Timestamp of when the Lambda picked up the upload event.
    # NOTE(review): the +1 hour offset is kept from the original; its purpose
    # is not visible here — confirm.
    dateTimeObj = datetime.now() + timedelta(hours=1)

    # Only the first record of the notification is processed.
    file_obj = event["Records"][0]

    # Object keys arrive URL-encoded in S3 notifications: '+' stands for a
    # space and other special characters are %XX escapes. Decode both.
    fileName = unquote_plus(str(file_obj['s3']['object']['key']))
    bucket = str(file_obj['s3']['bucket']['name']).replace("+", " ")

    try:
        if fileName == 'rsw/uploadhere-user1/rawtest1.xlsx':
            print(fileName)
            readxls_rawprod_convertcsv(bucket, fileName, dateTimeObj)
            print("go to code: readxls_rawprod_convertcsv is passed")
        else:
            print("the uploaded format file is not sufficient")
    except Exception as e:
        # Top-level boundary: log the failure to CloudWatch and report it
        # back instead of letting the invocation crash.
        print(str(e))
        return {
            'statusCode': 200,
            'body': json.dumps(str(e))}

    return None


## Lambda Main Function

In [None]:
from io import StringIO
import pandas as pd
import time
import datetime
import numpy as np
import boto3
import io
import requests
import os
# import s3fs


def readxls_rawprod_convertcsv(bucket, fileName, dateTimeObj):
    """Convert the uploaded Excel workbook to CSV and start the Glue workflow.

    Steps:
      1. Download ``fileName`` from ``bucket`` into memory.
      2. Read sheet ``'input'`` without a header to locate fully blank
         columns and rows.
      3. Re-read the sheet skipping those rows/columns, drop rows with no
         ``site`` value and parse ``date`` as datetimes.
      4. Tag every row with the upload timestamp and the source key, write
         the frame as CSV under ``output/`` in the target bucket, and start
         the Glue workflow.

    Args:
        bucket: Name of the S3 bucket holding the uploaded workbook.
        fileName: Object key of the uploaded workbook.
        dateTimeObj: ``datetime`` of the upload event; used for the
            ``timesupload`` column and for the output file name.

    Returns:
        None. Progress and failures are printed (visible in CloudWatch).
        Nothing is uploaded when the workbook cannot be opened or when it
        holds no usable rows.
    """
    # Download the workbook into an in-memory buffer.
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=fileName)
    file_obj = io.BytesIO(obj['Body'].read())

    # First pass: raw read (no header) just to inspect the layout.
    try:
        datarawcheck = pd.read_excel(file_obj, sheet_name='input', header=None)
        print("bucket:" + bucket + " filename:" + fileName)
        print("Excel Openned")
    except Exception as e:
        print("Can't open the file. Please check the S3 bucket")
        print(str(e))
        return

    # Locate fully blank columns and fully blank rows by position.
    # The row scan covers every row of the sheet (the previous loop reused
    # the column loop's index and only looked at the first few rows).
    blank_cells = datarawcheck.isna()
    cols_skip = set(_blank_positions(blank_cells.all(axis=0)))
    rows_skip = _blank_positions(blank_cells.all(axis=1))
    cols = [i for i in range(len(datarawcheck.columns)) if i not in cols_skip]

    del datarawcheck, blank_cells

    if not cols:
        # Sheet is empty or entirely blank: nothing to convert.
        print("No new data inserted")
        return

    # Second pass: rewind the buffer and read only the populated area; the
    # first non-skipped row becomes the header.
    file_obj.seek(0)
    newdata = pd.read_excel(file_obj,
                            sheet_name='input',
                            skiprows=rows_skip,
                            usecols=cols)

    # Drop rows without site information and renumber the index.
    newdata = newdata.dropna(subset=['site']).reset_index(drop=True)
    newdata['date'] = pd.to_datetime(newdata['date'])
    print(newdata.head())  # shows up in the CloudWatch log

    if newdata.empty:
        print("No new data inserted")
        return

    # Flag every row with the upload time and the source object key.
    upload_time = str(dateTimeObj)
    print(upload_time)
    print(fileName)
    newdata['timesupload'] = upload_time
    newdata['userupload'] = fileName

    target_bucket = 'mst-data-lab'
    target_object = 'output/' + _upload_filename(dateTimeObj)

    # Serialise to CSV in memory and put it into the target bucket.
    csv_buffer = StringIO()
    newdata.to_csv(csv_buffer, index=False)
    s3_resource = boto3.resource('s3')
    s3_resource.Object(target_bucket, target_object).put(Body=csv_buffer.getvalue())

    # Once the CSV is in place, start the Glue workflow.
    client = boto3.client('glue')
    client.start_workflow_run(Name='workflowname')  # workflowname
    print('Lambda function is DONE')

    return


def _blank_positions(blank_flags):
    """Return the integer positions of the entries in *blank_flags* that are true."""
    return [pos for pos, is_blank in enumerate(blank_flags) if is_blank]


def _upload_filename(dateTimeObj):
    """Build the ``<YYYYMMDDHHMMSS>.csv`` output name from *dateTimeObj* + 7 hours.

    The shift is applied with ``timedelta`` so the date rolls over correctly
    past midnight, and every field is zero-padded so names are unambiguous.
    NOTE(review): the +7 hours presumably converts to a UTC+7 local time —
    confirm.
    """
    from datetime import timedelta

    return (dateTimeObj + timedelta(hours=7)).strftime('%Y%m%d%H%M%S') + '.csv'


# IMPORTANT

###  We can't simply paste the code into Lambda: the required libraries must be downloaded by ourselves in a Linux environment and collected into one ZIP file together with the code (.py) files. After that, we store the ZIP file in an S3 bucket.