### AWS - Amazon Web Services

### About AWS CLI version 2

In [None]:
!aws --version

In [None]:
# Add credentials to CLI
aws configure --profile christos
aws configure --profile talend_dev
aws configure --profile talend_prd

In [None]:
# How to log into AWS SSO from the command line
# https://www.youtube.com/watch?v=YzNX_YZHPXk
aws configure sso

In [1]:
!aws configure list

In [4]:
!aws configure list-profiles
# default

default


In [None]:
import boto3
# get the Amazon Resource Name (ARN) of an IAM user
# !aws iam list-users --query 'Users[?UserName==`christos`].Arn' --output text
# Create an IAM client
iam_client = boto3.client('iam')

# Get the IAM user ARN
response = iam_client.get_user(UserName='christos')
user_arn = response['User']['Arn']
print(user_arn) # arn:aws:iam::111111:user/christos

In [None]:
# check current AWS CLI profile is using root or IAM credentials:
# query AWS ARN (Amazon Resource Name)
!aws sts get-caller-identity

In [2]:
# # List IAM Roles
# !aws iam list-roles

In [None]:
# give access to IAM user via IAM > Users > ...
# Policy name: IAM_access_sqs
# go to --> https://awspolicygen.s3.amazonaws.com/policygen.html
# Principal: *
# Give your permissions!
# paste arn of service you want to give access
# arn:aws:sqs:eu-central-1:1111:quality_sqs

In [None]:
 # original
{
  "Version": "2012-10-17",
  "Id": "__default_policy_ID",
  "Statement": [
    {
      "Sid": "owner_statement",
      "Effect": "Allow",
      "Principal": {
        "AWS": "arn:aws:iam::111111:root"
      },
      "Action": "SQS:*",
      "Resource": "arn:aws:sqs:eu-central-1:111111:quality_sqs"
    }
  ]
}

In [20]:
# Create an IAM client
# !aws iam list-attached-user-policies --user-name christos
iam_client = boto3.client('iam')

# List attached policies for the IAM user
response = iam_client.list_attached_user_policies(UserName='christos')

# Access the list of policies
policies = response['AttachedPolicies']
policies

[]

### S3 - SImple Storage Service

In [25]:
# !pip install boto3
import boto3, os

# Create a session with your credentials
session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name= aws_region # 'eu-central-1'
)

# Create an S3 client
# aws_region = 'eu-west-3' # eu-west-3: Paris, 'eu-central-1': Frankfurt
s3_client = boto3.client('s3', region_name=aws_region) 
location = {'LocationConstraint': aws_region}

In [26]:
# List all buckets
response = s3_client.list_buckets()

# Print the bucket names
if 'Buckets' in response:
    print('\033[1m' + "Total buckets: {}".format(len(response['Buckets'])) + '\033[0m') # make a string bold
    for bucket in response['Buckets']:
        print(bucket['Name'])

[1mTotal buckets: 7[0m
christos-folder1
christos-folder1-backup
christos-lambda-functions
christos-sqs-lambda-logs
dev-l0
dev-l1
dev-raw


In [None]:
# Create the S3 buckets
s3_client.create_bucket(Bucket='dev-raw', CreateBucketConfiguration=location)
s3_client.create_bucket(Bucket='dev-l0', CreateBucketConfiguration=location)
s3_client.create_bucket(Bucket='dev-l1', CreateBucketConfiguration=location)
print("Amazon S3 buckets has been created")  

# Define the folder structure (prefixes)
folders = ['CE10000', 'KNA1', 'KNVV', 'MARA', 'TVAK']

# Create the folders inside the "dev-raw" bucket
for folder in folders:
    key = f'SAP/{folder}/'
    s3_client.put_object(Bucket='dev-raw', Key=key)
    # print("Amazon S3 folder {} has been created inside 'dev-raw' Bucket".format(key))    

In [None]:
# create the S3 buckets and folders (prefixes) using the AWS CLI
# Create the S3 buckets:
aws s3api create-bucket --bucket dev-raw --region your-region
aws s3api create-bucket --bucket dev-l0 --region your-region
aws s3api create-bucket --bucket dev-l1 --region your-region

# Create the folder structure (prefixes) inside the "dev-raw" bucket:
aws s3api put-object --bucket dev-raw --key SAP/CE10000/
aws s3api put-object --bucket dev-raw --key SAP/KNA1/
aws s3api put-object --bucket dev-raw --key SAP/KNVV/
aws s3api put-object --bucket dev-raw --key SAP/MARA/
aws s3api put-object --bucket dev-raw --key SAP/TVAK/

In [27]:
# Generate a random time between 00:00:00 and 23:59:59
import random
from datetime import datetime, timedelta

def generate_random_time():
    # Define the start and end times
    start_time = datetime.strptime('00:00:00', '%H:%M:%S')
    end_time = datetime.strptime('23:59:59', '%H:%M:%S')

    # Calculate the time range in seconds
    time_range = (end_time - start_time).total_seconds()

    # Generate a random number of seconds within the time range
    random_seconds = random.randint(0, int(time_range))

    # Add the random number of seconds to the start time
    random_time = start_time + timedelta(seconds=random_seconds)

    return random_time.time()

# Generate a random time
random_time = generate_random_time()
print(random_time)

06:51:07


In [None]:
%time
# Create the folder/prefixes structure inside the "dev-raw/SAP/CE10000" bucket for each date & add a csv file.
import random, string
from datetime import datetime, timedelta
folders = ['CE10000', 'KNA1', 'KNVV', 'MARA', 'TVAK']
location = {'LocationConstraint': aws_region}
n_csv = 0

# Get the current date
current_date = datetime.now()

# Loop for the last 30 days
for i in range(30):
    # Calculate the date for each iteration
    date = current_date - timedelta(days=i)
    year = date.strftime("%Y")
    month = date.strftime("%m")
    day = date.strftime("%d")
    
    # Create the folder/prefixes structure inside the "dev-raw" bucket for each date
    for subfolder in folders:
        folder = f"SAP/{subfolder}/EDP_PART_YEAR={year}/EDP_PART_MONTH={month}/EDP_PART_DAY={day}"
        s3_client.put_object(Bucket='dev-raw', Key=folder + '/')
        
        # Generate a random number of CSV files between 1 and 3
        for j in range(random.randint(1, 3)):
            # Generate a random timestamp
            ts = f"{year}{month}{day}{generate_random_time()}".replace(":","")       
            timestamp_folder = f"{folder}/EDP_PART_TS={ts}"
            s3_client.put_object(Bucket='dev-raw', Key=timestamp_folder + '/')

            # Create a CSV file inside the 'EDP_PART_TS' folder
            csv_filename = f"EDP_MKT_{ts}.csv"
            csv_key = f"{timestamp_folder}/{csv_filename}"
            s3_client.put_object(Bucket='dev-raw', Key=csv_key)
            n_csv += 1
            # print(f"Created CSV file: {csv_key}") # csv_key, csv_filename
            print(csv_key) # csv_key, csv_filename   
print("Subfolders & {} CSV files created successfully!".format(n_csv))

In [33]:
# filenames of all CSV files inside the dev-raw/SAP/ bucket from a specific date
from datetime import datetime, timedelta
# Specify the bucket name and prefix
bucket_name = 'dev-raw'
prefix = 'SAP/'

# Specify and format the target date as 'YYYYMMDD' for comparison
target_date = datetime(2023, 6, 15).strftime('%Y%m%d')

# List objects in the bucket with the specified prefix
response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

# Filter the filenames based on the target date
filtered_files = [obj['Key'].rsplit("/",1)[1] for obj in response.get('Contents', []) if obj['Key'].endswith('.csv') and target_date in obj['Key']]
print(len(filtered_files))
filtered_files

13


['EDP_MKT_20230615132930.csv',
 'EDP_MKT_20230615160848.csv',
 'EDP_MKT_20230615234626.csv',
 'EDP_MKT_20230615093743.csv',
 'EDP_MKT_20230615233535.csv',
 'EDP_MKT_20230615093538.csv',
 'EDP_MKT_20230615231329.csv',
 'EDP_MKT_20230615035259.csv',
 'EDP_MKT_20230615053649.csv',
 'EDP_MKT_20230615173503.csv',
 'EDP_MKT_20230615005254.csv',
 'EDP_MKT_20230615110712.csv',
 'EDP_MKT_20230615201722.csv']

In [None]:
# filenames of all CSV files inside the dev-raw/SAP/ bucket from the previous month or a specific date
# Specify the bucket name and prefix
bucket_name = 'dev-raw'
prefix = 'SAP/'

# Get the current date
current_date = datetime.now()

# Calculate the previous month or specify a specific date
previous_month = current_date - timedelta(days=current_date.day)
specific_date = datetime(2023, 5, 15)  # Example: May 15, 2023

# Define the target date
target_date = specific_date # Use previous_month or specific_date

# Format the target date as 'YYYYMM' for comparison
target_date_formatted = target_date.strftime('%Y%m')

# List objects in the bucket with the specified prefix
response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

# Get the filenames of all CSV files in the directory
csv_files = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith('.csv')]

# Filter the filenames based on the target date
filtered_files = [file for file in csv_files if target_date_formatted in file] # whole folder path
# filtered_files = [file.rsplit("/",1)[1] for file in csv_files if target_date_formatted in file] # just the csv file
filtered_files

In [3]:
# Create backups: cp: copy, mv: move & sync: synchronize
# aws s3 cp s3://source-bucket/source-folder/ s3://destination-bucket/destination-folder/timestamp/ --recursive
!aws s3 cp s3://dev-raw/SAP/CE10000/ s3://christos-lambda-functions/Backup/23230706/SAP/CE10000/ --recursive

In [None]:
%%time
# Create backups: cp: copy, mv: move & sync: synchronize
import boto3
from datetime import datetime

# Generate timestamp
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

# Create S3 client
s3_client = boto3.client('s3')

# Copy files to destination folder
source_bucket = 'dev-raw'
source_folder = 'SAP/CE10000'
destination_bucket = 'christos-lambda-functions'
destination_folder = f'Backup/{timestamp}/SAP/CE10000/'

response = s3_client.copy(
    {
        'Bucket': source_bucket,
        'Key': source_folder
    },
    destination_bucket,
    destination_folder,
    ExtraArgs={'ACL': 'bucket-owner-full-control'},
)

# Print the response
print(response)

In [47]:
%%time
# Create backups: cp: copy, mv: move & sync: synchronize
import boto3
import datetime

# Specify the source and destination bucket names and folder paths
source_bucket = 'dev-raw'
source_folder = 'SAP/CE10000/'
destination_bucket = 'christos-lambda-functions'
destination_folder = f'Backup/{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}/SAP/CE10000/'

# Create an S3 client
s3_client = boto3.client('s3')

# Copy the objects from the source folder to the destination folder
response = s3_client.list_objects_v2(Bucket=source_bucket, Prefix=source_folder)

if 'Contents' in response:
    for obj in response['Contents']:
        key = obj['Key']
        destination_key = key.replace(source_folder, destination_folder)
        s3_client.copy_object(Bucket=destination_bucket, Key=destination_key, CopySource={'Bucket': source_bucket, 'Key': key})
else:
    print("No objects found in the source folder.")

print("Copy operation completed.")

Copy operation completed.
CPU times: total: 156 ms
Wall time: 23.3 s


In [None]:
def create_s3_bucket(bucket_name):
    s3_client = boto3.client('s3')
    try:
        s3_client.create_bucket(Bucket=bucket_name)
        print(f"Bucket '{bucket_name}' created successfully.")
    except s3_client.exceptions.BucketAlreadyOwnedByYou:
        print(f"Bucket '{bucket_name}' already exists.")

### SQS - Simple Queue Service

In [None]:
# Retrieve all Amazon SQS queues and their attributes
# Create an SQS client
sqs_client = boto3.client('sqs', region_name='eu-central-1') # Frankfurt

# List all queues
response = sqs_client.list_queues()

# Check if the 'QueueUrls' key exists in the response
if 'QueueUrls' in response:
    # Get the URLs of all queues
    queue_urls = response['QueueUrls']
    
    # Retrieve attributes for each queue
    for queue_url in queue_urls:
        response = sqs_client.get_queue_attributes(QueueUrl=queue_url, AttributeNames=['All'])
        attributes = response['Attributes']
        print(f"Queue URL: {queue_url}")
        # print(f"Attributes: {attributes}\n")
else:
    print("No queues found.")

In [5]:
# Retrieve all Amazon SQS queues
#!aws sqs list-queues --region eu-central-1

In [5]:
# Create a new sqs event using cli ! I use Paris region as default for CLI --prifile
!aws sqs send-message --queue-url https://sqs.eu-central-1.amazonaws.com/1111111/quality_sqs --message-body EDP_MAT_20230707094800.csv --region eu-central-1

{
    "MD5OfMessageBody": "f0e606842daa91e42994fd5be9a27e90",
    "MessageId": "08de730a-f9f8-4f5d-8fb0-df06bc8cfcdc"
}


In [None]:
# Create a New SQS event
### Existed Queue ###
# Create an SQS client
region_name = 'eu-central-1' # 'eu-west-3' 'eu-central-1' # Specify the region
sqs_client = boto3.client('sqs', region_name=region_name)

# Get the URL of the existing queue
queue_name = 'quality_sqs'
response = sqs_client.get_queue_url(QueueName=queue_name)
queue_url = response['QueueUrl']

# import json
# csv = {"csv":"EDP_MAT_20230629125300.csv"}
# message_body = json.dumps({'csv': 'EDP_MAT_20230629135600.csv'})
message_body = 'EDP_MAT_20230706214800.csv'

response = sqs_client.send_message(QueueUrl=queue_url, MessageBody=message_body)
print("Message sent. Response:", response)

In [None]:
### Existed Queue ###
# Create an SQS client
region_name = 'eu-central-1' # 'eu-west-3' 'eu-central-1' # Specify the region
sqs_client = boto3.client('sqs', region_name=region_name)

# Get the URL of the existing queue
queue_name = 'quality_sqs'
response = sqs_client.get_queue_url(QueueName=queue_name)
queue_url = response['QueueUrl']

# Get the attributes of the existing queue
response = sqs_client.get_queue_attributes(QueueUrl=queue_url, AttributeNames=['All'])
attributes = response['Attributes']
attributes

In [None]:
### New SQS dl Queue ###
# Create a new queue with the same settings
response = sqs_client.create_queue(QueueName="data_quality_dl_sqs")
# Get the URL of the new queue
new_queue_url = response['QueueUrl']

# Set configurable attributes for the new queue
sqs_client.set_queue_attributes(
    QueueUrl=new_queue_url,
    Attributes={
        'DelaySeconds': '0',
        #'MaximumMessageSize': '262144',
        'VisibilityTimeout': '30',
        # Add other configurable attributes as needed
    }
)

print(f"New queue URL: {new_queue_url}")

In [None]:
### New SQS Queue ###
# Create a new queue with the same settings
response = sqs_client.create_queue(QueueName="data_quality_sqs")
# Get the URL of the new queue
new_queue_url = response['QueueUrl']

# Set configurable attributes for the new queue
sqs_client.set_queue_attributes(
    QueueUrl=new_queue_url,
    Attributes={
        'DelaySeconds': '0',
        #'MaximumMessageSize': '262144',
        'VisibilityTimeout': '30',
        'RedrivePolicy': '{"deadLetterTargetArn":"arn:aws:sqs:eu-central-1:111111:data_quality_dl_sqs","maxReceiveCount":3}'
        # Add other configurable attributes as needed
    }
)

print(f"New queue URL: {new_queue_url}")

In [None]:
# Get the URL of the existing queue
queue_name = 'data_quality_sqs'
response = sqs_client.get_queue_url(QueueName=queue_name)
queue_url = response['QueueUrl']

# Get the attributes of the existing queue
response = sqs_client.get_queue_attributes(QueueUrl=queue_url, AttributeNames=['All'])
attributes = response['Attributes']
attributes

In [12]:
# To view the details of an SQS message based on its MessageId
!aws sqs receive-message --queue-url https://sqs.eu-central-1.amazonaws.com/491362005797/quality_sqs --message-attribute-names All --attribute-names All --max-number-of-messages 1 --wait-time-seconds 0 --query "Messages[?MessageId=='7ff88a01-57ad-40c9-9096-c164cb897842']" --region eu-central-1

null


In [56]:
# To view the details of an SQS queue
!aws sqs receive-message --queue-url https://sqs.eu-central-1.amazonaws.com/491362005797/quality_sqs --max-number-of-messages 5 --region eu-central-1

In [60]:
# Specify the MessageId of the message you want to retrieve
message_id = "7455a244-6ab6-4ab1-a4f9-8cd9a934af6c"

# Retrieve the message using its MessageId
response = sqs_client.receive_message(
    QueueUrl=queue_url,
    AttributeNames=['All'],
    MessageAttributeNames=['All'],
    MaxNumberOfMessages=1,
    WaitTimeSeconds=0,
    ReceiveRequestAttemptId='string'
)

# Check if the response contains any messages
if 'Messages' in response:
    messages = response['Messages']
    for message in messages:
        # Check if the MessageId matches the desired one
        if message['MessageId'] == message_id:
            # Print the message details
            print("MessageId:", message['MessageId'])
            print("Body:", message['Body'])
            # Other message attributes and metadata can be accessed in a similar way
else:
    print("No messages found")

No messages found


### lambda_handler function

In [63]:
# Event: "EDP_MAT_20230706214800.csv"
# Start Time: 2023-07-06 18:47:41
# Execution Time: 0:00:00.000211 seconds
# Data quality check: 0

In [None]:
start_time = datetime.datetime.now().strftime("%Y-%m-%d %H_%M_%S") # time.time()
start_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S") # time.time()
start_time = datetime.datetime.now() # .replace(" ", "_").replace(":", "_")
start_time

In [None]:
import json
import boto3
import datetime
# from datetime import datetime
import pandas as pd
import numpy as np
import csv
import os
import io


def lambda_handler(event, context):
    # Process the SQS event
    for record in event['Records']:
        # Extract the message body from the event record
        message_body = record['body']
        
        # Process the message body
        process_message(message_body)
    
    # # Return a response
    # return {
    #     'statusCode': 200,
    #     'event': record,
    #     'body': 'SQS event processed successfully'
    # }

def process_message(message_body):
    # Process the message body
    print(f"Received message: {message_body}")
    # Add your logic here to handle the message
    event = message_body
    
    start_time = datetime.datetime.now()
    print("start_time: {}".format(start_time.strftime("%Y-%m-%d %H:%M:%S")))

    # Print the event
    print("event: {}".format(event))

    # Save the log file locally
    ts = start_time.strftime("%Y%m%d%H%M%S")
    object_key = f"{event}_{ts}.txt"
    log_file_name = f"/tmp/{object_key}"

    st = start_time.strftime("%Y-%m-%d %H:%M:%S")
    with open(log_file_name, 'w') as file:
        file.write(f"Event: {json.dumps(event)}\n")
        file.write(f"Start Time: {st}\n")

    # Perform your other logic here
    # extra code here!
    import random
    # Generate a random variable (0 or 1)
    random_var = random.randint(0, 1)
    
    # Check the value of the random variable and print the corresponding message
    if random_var == 0:
        print("Data quality check: identical", random_var)
    else:
        print("Data quality check: Not Identical", random_var)   
    
    # Calculate execution time
    execution_time = datetime.datetime.now() - start_time

    # # Append execution time to the log file
    with open(log_file_name, 'a') as file:
        file.write(f"Execution Time: {execution_time} seconds\n")
        file.write(f"Data quality check: {random_var}\n")
        
    # Upload the log file to S3
    s3_client = boto3.client('s3')
    bucket_name = 'christos-sqs-lambda-logs'
    s3_client.upload_file(log_file_name, bucket_name, object_key)
    
    return {
        'statusCode': 200,
        'body': json.dumps(event)
    }

In [None]:
import json

def lambda_handler(event, context):
    # Process the SQS event
    for record in event['Records']:
        # Extract the message body from the event record
        message_body = record['body']
        
        # Process the message body
        process_message(message_body)
    
    # Return a response
    return {
        'statusCode': 200,
        'body': 'SQS event processed successfully'
    }

def process_message(message_body):
    # Process the message body
    print(f"Received message: {message_body}")
    # Add your logic here to handle the message

In [None]:
# import json
# def lambda_handler(event, context):
#     # TODO implement
#     print(event)
#     # csv = event["csv"]
#     # print(csv)
#     import json
#     return {
#         'statusCode': 200,
#         'body': json.dumps(event)
#     }

import json
import boto3
import datetime
# import time
    
def lambda_handler(event, context):
    # Process the SQS event
    for record in event['Records']:
        # Extract the message body from the event record
        message_body = record['body']
        
        # Process the message body
        process_message(message_body)
    
    # # Return a response
    # return {
    #     'statusCode': 200,
    #     'event': record,
    #     'body': 'SQS event processed successfully'
    # }

def process_message(message_body):
    # Process the message body
    print(f"Received message: {message_body}")
    # Add your logic here to handle the message
    event = message_body
    
    start_time = datetime.datetime.now()
    print("start_time: {}".format(start_time.strftime("%Y-%m-%d %H:%M:%S")))

    # Print the event
    print("event: {}".format(event))

    # Save the log file locally
    ts = start_time.strftime("%Y%m%d%H%M%S")
    object_key = f"{event}_{ts}.txt"
    log_file_name = f"/tmp/{object_key}"

    st = start_time.strftime("%Y-%m-%d %H:%M:%S")
    with open(log_file_name, 'w') as file:
        file.write(f"Event: {json.dumps(event)}\n")
        file.write(f"Start Time: {st}\n")

    # Perform your other logic here
    # extra code here!
    import random
    # Generate a random variable (0 or 1)
    random_var = random.randint(0, 1)
    
    # Check the value of the random variable and print the corresponding message
    if random_var == 0:
        print("Data quality check: identical", random_var)
    else:
        print("Data quality check: Not Identical", random_var)   
    
    # Calculate execution time
    execution_time = datetime.datetime.now() - start_time

    # # Append execution time to the log file
    with open(log_file_name, 'a') as file:
        file.write(f"Execution Time: {execution_time} seconds\n")
        file.write(f"Data quality check: {random_var}\n")
        
    # Upload the log file to S3
    s3_client = boto3.client('s3')
    bucket_name = 'christos-sqs-lambda-logs'
    s3_client.upload_file(log_file_name, bucket_name, object_key)
    
    return {
        'statusCode': 200,
        'body': json.dumps(event),
        "Not Identical:": random_var
    }

In [None]:
# give the Lambda function permission to write to the /tmp directory and upload files to an S3 bucket
import boto3
# Create an IAM client
iam_client = boto3.client('iam', region_name='eu-central-1') #   region_name=region_name 

# Create the IAM role
role_name = 'LambdaS3AccessRole_logs'
role_policy_document = {
    'Version': '2012-10-17',
    'Statement': [
        {
            'Effect': 'Allow',
            'Principal': {
                'Service': 'lambda.amazonaws.com'
            },
            'Action': 'sts:AssumeRole'
        }
    ]
}
response = iam_client.create_role(
    RoleName=role_name,
    AssumeRolePolicyDocument=json.dumps(role_policy_document)
)

# Attach the policies to the role
role_arn = response['Role']['Arn']
iam_client.attach_role_policy(
    RoleName=role_name,
    PolicyArn='arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole'
)
iam_client.attach_role_policy(
    RoleName=role_name,
    PolicyArn='arn:aws:iam::aws:policy/AmazonS3FullAccess'
)

# Update the Lambda function's execution role
lambda_client = boto3.client('lambda', region_name='eu-central-1')
function_name = 'DataQualityCheck'
lambda_client.update_function_configuration(
    FunctionName=function_name,
    Role=role_arn
)

In [None]:
import json

def lambda_handler(event, context):
    # Process each record in the event
    for record in event['Records']: # event['Records'][0]['body']
        # Retrieve the message body from the record
        message_body = record['body']
        
        # Deserialize the message body into a Python dictionary
        message_data = json.loads(message_body)
        
        # Access the CSV filename from the message data
        csv_filename = message_data['csv']
        
        # Perform any necessary processing with the CSV filename
        print(f"Received CSV filename: {csv_filename}")
        
    # Return a response, if needed
    return {
        'statusCode': 200,
        'body': 'Message processed successfully',
        'csv' : csv_filename
    }

In [None]:
# To create a new Lambda function
function_name = 'new-lambda-function'  # Replace with the desired name for the new function
runtime = 'python3.8'  # Replace with the desired runtime
handler = 'lambda_function.handler'  # Replace with the desired handler function
role_arn = 'arn:aws:iam::11111111111:role/lambda-role'  # Replace with the ARN of the IAM role for the function
timeout = 60  # Replace with the desired timeout in seconds
memory_size = 128  # Replace with the desired memory size in megabytes

response = lambda_client.create_function(
    FunctionName=function_name,
    Runtime=runtime,
    Role=role_arn,
    Handler=handler,
    Timeout=timeout,
    MemorySize=memory_size,
    Publish=True  # Set to True if you want the function to be published
)

# Optionally, you can print the newly created function's ARN
function_arn = response['FunctionArn']
print('New Lambda function ARN:', function_arn)

In [25]:
# # To reproduce a Lambda function from an existing one
# # Create a Boto3 client for Lambda
# lambda_client = boto3.client('lambda', region_name='eu-central-1')

# # Retrieve the existing Lambda function's configuration
# function_name = 'DataQualityCheck'  # actual name of the existing function
# response = lambda_client.get_function(FunctionName=function_name)
# function_configuration = response['Configuration']

# # Retrieve the existing Lambda function's code
# function_code = lambda_client.get_function(FunctionName=function_name)['Code']

# # Create a new Lambda function using the retrieved configuration
# new_function_name = 'new-lambda-function'  # Replace with the desired name for the new function
# response = lambda_client.create_function(
#     FunctionName=new_function_name,
#     Runtime=function_configuration['Runtime'],
#     Role=function_configuration['Role'],
#     Handler=function_configuration['Handler'],
#     Description=function_configuration['Description'],
#     Timeout=function_configuration['Timeout'],
#     MemorySize=function_configuration['MemorySize'],
#     Publish=True  # Set to True if you want the function to be published
# )

# # Set the code for the new Lambda function using the retrieved code
# lambda_client.update_function_code(
#     FunctionName=new_function_name,
#     ZipFile=function_code['Location']
# )

# print('Lambda function reproduced successfully.')

### CloudWatch

In [4]:
# !aws logs filter-log-events --log-group-name /aws/lambda/DataQualityCheck --region eu-central-1

In [None]:
import boto3
from datetime import datetime
# Create a CloudWatch Logs client
client = boto3.client('logs', region_name='eu-central-1')

# Specify the start time and end time in ISO 8601 format
start_time = datetime(2023, 7, 3, 0, 0, 0) #.isoformat()
end_time = datetime(2023, 7, 4, 23, 59, 59) #.isoformat()

# Retrieve log events for a specific log group
response = client.filter_log_events(
    logGroupName='/aws/lambda/DataQualityCheck',
    startTime=int(start_time.timestamp() * 1000),  # Convert to milliseconds
    endTime=int(end_time.timestamp() * 1000)  # Convert to milliseconds
)

# Process the log events
for event in response['events']:
    print(event['message'])
    if event['message'].startswith("REPORT RequestId"):
        print(50*"-","END",50*"-")

### Aurora

In [None]:
import boto3
# Create an RDS client
rds_client = boto3.client('rds', region_name='eu-central-1')

# Create an Aurora database cluster
rds_client.create_db_cluster(
    DBClusterIdentifier='my-db-cluster', # Production template
    Engine='aurora', # Aurora Standard, PostgreSQL 14.6
    # Specify other cluster configuration parameters
) # username: postgres, password: 12345678!
# KMS key ID   alias/aws/rds
# Tag key: devops-guru-default
# Tag value: my-db-cluster

# Cost per resource per hour
# $0.0042 Amazon DevOps Guru pricing 

In [1]:
# Retrieve the attributes of an existing Aurora cluster
import boto3

# Create an RDS client
rds_client = boto3.client('rds', region_name='eu-central-1')

# Specify the identifier of the cluster you want to describe
cluster_identifier = 'my-db-cluster'

# Retrieve the cluster attributes
response = rds_client.describe_db_clusters(DBClusterIdentifier=cluster_identifier)

# Extract the relevant attributes from the response
cluster = response['DBClusters'][0]

# Print the attributes
print("Cluster Identifier:", cluster['DBClusterIdentifier'])
print("Endpoint:", cluster['Endpoint'])
# ... print other attributes as needed

Cluster Identifier: my-db-cluster
Endpoint: my-db-cluster.cluster-c9i3cb16qopx.eu-central-1.rds.amazonaws.com


In [9]:
# Create an RDS client
rds_client = boto3.client('rds', region_name='eu-central-1')

# Retrieve the details of the Aurora cluster
response = client.describe_db_clusters(DBClusterIdentifier=cluster_identifier) # my-db-cluster'

# Extract the list of databases from the response
databases = response['DBClusters'][0]['DBClusterMembers'][0]['DBInstanceIdentifier']

print(databases) # Print the list of databases

my-db-cluster-instance-1


In [None]:
# Create a database within the cluster
rds_client.create_db_instance(
    DBInstanceIdentifier='my-db-instance',
    Engine='aurora-postgresql', # not just aurora
    DBClusterIdentifier='my-db-cluster',
    DBInstanceClass='db.r5.large'  # Example instance class, choose according to your needs
    # Specify other database configuration parameters
)

In [None]:
# Create a table within the database
aurora_client = boto3.client('rds-data', region_name='eu-central-1')
aurora_client.execute_statement(
    secretArn='my-db-secret-arn',
    database='qlt',
    sql='CREATE TABLE data_quality (csv VARCHAR(255) PRIMARY KEY, time_check TIMESTAMP, duration INTEGER, identical BOOLEAN)',
    # Specify other execution parameters
)

In [None]:
import boto3
# Create a Secrets Manager client
secrets_manager_client = boto3.client('secretsmanager')

# Specify the name or identifier of the secret
secret_name = 'my-db-secret'

# Retrieve the secret value
response = secrets_manager_client.describe_secret(SecretId=secret_name)

# Extract the ARN from the response
secret_arn = response['ARN']

# Print the secret ARN
print(f"Secret ARN: {secret_arn}")

In [None]:
import psycopg2

# Connect to the PostgreSQL database
conn = psycopg2.connect(
    host="your_host",
    port="your_port",
    database="your_database",
    user="your_user",
    password="your_password"
)

# Create a cursor to execute SQL queries
cursor = conn.cursor()

# Execute a SQL query
query = "SELECT * FROM your_table"
cursor.execute(query)

# Fetch the query results
results = cursor.fetchall()
for row in results:
    print(row)

# Close the cursor and connection
cursor.close()
conn.close()

In [None]:
# Write the Lambda function to insert data into the table
def lambda_handler(event, context):
    # Extract the necessary data from the event
    csv = event['csv']
    time_check = event['time_check']
    duration = event['duration']
    identical = event['identical']

    # Insert the data into the table
    aurora_client.execute_statement(
        secretArn='my-db-secret-arn',
        database='qlt',
        sql='INSERT INTO data_quality (csv, time_check, duration, identical) VALUES (:csv, :time_check, :duration, :identical)',
        parameters=[
            {'name': 'csv', 'value': {'stringValue': csv}},
            {'name': 'time_check', 'value': {'stringValue': str(time_check)}},
            {'name': 'duration', 'value': {'longValue': duration}},
            {'name': 'identical', 'value': {'booleanValue': identical}},
        ],
        ...
        # Specify other execution parameters
    )

### Extra

In [None]:
import boto3

# Initialize the S3 client
s3_client = boto3.client('s3')

# Define the bucket name and file key
bucket_name = 'your_bucket_name'
file_key = 'path/to/your/file.csv'

# Example 1: Download a file from S3

# Specify the local file path to save the downloaded file
local_file_path = 'path/to/save/local_file.csv'

# Download the file from S3
s3_client.download_file(bucket_name, file_key, local_file_path)
print('File downloaded successfully.')

# Example 2: Upload a file to S3

# Specify the local file path of the file to upload
local_file_path = 'path/to/your/local_file.csv'

# Specify the S3 key for the file
s3_file_key = 'path/to/uploaded/file.csv'

# Upload the file to S3
s3_client.upload_file(local_file_path, bucket_name, s3_file_key)
print('File uploaded successfully.')

# Example 3: List objects in an S3 bucket

# List objects in the bucket
response = s3_client.list_objects(Bucket=bucket_name)

# Iterate over the objects and print their names
if 'Contents' in response:
    for obj in response['Contents']:
        print(obj['Key'])

# Example 4: Read a file directly from S3 into a pandas DataFrame

import pandas as pd

# Specify the S3 object key of the file to read
s3_object_key = 'path/to/your/file.csv'

# Read the file directly from S3 into a DataFrame
df = pd.read_csv(f's3://{bucket_name}/{s3_object_key}')
print(df.head())

# Example 5: Write a pandas DataFrame to a file in S3

# Create a sample DataFrame
data = {'Column1': [1, 2, 3], 'Column2': ['A', 'B', 'C']}
df = pd.DataFrame(data)

# Specify the S3 object key for the file
s3_object_key = 'path/to/write/file.csv'

# Write the DataFrame to a file and upload it to S3
df.to_csv(f's3://{bucket_name}/{s3_object_key}', index=False)
print('DataFrame written and uploaded successfully.')

In [None]:
import pandas as pd
from datetime import datetime
file1_path = 'file1.csv'
file2_path = 'file2.csv'

# Load the CSV file with tab delimiter
df = pd.read_csv(file2_path, delimiter='\t')

# Display the DataFrame
print(df)

In [None]:
# Assuming common_cols is a list of column names
common_cols = ['Col1', 'Col2', 'Col3']

In [None]:
# Read the CSV file into a DataFrame with specified data types
df = pd.read_csv('output.csv', dtype={'column1': int, 'column2': float, 'column3': str})

# Print the DataFrame
print(df)

In [None]:
import pandas as pd

# Create a sample DataFrame
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})

# Display the original DataFrame
print("Original DataFrame:")
print(df)

# Change column names
new_columns = ['X', 'Y', 'Z']
df.columns = new_columns

# Display the DataFrame with updated column names
print("\nDataFrame with Updated Column Names:")
print(df)

# Assuming df is the DataFrame and file_path is the path to the output Excel file
# Export DataFrame to Excel
df.to_excel(file_path, index=False)

In [None]:
# SOS!
# def lambda_handler(event, context):

In [None]:
# Create an S3 resource
s3_resource = session.resource('s3')

# Specify the bucket name
bucket_name = 'your-bucket-name'

# List objects in the bucket
bucket = s3_resource.Bucket(bucket_name)
for obj in bucket.objects.all():
    print(obj.key)

In [None]:
# Check if the object exists in the bucket
response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=file_path)
if 'Contents' in response:
    print("File exists.")
else:
    print("File does not exist.")

In [2]:
# ['kunnr', 'vkorg', 'vtweg', 'spart']

In [None]:
# Specify the bucket name
bucket_name = 'my-bucket'

# List all objects in the bucket
response = s3_client.list_objects_v2(Bucket=bucket_name, Delimiter='/')

# Print the "folder" names (prefixes)
if 'CommonPrefixes' in response:
    for prefix in response['CommonPrefixes']:
        print(prefix['Prefix'])

In [None]:
import csv

# Data to be written to the CSV file
data = [
    ['John', 'Doe', 30],
    ['Jane', 'Smith', 25],
    ['Michael', 'Johnson', 35]
]

# Specify the file path and name for the CSV file
output_file = 'output.csv'

# Create the CSV file and write the data
with open(output_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(data)

print(f"CSV file '{output_file}' has been created.")

In [None]:
import datetime

# Get the current timestamp
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H_%M_%S")

# Replace colons with underscores in the timestamp
filename = timestamp.replace(":", "_") + ".txt"

# Create and write to the file
with open(filename, 'w') as file:
    file.write("This is a test file.")

print(f"File '{filename}' has been created.")

In [4]:
# aws lambda create-function --function-name my-function --runtime python3.9 --handler my_script.lambda_handler --role arn:aws:iam::1234567890:role/my-role --code S3Bucket=my-bucket,S3Key=my-script.zip
# aws lambda invoke --function-name my-function --payload '{}' output.txt

In [5]:
# Can I export an SQS queue to cli script?
# aws sqs send-message --queue-url <queue-url> --message-body "Hello, world!"

In [6]:
# Create an SQS client:
# sqs_client = boto3.client('sqs', region_name='your-region-name', aws_access_key_id='your-access-key', aws_secret_access_key='your-secret-access-key')

In [None]:
# Create an SQS queue:
queue_name = 'your-queue-name'
response = sqs_client.create_queue(QueueName=queue_name)
queue_url = response['QueueUrl']

In [None]:
# Configure the Lambda function:
lambda_client = boto3.client('lambda', region_name='your-region-name', aws_access_key_id='your-access-key', aws_secret_access_key='your-secret-access-key')
lambda_function_name = 'your-lambda-function-name'
lambda_queue_arn = response['QueueArn']
lambda_client.create_event_source_mapping(
    EventSourceArn=lambda_queue_arn,
    FunctionName=lambda_function_name,
    Enabled=True
)

In [None]:
message_body = 'your-message-body'
sqs_client.send_message(QueueUrl=queue_url, MessageBody=message_body)

In [None]:
# How can I create a IAM user with read only access to a specific S3 bucket via aws cli2?
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "S3ReadAccess",
            "Effect": "Allow",
            "Action": [
                "s3:GetObject",
                "s3:ListBucket"
            ],
            "Resource": [
                "arn:aws:s3:::your-bucket-name",
                "arn:aws:s3:::your-bucket-name/*"
            ]
        }
    ]
}

In [None]:
# Create the IAM user: Run the following command to create the IAM user:
# aws iam create-user --user-name your-iam-username

In [None]:
# Attach the IAM policy to the user: 
# aws iam put-user-policy --user-name your-iam-username --policy-name s3-read-only-policy --policy-document file://s3-read-only-policy.json

In [None]:
aws lambda create-event-source-mapping \
    --function-name your-lambda-function \
    --event-source-arn your-sqs-queue-arn \
    --batch-size 10 \
    --starting-position LATEST

In [None]:
sqs_client = boto3.client('sqs', region_name='your-region-name')
queue_url = 'your-sqs-queue-url'
argument = 'your-argument'

response = sqs_client.send_message(
    QueueUrl=queue_url,
    MessageBody=argument
)

print("Message sent:", response['MessageId'])

[AWS Pricing Calculator](https://calculator.aws/) <br>
https://calculator.aws/#/addService <br>
https://docs.aws.amazon.com/pricing-calculator/latest/userguide/what-is-pricing-calculator.html <br>
https://aws.amazon.com/redshift/pricing/ <br>
https://aws.amazon.com/s3/pricing/ <br>
https://www.cloudysave.com/aws/cost-calculator/s3-cost-calculator/

[Spotify Data Pipeline: Extract, Transform, and Analyze with AWS](https://github.com/srikantaghosh/Data-Engineering-Spotify-End-to-End-)

In [None]:
# To add dependencies to your Lambda function
lambda_function/
    ├── lambda_function.py
    └── dependencies/
        ├── package1/
        ├── package2/
        └── ...

In [None]:
# Install the required dependencies inside the "dependencies" director
pip install pandas -t dependencies/

In [None]:
import zipfile
import os

# Set the path to your Lambda function code and dependencies directory
function_code_path = 'lambda_function/lambda_function.py'
dependencies_dir = 'lambda_function/dependencies/'

# Set the path for the deployment package
deployment_package_path = 'lambda_function/deployment_package.zip'

# Create a new zip file for the deployment package
with zipfile.ZipFile(deployment_package_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Add the Lambda function code to the zip file
    zipf.write(function_code_path, os.path.basename(function_code_path))

    # Add the dependencies to the zip file
    for root, _, files in os.walk(dependencies_dir):
        for file in files:
            file_path = os.path.join(root, file)
            zipf.write(file_path, os.path.relpath(file_path, dependencies_dir))

print('Deployment package created:', deployment_package_path)

In [17]:
import pandas as pd

# Create a sample DataFrame
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})

# Display the original DataFrame
print("Original DataFrame:")
print(df)

Original DataFrame:
   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9
