Added AWS S3 Query runner #6755

Open · wants to merge 7 commits into base: master
Binary file added client/app/assets/images/db-logos/s3.png
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
 {
   "name": "redash-client",
-  "version": "24.03.0-dev",
+  "version": "24.06.0-dev",
   "description": "The frontend part of Redash.",
   "main": "index.js",
   "scripts": {
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ force-exclude = '''

 [tool.poetry]
 name = "redash"
-version = "24.03.0-dev"
+version = "24.06.0-dev"
 description = "Make Your Company Data Driven. Connect to any data source, easily visualize, dashboard and share your data."
 authors = ["Arik Fraimovich <arik@redash.io>"]
 # to be added to/removed from the mailing list, please reach out to Arik via the above email or Discord
2 changes: 1 addition & 1 deletion redash/__init__.py
@@ -14,7 +14,7 @@
 from redash.destinations import import_destinations
 from redash.query_runner import import_query_runners

-__version__ = "24.03.0-dev"
+__version__ = "24.06.0-dev"


 if os.environ.get("REMOTE_DEBUG"):
114 changes: 114 additions & 0 deletions redash/query_runner/s3.py
@@ -0,0 +1,114 @@
import logging

import boto3
import pandas as pd

from redash.query_runner import (
    TYPE_BOOLEAN,
    TYPE_DATE,
    TYPE_DATETIME,
    TYPE_FLOAT,
    TYPE_INTEGER,
    TYPE_STRING,
    BaseQueryRunner,
    register,
)
from redash.utils import json_dumps, json_loads

# Map pandas dtypes (as reported by DataFrame.dtypes) to Redash column types.
TYPES_MAP = {
    "bool": TYPE_BOOLEAN,
    "datetime64[ns]": TYPE_DATETIME,
    "datetime64[s]": TYPE_DATETIME,
    "float64": TYPE_FLOAT,
    "int64": TYPE_INTEGER,
    "object": TYPE_STRING,
}
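# For example, a parsed column of Python ints gets dtype "int64" -> TYPE_INTEGER,
# while free-form text gets dtype "object" -> TYPE_STRING. Note that S3 Select
# returns CSV fields as strings unless the query CASTs them, so most columns
# will arrive as "object".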

logger = logging.getLogger(__name__)

class S3(BaseQueryRunner):
    @classmethod
    def name(cls):
        return "Amazon S3"

    @classmethod
    def configuration_schema(cls):
        return {
            "type": "object",
            "properties": {
                "region": {"type": "string", "title": "AWS Region"},
                "bucket_name": {"type": "string", "title": "Bucket Name"},
                "object_key": {"type": "string", "title": "Object Key"},
            },
            "required": ["region", "bucket_name", "object_key"],
            "order": ["region", "bucket_name", "object_key"],
        }
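
    # Example of a data source configuration this schema accepts (values are
    # illustrative): {"region": "us-east-1", "bucket_name": "my-bucket",
    # "object_key": "data/events.csv"}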
    def test_connection(self):
        region = self.configuration["region"]
        bucket_name = self.configuration["bucket_name"]
        object_key = self.configuration["object_key"]

        # Build the S3 client with boto3; credentials are resolved through the
        # standard boto3 chain (environment variables, instance profile, etc.).
        s3_client = boto3.client("s3", region_name=region)

        # The data source is currently pinned to a single object, so running a
        # trivial S3 Select against it is enough to validate the configuration.
        query = "SELECT * FROM S3Object LIMIT 1"
        resp = s3_client.select_object_content(
            Bucket=bucket_name,
            Key=object_key,  # the CSV object this data source is configured for
            ExpressionType="SQL",
            Expression=query,
            InputSerialization={"CSV": {"FileHeaderInfo": "Use"}, "CompressionType": "NONE"},
            OutputSerialization={"JSON": {}},
        )

        # select_object_content raises on failure; drain the event stream so any
        # error delivered mid-stream surfaces as well.
        for event in resp["Payload"]:
            if "Records" in event:
                logger.info("Records: %s", event["Records"]["Payload"])

    def run_query(self, query, user):
        region = self.configuration["region"]
        bucket_name = self.configuration["bucket_name"]
        object_key = self.configuration["object_key"]

        # Build the S3 client with boto3.
        s3_client = boto3.client("s3", region_name=region)

        # For now the object key is part of the data source configuration, so
        # every query runs against that single S3 object.
        resp = s3_client.select_object_content(
            Bucket=bucket_name,
            Key=object_key,  # the CSV object this data source is configured for
            ExpressionType="SQL",
            Expression=query,
            InputSerialization={"CSV": {"FileHeaderInfo": "Use"}, "CompressionType": "NONE"},
            OutputSerialization={"JSON": {}},
        )

        # S3 Select returns an event stream, and the result records can be split
        # across several "Records" events, so accumulate every payload chunk
        # before decoding.
        payload = b""
        for event in resp["Payload"]:
            if "Records" in event:
                payload += event["Records"]["Payload"]

        # With JSON output serialization each record arrives as one JSON object
        # per line; stitch the lines together into a single JSON array.
        json_result = "[" + ",".join(line for line in payload.decode("utf8").splitlines() if line) + "]"
        logger.debug("JSON: %s", json_result)
        dict_result = json_loads(json_result)
        df = pd.DataFrame(dict_result)
        rows = df.to_dict("records")
        columns = []
        for col in df.columns:
            columns.append(
                {
                    "name": col,
                    "friendly_name": col,
                    # Fall back to string for any pandas dtype missing from TYPES_MAP.
                    "type": TYPES_MAP.get(str(df[col].dtype), TYPE_STRING),
                }
            )

        # Return the query results in the Redash result format.
        data = {"columns": columns, "rows": rows}
        error = None
        json_data = json_dumps(data)
        return json_data, error

# Registering custom S3 query runner
register(S3)
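
For anyone who wants to sanity-check the parsing without a live bucket, below is a minimal sketch of what run_query does to an S3 Select response, using a canned payload in place of the select_object_content event stream. The file layout, column names, and the example query are made up for illustration.

import json

import pandas as pd

# Canned stand-in for the accumulated "Records" payload: with JSON output
# serialization, S3 Select emits one JSON object per line, and CSV fields
# come back as strings unless the query CASTs them.
payload = b'{"name":"alice","age":"34"}\n{"name":"bob","age":"42"}\n'

json_result = "[" + ",".join(line for line in payload.decode("utf8").splitlines() if line) + "]"
df = pd.DataFrame(json.loads(json_result))
print(df.dtypes)  # name: object, age: object -> both map to TYPE_STRING

# An example of a query a user might submit through this runner:
query = "SELECT s.name, s.age FROM S3Object s WHERE CAST(s.age AS INT) > 35"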
1 change: 1 addition & 0 deletions redash/settings/__init__.py
@@ -280,6 +280,7 @@ def email_server_is_configured():
     "redash.query_runner.google_spreadsheets",
     "redash.query_runner.graphite",
     "redash.query_runner.mongodb",
+    "redash.query_runner.s3",
     "redash.query_runner.couchbase",
     "redash.query_runner.mysql",
     "redash.query_runner.pg",