In [1]:
# Smoke-test cell: prints a fixed greeting to standard output.
print("Hello World!")

Hello World!


In [2]:
import pandas as pd

In [3]:
# NOTE(review): absolute, user-specific path — this cell only runs on a machine
# where the workbook exists at exactly this location.
path_name = '/Users/amirejibiilia/Desktop/mAIa/Test_data.xlsx'
# Read the workbook into a DataFrame using the openpyxl engine.
data = pd.read_excel(path_name, engine='openpyxl')
# Bare last expression so the notebook renders the loaded frame.
data

Unnamed: 0,metrics,year,value
0,income from production,2024-01-01,127.12
1,income from production,2024-01-01,270.35
2,income from production,2024-01-01,3406.78
3,income from production,2024-01-01,67276.27
4,income from production,2024-02-01,90.03
...,...,...,...
300,income from technical service,2024-10-01,533.89
301,income from technical service,2024-11-01,296.61
302,income from technical service,2024-11-01,330.51
303,income from technical service,2024-12-01,254.24


In [145]:
def query_data(data: pd.DataFrame, where: dict = None, group_by: list = None,
               aggregations: dict = None, order_by: list = None) -> pd.DataFrame:
    """Run a SQL-like query on a DataFrame: WHERE, then GROUP BY/aggregate, then ORDER BY.

    Args:
        data: Source frame. It is not modified; any 'date' parsing happens on a
            new frame.
        where: Optional ``{column: {operator: value}}`` filter specification,
            passed to ``apply_where``.
        group_by: Optional list of grouping columns, passed to
            ``group_and_aggregate``.
        aggregations: Optional ``{column: function(s)}`` mapping, passed to
            ``group_and_aggregate``.
        order_by: Optional list of ``(column, ascending)`` pairs, passed to
            ``apply_order_by``.

    Returns:
        The filtered, optionally aggregated, and optionally sorted DataFrame.
    """
    # Parse a non-datetime 'date' column (expected as dd.mm.yy) on a new frame:
    # ``assign`` returns a copy, so the caller's DataFrame is left untouched.
    if 'date' in data.columns and not pd.api.types.is_datetime64_any_dtype(data['date']):
        data = data.assign(date=pd.to_datetime(data['date'], format='%d.%m.%y'))

    data = apply_where(data, where)

    # A query with neither grouping nor aggregations is a plain filter/sort;
    # skip the aggregation step instead of calling ``agg`` with nothing.
    if group_by or aggregations:
        data = group_and_aggregate(data, group_by, aggregations)

    data = apply_order_by(data, order_by)
    return data

def apply_where(data: pd.DataFrame, where: dict) -> pd.DataFrame:
    """Filter the DataFrame with SQL-style WHERE conditions.

    Args:
        data: Frame to filter.
        where: ``{column: {operator: value}}``. Supported operators are
            ``>=``, ``>``, ``<=``, ``<``, ``=``, ``==`` and ``!=``. All
            conditions are combined with AND. ``None`` or an empty dict
            returns ``data`` unchanged.

    Returns:
        The rows of ``data`` that satisfy every condition.

    Raises:
        ValueError: If an operator is not one of the supported ones.
    """
    if not where:
        return data

    comparisons = {
        '>=': lambda series, bound: series >= bound,
        '>': lambda series, bound: series > bound,
        '<=': lambda series, bound: series <= bound,
        '<': lambda series, bound: series < bound,
        '=': lambda series, bound: series == bound,
        '==': lambda series, bound: series == bound,
        '!=': lambda series, bound: series != bound,
    }

    for column, condition in where.items():
        for op, value in condition.items():
            # An unrecognised operator used to be skipped silently, which
            # returned unfiltered rows; fail loudly instead.
            if op not in comparisons:
                raise ValueError(
                    f"Unsupported operator {op!r} for column {column!r}; "
                    f"expected one of {sorted(comparisons)}"
                )

            # String bounds on the 'date' column are parsed as YYYY-MM-DD;
            # unparseable strings become NaT (errors='coerce').
            if column == "date" and isinstance(value, str):
                value = pd.to_datetime(value, format='%Y-%m-%d', errors='coerce')

            data = data[comparisons[op](data[column], value)]
    return data

def group_and_aggregate(data: pd.DataFrame, group_by: list, aggregations: dict) -> pd.DataFrame:
    """Group the DataFrame and apply aggregations, with SQL-style flat column names.

    Args:
        data: Frame to aggregate. It is not modified.
        group_by: Columns to group by. The special names ``'quarter'``,
            ``'month'``, ``'year_only'`` and ``'week'`` are derived from the
            ``'date'`` column. ``None``/empty aggregates over the whole frame.
        aggregations: ``{column: function}`` or ``{column: [functions]}`` as
            accepted by pandas ``agg``. List-style entries produce columns
            named ``<column>_<function>`` (e.g. ``value_mean``).

    Returns:
        One row per group (or a single row when ``group_by`` is empty).
        Without aggregations: ``data`` unchanged when there is no grouping,
        otherwise the distinct, sorted combinations of the group keys.
    """
    if not group_by:
        if not aggregations:
            return data

        # pandas promotes every entry to a list as soon as one entry is a
        # list; mirror that so names match the grouped branch.
        any_list = any(isinstance(funcs, (list, tuple)) for funcs in aggregations.values())
        row = {}
        for column, funcs in aggregations.items():
            if any_list:
                func_list = list(funcs) if isinstance(funcs, (list, tuple)) else [funcs]
                # Series.agg(list) returns a Series indexed by function name.
                for func_name, value in data[column].agg(func_list).items():
                    row[f"{column}_{func_name}"] = value
            else:
                row[column] = data[column].agg(funcs)
        # Always exactly one row, like an ungrouped SQL aggregate.
        return pd.DataFrame([row])

    # Builders for grouping keys derived from the 'date' column.
    period_builders = {
        'quarter': lambda dates: dates.dt.to_period('Q').astype(str),
        'month': lambda dates: dates.dt.to_period('M').astype(str),
        'year_only': lambda dates: dates.dt.year,
        # ISO week number only: the same week of different years shares a key.
        'week': lambda dates: dates.dt.isocalendar().week,
    }

    keys = list(group_by)
    derived = {name: period_builders[name](data['date'])
               for name in keys if name in period_builders}
    # ``assign`` returns a new frame, so the input never gains helper columns.
    keyed = data.assign(**derived) if derived else data

    if not aggregations:
        return keyed[keys].drop_duplicates().sort_values(keys).reset_index(drop=True)

    result = keyed.groupby(keys, as_index=False).agg(aggregations)

    # Flatten ('value', 'mean') -> 'value_mean'. Group keys arrive as
    # ('metrics', '') and must stay 'metrics', not 'metrics_'.
    result.columns = [
        "_".join(str(part) for part in col if part != "") if isinstance(col, tuple) else col
        for col in result.columns
    ]
    return result

def apply_order_by(data: pd.DataFrame, order_by: list) -> pd.DataFrame:
    """Sort the DataFrame with SQL ORDER BY semantics.

    Args:
        data: Frame to sort. Anything that is not a DataFrame is returned
            unchanged.
        order_by: List of ``(column, ascending)`` pairs. The first pair is the
            primary sort key and later pairs only break ties. ``None`` or an
            empty list returns ``data`` unchanged.

    Returns:
        A sorted copy of ``data``.
    """
    if not order_by or not isinstance(data, pd.DataFrame):
        return data

    columns = [column for column, _ in order_by]
    directions = [ascending for _, ascending in order_by]
    # One multi-key sort: sorting once per key in a loop would make the LAST
    # key the primary one and discard the ordering of the earlier keys.
    return data.sort_values(by=columns, ascending=directions)

def execute_query(query_json: dict) -> pd.DataFrame:
    """Execute a query described by a single JSON-like dict.

    Args:
        query_json: Mapping with a required ``'data'`` entry (the DataFrame to
            query) and optional ``'where'``, ``'group_by'``, ``'aggregations'``
            and ``'order_by'`` entries, which are forwarded to ``query_data``.

    Returns:
        The DataFrame produced by ``query_data``.

    Raises:
        ValueError: If ``query_json`` has no ``'data'`` entry (or it is None).
    """
    data = query_json.get("data")
    # Fail here with a clear message rather than with an AttributeError on
    # None deep inside query_data.
    if data is None:
        raise ValueError("query_json must contain a 'data' entry holding the DataFrame to query")

    return query_data(
        data,
        where=query_json.get("where"),
        group_by=query_json.get("group_by"),
        aggregations=query_json.get("aggregations"),
        order_by=query_json.get("order_by"),
    )


In [172]:
# execute_query({ "data": data, 
#                 "where": { "year": { ">=": "2024-01-01", "<=": "2024-03-31" } }, 
#                "group_by": ["metrics"], "aggregations": { "value": ["mean"] }, 
#                "order_by": [["value_mean", False]] })



In [173]:
import boto3

In [174]:
# Bare expression: the notebook displays the module's repr, which includes the
# path the boto3 package was imported from.
boto3

<module 'boto3' from '/Users/amirejibiilia/anaconda3/lib/python3.10/site-packages/boto3/__init__.py'>