In [1]:
print("Hello World!")

Hello World!


In [2]:
import pandas as pd

In [3]:
path_name = '/Users/amirejibiilia/Desktop/mAIa/Test_data.xlsx'
data = pd.read_excel(path_name, engine='openpyxl')
data

Unnamed: 0,metrics,year,value
0,income from production,2024-01-01,127.12
1,income from production,2024-01-01,270.35
2,income from production,2024-01-01,3406.78
3,income from production,2024-01-01,67276.27
4,income from production,2024-02-01,90.03
...,...,...,...
300,income from technical service,2024-10-01,533.89
301,income from technical service,2024-11-01,296.61
302,income from technical service,2024-11-01,330.51
303,income from technical service,2024-12-01,254.24


In [145]:
def query_data(data: pd.DataFrame, where: dict = None, group_by: list = None, 
               aggregations: dict = None, order_by: list = None) -> pd.DataFrame:
    """Main function that ties together all the SQL-like operations."""
    # Convert 'year' column to datetime if not already
    if 'year' in data.columns and not pd.api.types.is_datetime64_any_dtype(data['year']):
        data['year'] = pd.to_datetime(data['year'], format='%d.%m.%y')
    
    # Apply operations step by step
    data = apply_where(data, where)
    data = group_and_aggregate(data, group_by, aggregations)
    data = apply_order_by(data, order_by)
    
    return data

def apply_where(data: pd.DataFrame, where: dict) -> pd.DataFrame:
    """Applies WHERE conditions to the DataFrame."""
    if where:
        for column, condition in where.items():
            for operator, value in condition.items():
                # Convert value to datetime if the column is 'year' (or any datetime column)
                if column == "year" and isinstance(value, str):  
                    value = pd.to_datetime(value, format='%Y-%m-%d', errors='coerce')  
                    
                if operator == '>=':
                    data = data[data[column] >= value]
                elif operator == '<':
                    data = data[data[column] < value]
                elif operator == '=':
                    data = data[data[column] == value]
                elif operator == '!=':
                    data = data[data[column] != value]
                elif operator == '<=':
                    data = data[data[column] <= value]
                elif operator == '>':
                    data = data[data[column] > value]
    return data

def group_and_aggregate(data: pd.DataFrame, group_by: list, aggregations: dict) -> pd.DataFrame:
    """Groups the DataFrame by specified columns and applies aggregations, renaming the resulting columns."""
    if group_by:
        # If grouping by certain columns, perform the aggregation
        data = data.groupby(group_by, as_index=False).agg(aggregations)
    else:
        # If no grouping is needed, simply apply the aggregation over the whole dataset
        data = pd.DataFrame(data.agg(aggregations)).transpose()

    # Rename columns to match SQL-style SELECT aliases
    new_columns = []
    for col in data.columns:
        if isinstance(col, tuple):  # Multi-index columns from aggregation
            new_columns.append(f"{col[0]}_{col[1]}")  # Example: 'Sales_sum'
        else:
            new_columns.append(col)  # Keep original column name

    data.columns = new_columns  # Assign new column names
    return data


def apply_order_by(data: pd.DataFrame, order_by: list) -> pd.DataFrame:
    """Applies ORDER BY sorting to the DataFrame."""
    if order_by:
        for col, ascending in order_by:
            if isinstance(data, pd.DataFrame):  # Check if it's a DataFrame
                data = data.sort_values(by=col, ascending=ascending)
    return data

def execute_query(query_json: dict) -> pd.DataFrame:
    """
    Executes the query by receiving the general JSON-like object with all query parameters.
    The query_json structure should include 'data', 'where', 'group_by', 'aggregations', and 'order_by'.
    """
    # Extract parameters from the provided JSON object
    data = query_json.get("data")
    where = query_json.get("where")
    group_by = query_json.get("group_by")
    aggregations = query_json.get("aggregations")
    order_by = query_json.get("order_by")
    
    # Call query_data with the parameters extracted from the JSON-like object
    result = query_data(data, 
                        where=where, 
                        group_by=group_by, 
                        aggregations=aggregations, 
                        order_by=order_by)
    
    return result

def extract_json_from_text(text):
    """
    Extract JSON object from text that might contain markdown and other content.
    """
    # First try to extract JSON from markdown code blocks
    json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', text)
    
    if json_match:
        return json_match.group(1).strip()
    
    # If no code blocks, try to find JSON-like structure
    # Look for the opening/closing braces
    brace_match = re.search(r'(\{[\s\S]*\})', text)
    if brace_match:
        return brace_match.group(1).strip()
    
    # If all else fails, return the original text
    return text

In [146]:
# execute_query({ "data": data, 
#                 "where": { "year": { ">=": "2024-01-01", "<=": "2024-03-31" } }, 
#                "group_by": ["metrics"], "aggregations": { "value": ["mean"] }, 
#                "order_by": [["value_mean", False]] })



In [147]:
execute_query({ "data": data, 
                "where": { "year": { ">=": "2024-01-01", "<=": "2024-03-31" } }, 
               "group_by": ["metrics"], 
               "aggregations": { "value": ["mean"] }, 
               "order_by": [["value_mean", False]] })



Unnamed: 0,metrics_,value_mean
2,income from tabloes,87457.625
0,income from production,10144.291667
1,income from service,5387.484314
3,income from technical service,403.6025


In [152]:
query_json = {
    "data": data,  # Your DataFrame
    "where": None,  # No specific filtering, consider all rows
    "group_by": None,  # No grouping, summing the entire column
    "aggregations": {"value": "sum"},  # Sum the 'value' column
    "order_by": None  # No ordering needed for sum calculation
}
execute_query(query_json)

Unnamed: 0,value
0,1912497.54


In [160]:
execute_query (
{ "data": data, "where": {}, "group_by": [], "aggregations": { "value": ["sum"] }, "order_by": [] })

Unnamed: 0,sum
value,1912497.54
