In [1]:
print("Hello World!")

Hello World!


In [2]:
import pandas as pd

def query_data(data: pd.DataFrame, where: dict = None, group_by: list = None, 
               aggregations: dict = None, order_by: list = None) -> pd.DataFrame:
    """Main function that ties together all the SQL-like operations."""
    # Convert 'year' column to datetime if not already
    if 'year' in data.columns and not pd.api.types.is_datetime64_any_dtype(data['year']):
        data['year'] = pd.to_datetime(data['year'], format='%d.%m.%y')
    
    # Apply operations step by step
    data = apply_where(data, where)
    data = group_and_aggregate(data, group_by, aggregations)
    data = apply_order_by(data, order_by)
    
    return data

def apply_where(data: pd.DataFrame, where: dict) -> pd.DataFrame:
    """Applies WHERE conditions to the DataFrame."""
    if where:
        for column, condition in where.items():
            for operator, value in condition.items():
                # Convert value to datetime only if it's a valid date string
                if isinstance(value, str) and ('.' in value or '-' in value):  # Check if it's a date-like string
                    value = pd.to_datetime(value, format='%d.%m.%y', errors='coerce')  # Use 'coerce' to handle invalid dates
                if operator == '>=':
                    data = data[data[column] >= value]
                elif operator == '<':
                    data = data[data[column] < value]
                elif operator == '=':
                    data = data[data[column] == value]
                elif operator == '!=':
                    data = data[data[column] != value]
                elif operator == '<=':
                    data = data[data[column] <= value]
                elif operator == '>':
                    data = data[data[column] > value]
    return data

def group_and_aggregate(data: pd.DataFrame, group_by: list, aggregations: dict) -> pd.DataFrame:
    """Groups the DataFrame by specified columns and applies aggregations, renaming the resulting columns."""
    if group_by:
        data = data.groupby(group_by, as_index=False).agg(aggregations)
    
    # Rename columns to match SQL-style SELECT aliases
    new_columns = []
    for col in data.columns:
        if isinstance(col, tuple):  # Multi-index columns from aggregation
            new_columns.append(f"{col[0]}_{col[1]}")  # Example: 'Sales_sum'
        else:
            new_columns.append(col)  # Keep original column name

    data.columns = new_columns  # Assign new column names
    return data

def apply_order_by(data: pd.DataFrame, order_by: list) -> pd.DataFrame:
    """Applies ORDER BY sorting to the DataFrame."""
    if order_by:
        for col, ascending in order_by:
            if isinstance(data, pd.DataFrame):  # Check if it's a DataFrame
                data = data.sort_values(by=col, ascending=ascending)
    return data

def execute_query(query_json: dict) -> pd.DataFrame:
    """
    Executes the query by receiving the general JSON-like object with all query parameters.
    The query_json structure should include 'data', 'where', 'group_by', 'aggregations', and 'order_by'.
    """
    # Extract parameters from the provided JSON object
    data = query_json.get("data")
    where = query_json.get("where")
    group_by = query_json.get("group_by")
    aggregations = query_json.get("aggregations")
    order_by = query_json.get("order_by")
    
    # Call query_data with the parameters extracted from the JSON-like object
    result = query_data(data, 
                        where=where, 
                        group_by=group_by, 
                        aggregations=aggregations, 
                        order_by=order_by)
    
    return result

In [19]:
# # Sample usage
# financial_data = pd.DataFrame({
#     'year': ['01.01.21', '02.01.21', '03.01.21', '01.01.22', '02.01.22'],
#     'metrics': ['Income from service', 'Income from production', 'Income from tabloes', 'Income from tabloes', 'Income from production'],
#     'value': [1000, 2000, 3000, 4000, 5000]
# })

In [3]:
import pandas as pd

# Load data from the specified Excel file
file_path = '/Users/amirejibiilia/Desktop/mAIa/sample_maia.xlsx'
financial_data = pd.read_excel(file_path)

# Print the first few rows to ensure it's loaded correctly
financial_data.head()


Unnamed: 0,metrics,year,value,bool,Unnamed: 4,Unnamed: 5,metrics.1,year.1,value.1
0,income from production,2024-01-01,127.12,0.0,,,,NaT,
1,income from production,2024-01-01,270.35,0.0,,,,NaT,
2,income from production,2024-01-01,3406.78,0.0,,,,NaT,
3,income from production,2024-01-01,67276.27,0.0,,,,NaT,
4,income from production,2024-02-01,90.03,0.0,,,,NaT,


In [4]:
financial_data

Unnamed: 0,metrics,year,value,bool,Unnamed: 4,Unnamed: 5,metrics.1,year.1,value.1
0,income from production,2024-01-01,127.12,0.0,,,,NaT,
1,income from production,2024-01-01,270.35,0.0,,,,NaT,
2,income from production,2024-01-01,3406.78,0.0,,,,NaT,
3,income from production,2024-01-01,67276.27,0.0,,,,NaT,
4,income from production,2024-02-01,90.03,0.0,,,,NaT,
...,...,...,...,...,...,...,...,...,...
302,income from technical service,2024-11-01,330.51,0.0,,,,NaT,
303,income from technical service,2024-12-01,254.24,0.0,,,,NaT,
304,income from technical service,2024-12-01,355.93,0.0,,,,NaT,
305,,NaT,,,,,,NaT,


In [5]:
# Example of how to use the function with a custom JSON object
query_json = {
    "data": financial_data,
    "where": {
        "metrics": {"=": "income from service"},
#         "year":{">=":'01.01.22'}
    },
    "group_by": ["metrics"],
    "aggregations": {"value": ["sum"]},
    "order_by": [("value_sum", False)]
}

In [6]:
# Execute the query with the provided JSON object
result = execute_query(query_json)
result

Unnamed: 0,metrics_,value_sum
0,income from service,1427244.36


In [7]:
financial_data

Unnamed: 0,metrics,year,value,bool,Unnamed: 4,Unnamed: 5,metrics.1,year.1,value.1
0,income from production,2024-01-01,127.12,0.0,,,,NaT,
1,income from production,2024-01-01,270.35,0.0,,,,NaT,
2,income from production,2024-01-01,3406.78,0.0,,,,NaT,
3,income from production,2024-01-01,67276.27,0.0,,,,NaT,
4,income from production,2024-02-01,90.03,0.0,,,,NaT,
...,...,...,...,...,...,...,...,...,...
302,income from technical service,2024-11-01,330.51,0.0,,,,NaT,
303,income from technical service,2024-12-01,254.24,0.0,,,,NaT,
304,income from technical service,2024-12-01,355.93,0.0,,,,NaT,
305,,NaT,,,,,,NaT,


In [8]:
financial_data.to_dict(orient="records")

[{'metrics': 'income from production',
  'year': Timestamp('2024-01-01 00:00:00'),
  'value': 127.12,
  'bool': 0.0,
  'Unnamed: 4': nan,
  'Unnamed: 5': nan,
  'metrics.1': nan,
  'year.1': NaT,
  'value.1': nan},
 {'metrics': 'income from production',
  'year': Timestamp('2024-01-01 00:00:00'),
  'value': 270.35,
  'bool': 0.0,
  'Unnamed: 4': nan,
  'Unnamed: 5': nan,
  'metrics.1': nan,
  'year.1': NaT,
  'value.1': nan},
 {'metrics': 'income from production',
  'year': Timestamp('2024-01-01 00:00:00'),
  'value': 3406.78,
  'bool': 0.0,
  'Unnamed: 4': nan,
  'Unnamed: 5': nan,
  'metrics.1': nan,
  'year.1': NaT,
  'value.1': nan},
 {'metrics': 'income from production',
  'year': Timestamp('2024-01-01 00:00:00'),
  'value': 67276.27,
  'bool': 0.0,
  'Unnamed: 4': nan,
  'Unnamed: 5': nan,
  'metrics.1': nan,
  'year.1': NaT,
  'value.1': nan},
 {'metrics': 'income from production',
  'year': Timestamp('2024-02-01 00:00:00'),
  'value': 90.03,
  'bool': 0.0,
  'Unnamed: 4': nan,


In [None]:
financial_data

In [13]:
execute_query({
    "data": financial_data,
    "where": { "metrics": { "=": "income from service" } },
    "group_by": ["metrics"],
    "aggregations": { "value": ["sum"] },
    "order_by": [("value_sum", False)]})

Unnamed: 0,metrics_,value_sum
0,income from service,1427244.36


In [234]:
result = group_and_aggregate(financial_data, ['metrics'], {'value': ['sum', 'min', 'max', 'median'], 'metrics': 'count'})
result

Unnamed: 0,value_sum,value_min,value_max,value_median,metrics_count
0,7000,2000,5000,3500.0,2
1,1000,1000,1000,1000.0,1
2,7000,3000,4000,3500.0,2


In [235]:
query_data(financial_data,
           where = {'year':{'>=':'01.01.22'}},
           group_by = ['metrics'],
           aggregations = {'value': ['sum', 'min', 'max', 'median']},
           order_by = [('value_sum', False)]
          )

Unnamed: 0,metrics_,value_sum,value_min,value_max,value_median
0,Income from production,5000,5000,5000,5000.0
1,Income from tabloes,4000,4000,4000,4000.0


In [240]:
query_data(financial_data,
           where={
               'year': {'>=': '01.01.22'},
               'metrics': {'=': 'Income from production'}
           },
           group_by=['metrics'],  # Group by year to track over time
           aggregations={'value': ['sum', 'min', 'max', 'median']},
#            order_by=[('year', True)]  # Sort by year in ascending order
          )

Unnamed: 0,metrics_,value_sum,value_min,value_max,value_median
0,Income from production,5000,5000,5000,5000.0


In [246]:
query_data(financial_data,
           group_by=['metrics'],  # Group by metrics
           aggregations={'value': ['sum']},  # Calculate sum of the value column
           order_by=[('value_sum', False)]  # Optional: Sort by the summed value in descending order
            )


Unnamed: 0,metrics_,value_sum
0,Income from production,7000
2,Income from tabloes,7000
1,Income from service,1000


In [247]:
query_data(financial_data,
           where={'metrics': {'=': 'Income from production'}},  # Filter by 'Income from production'
           group_by=['metrics'],  # Group by 'metrics'
           aggregations={'value': 'sum'}  # Sum of the 'value' column
          )

Unnamed: 0,metrics,value
0,Income from production,7000
