# 🔍 Multi-Agent Augmented Analytics Workflow
This notebook uses a Large Language Model (LLM) to simulate a multi-agent
system for analyzing data on global shark attacks. The dataset is processed
in chunks using the following specialized agents:

🧩 Data Ingestion Agent  
Reviews the dataset schema, column types, and suggests preprocessing steps.

📊 Statistics Agent  
Performs statistical analysis: mean, median, distributions, and correlations.

🤖 ML Agent  
Recommends appropriate machine learning models and feature engineering
strategies for predicting the type of a shark attack (e.g., provoked vs.
unprovoked).

📈 Insight Generation Agent  
Extracts actionable business insights and trends from the data for
decision-making.

Each agent analyzes a portion of the dataset and their insights are later
aggregated to provide a comprehensive understanding of the full dataset.

💡 What are Chunks?  
Chunks are small, manageable subsets of the dataset processed independently
by agents. They enable parallel processing, modularity, and scalability.

💡 Why Use Chunks?  
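Chunking allows different agents to analyze data simultaneously, speeding up
processing and supporting distributed reasoning over large datasets. Note that
this notebook processes the chunks one after another in a simple loop; the
chunked design is what would make parallel execution possible.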

💡 How is Aggregation Done?  
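After each agent analyzes its chunk, results such as means, variances, or
model outputs can be combined using statistical methods (e.g., weighted averages,
merged distributions) to produce global insights across the full dataset.
In this notebook, aggregation is delegated to the LLM: each agent's per-chunk
outputs are concatenated and sent back to the model with a request to
summarize them.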

Data Source: https://www.kaggle.com/datasets/mexwell/global-shark-attack?resource=download


## Libraries and settings

In [None]:
# Libraries
import os
import json
import numpy as np
import pandas as pd
from openai import OpenAI

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report

# Load OpenAI API key from credentials.json
# Expected layout: {"openai": {"api_key": "..."}}
try:
    with open('../credentials.json', encoding='utf-8') as file:
        credentials = json.load(file)
    API_KEY = credentials['openai']['api_key']
except FileNotFoundError as exc:
    raise ValueError(
        "Please provide OpenAI API key in the credentials.json file."
    ) from exc
except (json.JSONDecodeError, KeyError, TypeError) as exc:
    # Malformed JSON or a missing 'openai'/'api_key' entry should produce the
    # same kind of actionable error as a missing file.
    raise ValueError(
        "credentials.json must be valid JSON containing an "
        "'openai' -> 'api_key' entry."
    ) from exc

# Initialize OpenAI client
client = OpenAI(api_key=API_KEY)

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Show current working directory
print("Current Working Directory:", os.getcwd())

## Multi-Agent Augmented Analytics Workflow

In [None]:
# ========== CONFIG ==========
CHUNK_SIZE = 5  # Number of rows per chunk
N_ROWS = 10  # Number of rows loaded from the CSV (kept small for the demo)
DATA_PATH = "data/global-shark-attack.csv"
# NOTE: train_rf_model() requires at least 10 complete rows in a chunk before
# it trains a model, so with CHUNK_SIZE = 5 the ML Agent only ever receives
# the "Insufficient data" message. Increase CHUNK_SIZE (and N_ROWS) to train.

# ========== SETUP ==========
columns = ['Date','Year','Type','Country','Area','Location','Activity',
           'Name','Sex','Age','Injury','Fatal', 'Time','Species']
df = pd.read_csv(DATA_PATH, sep=";")[columns].head(N_ROWS)

# Each chunk is an independent copy so that helper functions working on a
# chunk can never write back into `df` (or trigger chained-assignment issues).
chunks = [
    df.iloc[start:start + CHUNK_SIZE].copy()
    for start in range(0, len(df), CHUNK_SIZE)
]

# ========== AGENTS ==========
# Every agent is a dict with three keys:
#   "name"        - label used for printing and as the key in the results,
#   "role"        - inserted into the system prompt ("You are a <role>."),
#   "instruction" - the task text placed in front of the data in the user prompt.
agents = [
    dict(name=agent_name, role=agent_role, instruction=agent_instruction)
    for agent_name, agent_role, agent_instruction in (
        (
            "Data Ingestion Agent",
            "data ingestion expert",
            "Describe the data schema, column types, and any preprocessing suggestions.",
        ),
        (
            "Statistics Agent",
            "statistical analyst",
            "Analyze summary statistics and correlation (if applicable). Identify trends and anomalies.",
        ),
        (
            "ML Agent",
            "machine learning engineer",
            "Evaluate model performance. Suggest improvements or highlight predictive variables.",
        ),
        (
            "Insight Generation Agent",
            "business analyst",
            "Generate actionable insights from this data chunk. Focus on trends relevant to decision-makers.",
        ),
    )
]

# ========== GPT CALLER ==========
def call_agent(agent_role, agent_instruction, data_sample, *,
               model="gpt-4-turbo", max_tokens=1000, temperature=0.7):
    """Send one agent prompt to the chat-completions API and return the reply.

    Args:
        agent_role: Role description inserted into the system prompt
            ("You are a <agent_role>.").
        agent_instruction: Task text placed at the start of the user prompt.
        data_sample: Text (CSV rows, statistics, model report, ...) appended
            to the user prompt after the instruction.
        model: Name of the chat model to use.
        max_tokens: Upper bound on the number of tokens in the reply.
        temperature: Sampling temperature passed to the API.

    Returns:
        The reply text. An empty string is returned when the reply carries no
        text content, and a string starting with "Error calling agent:" is
        returned when the request fails for any reason.
    """
    messages = [
        {"role": "system", "content": f"You are a {agent_role}."},
        {
            "role": "user",
            "content": (
                f"{agent_instruction}\n\nHere is the relevant data:\n"
                f"{data_sample}"
            )
        }
    ]
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature
        )
        # `content` may be None; normalise to "" so callers can always treat
        # the result as text (e.g. join it with other outputs).
        return response.choices[0].message.content or ""
    except Exception as e:
        # Deliberate best-effort: one failed request must not abort the whole
        # multi-agent run, so the failure is reported as the agent's output.
        return f"Error calling agent: {str(e)}"

# ========== HELPER: STATS ==========
def generate_statistics_text(df_chunk):
    """Build a text summary of the numeric columns of a data chunk.

    The 'Age' and 'Year' columns are converted to numbers (values that cannot
    be parsed become NaN) on a separate copy, so the caller's DataFrame is
    left untouched.

    Args:
        df_chunk: DataFrame containing at least the columns 'Age' and 'Year'.

    Returns:
        A string with the rounded output of ``describe()`` followed by the
        rounded correlation matrix of the two columns.
    """
    num_cols = ['Age', 'Year']
    # `apply` returns a new frame; coercing both columns keeps describe() and
    # corr() working even when a column was read as text.
    numeric = df_chunk[num_cols].apply(pd.to_numeric, errors='coerce')
    stats = numeric.describe().round(2).to_string()
    corr = numeric.corr().round(2).to_string()
    return f"Descriptive Statistics:\n{stats}\n\nCorrelation Matrix:\n{corr}"

# ========== HELPER: ML ==========
def train_rf_model(df_chunk, min_samples=10):
    """Train a random forest on one data chunk and return a text report.

    The model predicts the 'Type' column from 'Country', 'Activity', 'Sex'
    (one-hot encoded) and 'Age' (numeric). All work is done on a copy, so the
    caller's DataFrame is not modified.

    Args:
        df_chunk: DataFrame containing the columns 'Country', 'Activity',
            'Sex', 'Age' and 'Type'.
        min_samples: Minimum number of complete rows (no missing value in any
            used column) required before a model is trained.

    Returns:
        A string containing the classification report for a 30% hold-out
        split, or a short "Insufficient data" message when fewer than
        ``min_samples`` complete rows are available.
    """
    # Select features and mock target: "Type" (Unprovoked vs Provoked, etc.)
    feature_cols = ['Country', 'Activity', 'Sex', 'Age']
    target_col = 'Type'

    # Copy first so the numeric conversion below never touches the input.
    df_model = df_chunk[feature_cols + [target_col]].copy()
    df_model['Age'] = pd.to_numeric(df_model['Age'], errors='coerce')
    df_model = df_model.dropna()

    if len(df_model) < min_samples:
        return "Insufficient data for model training in this chunk."

    X_cat = df_model[['Country', 'Activity', 'Sex']]
    X_num = df_model[['Age']]
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    X_encoded = encoder.fit_transform(X_cat)
    # Final feature matrix: one-hot columns followed by the numeric age.
    X = np.hstack([X_encoded, X_num.values])
    y = df_model[target_col].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    report = classification_report(y_test, y_pred, zero_division=0)
    return (
        f"Random Forest Classifier trained on {len(X)} samples.\n"
        f"Target: Type (e.g., Unprovoked/Provoked)\n\nClassification Report:\n{report}"
    )

# ========== RUN AGENTS ==========
# Outputs collected per agent name; one entry is appended per processed chunk.
all_results = dict((agent['name'], []) for agent in agents)

# Agents that receive a computed text summary instead of the raw CSV rows.
sample_builders = {
    "Statistics Agent": generate_statistics_text,
    "ML Agent": train_rf_model,
}

total_chunks = len(chunks)
for chunk_no, chunk in enumerate(chunks, start=1):
    print(f"\n🔄 Processing Chunk {chunk_no}/{total_chunks}")

    for agent in agents:
        agent_name = agent['name']
        print(f"\n-- {agent_name} on Chunk {chunk_no} --")

        # Pick the input for this agent: a helper-generated text when one is
        # registered for the agent, otherwise the chunk's rows as CSV.
        build_sample = sample_builders.get(agent_name)
        if build_sample is None:
            data_sample = chunk.head(10).to_csv(index=False)
        else:
            data_sample = build_sample(chunk)

        output = call_agent(
            agent_role=agent['role'],
            agent_instruction=agent['instruction'],
            data_sample=data_sample,
        )

        all_results[agent_name].append(output)
        print(output)

# ========== AGGREGATE RESULTS ==========
def aggregate_outputs(agent_name, outputs, max_chars=30000):
    """Ask the LLM to summarize one agent's outputs across all chunks.

    Args:
        agent_name: Name of the agent whose outputs are summarized; used in
            both the summarizer's role and its instruction.
        outputs: Per-chunk output strings of that agent. Empty or None
            entries are ignored.
        max_chars: Maximum number of characters of the joined outputs that
            is sent to the model; anything beyond is cut off.

    Returns:
        The summary returned by ``call_agent``, or a fixed message when there
        is nothing to summarize.
    """
    usable = [text for text in outputs if text]
    if not usable:
        # Avoid an API call on an empty prompt.
        return "No outputs available to summarize."

    summary_input = "\n\n---\n\n".join(usable)
    return call_agent(
        agent_role=f"{agent_name} summarizer",
        agent_instruction=(
            f"Summarize the following outputs from {agent_name} across all data chunks."
        ),
        data_sample=summary_input[:max_chars]
    )

print("\n\n🔎 Final Aggregated Results\n===========================")

# One LLM-generated summary per agent, in the order the agents are defined.
for agent_name in (agent['name'] for agent in agents):
    print(f"\n📌 {agent_name}")
    print(aggregate_outputs(agent_name, all_results[agent_name]))

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: print basic information about the environment the notebook ran in.
rule = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(rule)
print(os.name.upper())
print(f"{platform.system()} | {platform.release()}")
print(f"Datetime: {timestamp}")
print(f"Python Version: {python_version()}")
print(rule)