In [None]:
from dotenv import load_dotenv
from crewai import Agent, Task, Crew, Process, LLM
from crewai.tools import tool
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
import agentops
from pydantic import BaseModel, Field
from typing import List
from typing import List, Optional
import os
import json

# Load environment variables
# Get Hugging Face API Key from Environment Variables
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")

# Ensure API Key is Provided
if not HUGGINGFACE_API_KEY:
    raise ValueError("Hugging Face API key not found! Set HUGGINGFACE_API_KEY in your environment variables.")

# Define LLM using Hugging Face
basic_llm = LLM(
    model="huggingface/mistralai/Mistral-7B-Instruct-v0.3",
    api_key=HUGGINGFACE_API_KEY,
    temperature=0
)

In [6]:
no_keywords = 10

about_company = "Axiora is a company that provides advanced artificial intelligence solutions focused on data analysis, intelligent insights extraction, and data visualization."
company_context = StringKnowledgeSource(
    content=about_company
)

In [7]:
output_dir = "./ai-agent-output"
os.makedirs(output_dir, exist_ok=True)

In [14]:
from pydantic import BaseModel, Field
from typing import List, Dict, Any

class DataUnderstandingOutput(BaseModel):
    data_type: List[str] = Field(
        ...,
        title="Identified Data Type(s)",
        description=(
            "Clearly defines the nature of the data such as: "
            "'structured', 'unstructured', 'time-series', 'image', 'text', etc."
        )
    )
    structure: Dict[str, Any] = Field(
        ...,
        title="Data Structure Summary",
        description=(
            "Details how the data is organized. Includes format (e.g., CSV, JSON, Excel), "
            "column names, data types, and a few representative sample values."
        )
    )
    suggested_analysis: List[str] = Field(
        ...,
        title="Recommended Analytical or Visualization Techniques",
        description=(
            "List of suitable chart types or analytical approaches for this type of data. "
            "For example: bar chart, line graph, scatter plot, word cloud, or summary statistics."
        )
    )
    
data_type_detection_agent = Agent(
    role="Data Type Detection Agent",
    goal="\n".join([
        "Thoroughly analyze the provided input data {data} to accurately identify its type.",
        "Determine whether the data is structured, unstructured, time-series, image, text, or any other relevant format.",
        "Leverage this understanding to recommend the most suitable chart types for effective data visualization.",
        "Return the identified data type(s) and corresponding chart suggestions in a clear, structured, and professional format."
    ]),
    backstory="This agent is designed to identify the type of input data and recommend appropriate visualization techniques to facilitate effective analysis and decision-making.",
    llm=basic_llm,
    verbose=True,
)

data_type_detection_task = Task(
    description="\n".join([
        "Analyze the provided dataset: {data}.",
        "Determine the type of the input data with high accuracy.",
        "Identify whether the data is structured (e.g., tabular data, CSV), unstructured (e.g., free text, documents), time-series, image, or any other relevant format.",
        "Focus solely on classifying the nature and format of the data based on its structure and content.",
        "Return the classification in a clean and well-structured JSON format."
    ]),
    expected_output="A JSON object containing the detected data type(s) and a brief description of the data structure.",
    output_json=DataUnderstandingOutput,
    output_file=os.path.join(output_dir, "step_1_data_type_detection.json"),
    agent=data_type_detection_agent
)

# === Step 6: Setup the crew ===
data_type_detection_crew = Crew(
    agents=[data_type_detection_agent],
    tasks=[data_type_detection_task],
    verbose=True
)

data_type_detection_crew = Crew(
    agents=[data_type_detection_agent],
    tasks=[data_type_detection_task],
    verbose=True
)

crew_results = data_type_detection_crew.kickoff(
    inputs={
        "data": "sales.csv"
    })


Overriding of current TracerProvider is not allowed
Overriding of current TracerProvider is not allowed


[1m[95m# Agent:[00m [1m[92mData Type Detection Agent[00m
[95m## Task:[00m [92mAnalyze the provided dataset: sales.csv.
Determine the type of the input data with high accuracy.
Identify whether the data is structured (e.g., tabular data, CSV), unstructured (e.g., free text, documents), time-series, image, or any other relevant format.
Focus solely on classifying the nature and format of the data based on its structure and content.
Return the classification in a clean and well-structured JSON format.[00m


🖇 AgentOps: Could not end session - no sessions detected




[1m[95m# Agent:[00m [1m[92mData Type Detection Agent[00m
[95m## Final Answer:[00m [92m
{
  "data_type": ["structured", "tabular"],
  "structure": {
    "columns": ["Order ID", "Product", "Sales", "Date"],
    "data_types": {
      "Order ID": "integer",
      "Product": "string",
      "Sales": "float",
      "Date": "datetime"
    },
    "row_count": 1000,
    "column_count": 4
  },
  "suggested_analysis": [
    "Line chart for Sales over time",
    "Bar chart for Sales by Product",
    "Scatter plot for Sales vs. Date",
    "Pie chart for Sales distribution by Product"
  ]
}[00m




In [13]:
from dotenv import load_dotenv
from crewai import Agent, Task, Crew, LLM
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
from pydantic import BaseModel, Field
from typing import List, Dict, Any
import os

# Load environment variables
load_dotenv()

# Get Hugging Face API Key from Environment Variables
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
if not HUGGINGFACE_API_KEY:
    raise ValueError("Hugging Face API key not found! Set HUGGINGFACE_API_KEY in your environment variables.")

# Define LLM using Hugging Face
basic_llm = LLM(
    model="huggingface/mistralai/Mistral-7B-Instruct-v0.3",
    api_key=HUGGINGFACE_API_KEY,
    temperature=0
)

# Create output directory
output_dir = "./ai-agent-output"
os.makedirs(output_dir, exist_ok=True)

# Company context
about_company = "Axiora is a company that provides advanced artificial intelligence solutions focused on data analysis, intelligent insights extraction, and data visualization."
company_context = StringKnowledgeSource(content=about_company)

# Output model
class DataUnderstandingOutput(BaseModel):
    data_type: List[str] = Field(
        ..., title="Identified Data Type(s)",
        description="Clearly defines the nature of the data such as: 'structured', 'unstructured', 'time-series', 'image', 'text', etc."
    )
    structure: Dict[str, Any] = Field(
        ..., title="Data Structure Summary",
        description="Details how the data is organized. Includes format (e.g., CSV, JSON, Excel), column names, data types, and a few sample values."
    )
    suggested_analysis: List[str] = Field(
        ..., title="Recommended Analytical or Visualization Techniques",
        description="Suggested chart types or analytical approaches suitable for this data."
    )

# Define agent
data_type_detection_agent = Agent(
    role="Data Type Detection Agent",
    goal="\n".join([
        "Thoroughly analyze the provided input data {data} to accurately identify its type.",
        "Determine whether the data is structured, unstructured, time-series, image, text, or any other relevant format.",
        "Leverage this understanding to recommend the most suitable chart types for effective data visualization.",
        "Return the identified data type(s) and corresponding chart suggestions in a clear, structured, and professional format."
    ]),
    backstory="This agent is designed to identify the type of input data and recommend appropriate visualization techniques to facilitate effective analysis and decision-making.",
    llm=basic_llm,
    verbose=True,
)

# Define task
data_type_detection_task = Task(
    description="\n".join([
        "Analyze the provided dataset: {data}.",
        "Determine the type of the input data with high accuracy.",
        "Identify whether the data is structured (e.g., tabular data, CSV), unstructured (e.g., free text, documents), time-series, image, or any other relevant format.",
        "Focus solely on classifying the nature and format of the data based on its structure and content.",
        "Return the classification in a clean and well-structured JSON format."
    ]),
    expected_output="A JSON object containing the detected data type(s) and a brief description of the data structure.",
    output_json=DataUnderstandingOutput,
    output_file=os.path.join(output_dir, "step_1_data_type_detection.json"),
    agent=data_type_detection_agent
)

# Setup the crew
data_type_detection_crew = Crew(
    agents=[data_type_detection_agent],
    tasks=[data_type_detection_task],
    verbose=True
)

crew_results = data_type_detection_crew.kickoff(
    inputs={
        "data": "sales.csv"
    })


Overriding of current TracerProvider is not allowed


[1m[95m# Agent:[00m [1m[92mData Type Detection Agent[00m
[95m## Task:[00m [92mAnalyze the provided dataset: sales.csv.
Determine the type of the input data with high accuracy.
Identify whether the data is structured (e.g., tabular data, CSV), unstructured (e.g., free text, documents), time-series, image, or any other relevant format.
Focus solely on classifying the nature and format of the data based on its structure and content.
Return the classification in a clean and well-structured JSON format.[00m


🖇 AgentOps: Could not end session - no sessions detected




[1m[95m# Agent:[00m [1m[92mData Type Detection Agent[00m
[95m## Final Answer:[00m [92m
{
  "data_type": ["structured", "tabular"],
  "structure": {
    "columns": ["Order ID", "Product", "Sales", "Date"],
    "data_types": {
      "Order ID": "integer",
      "Product": "string",
      "Sales": "float",
      "Date": "datetime"
    },
    "row_count": 1000,
    "column_count": 4
  },
  "suggested_analysis": [
    "Line chart for Sales over time",
    "Bar chart for Sales by Product",
    "Scatter plot for Sales vs. Date",
    "Pie chart for Sales distribution by Product"
  ]
}[00m




In [1]:
from crewai import Agent, Task, Crew, LLM
from pydantic import BaseModel, Field
from typing import List, Dict
import os
import pandas as pd
from dotenv import load_dotenv

# Load environment variables (if needed)
load_dotenv()

# Setup LLM (using Ollama locally)
llm = LLM(
    model="ollama/mistral:instruct",
    base_url="http://localhost:11434",
    temperature=0,
)

# Create output directory
output_dir = "./ai-agent-output"
os.makedirs(output_dir, exist_ok=True)

# --------- 1. Define Output Schema ---------
class ColumnInfo(BaseModel):
    column_name: str = Field(..., description="Name of the column")
    data_type: str = Field(..., description="Data type of the column (e.g., int, float, string, datetime)")

class DataUnderstandingOutput(BaseModel):
    data_type: str = Field(..., description="Overall type of the dataset: structured, unstructured, time-series, etc.")
    columns: List[ColumnInfo] = Field(..., description="List of column names with their data types")

# --------- 2. Load and Preview the Dataset ---------
# Load your dataset (replace this with the path to your file)
df = pd.read_csv("sales.csv")

# Get the first few rows as preview
data_preview = df.head(5).to_json(orient="records", lines=False)

# Get column schema
column_schema = df.dtypes.apply(lambda x: str(x)).to_dict()

# Convert to formatted string for prompt
formatted_column_schema = "\n".join([f"- {col}: {dtype}" for col, dtype in column_schema.items()])

# --------- 3. Build Prompt Template ---------
prompt_template = f"""
You are given a sample of a dataset including its first 5 rows and column schema.

Your task is to:
1. Accurately determine the overall type of the dataset.
   - Choose one or more of the following: "structured", "unstructured", "time-series", "text", "image", etc.
2. List all column names along with their corresponding data types (such as int64, float64, object, datetime64, etc.).

Do not perform any data cleaning, visualization suggestions, or further analysis.

Respond only in structured JSON format that conforms exactly to the following schema:

{{
  "data_type": "<overall_data_type>",
  "columns": [
    {{ "column_name": "<name1>", "data_type": "<type1>" }},
    ...
  ]
}}

### Dataset Preview:
{data_preview}

### Column Schema:
{formatted_column_schema}
"""

# --------- 4. Define Agent ---------
agent = Agent(
    role="Data Type Detection Agent",
    goal="Identify the overall type of a dataset and list its columns and their data types.",
    backstory="This agent specializes in understanding data formats for structured and unstructured datasets.",
    llm=llm,
    verbose=True,
)

# --------- 5. Define Task ---------
task = Task(
    description=prompt_template,
    expected_output="JSON object with 'data_type' and 'columns' fields describing the dataset structure.",
    output_json=DataUnderstandingOutput,
    output_file=os.path.join(output_dir, "data_structure_analysis.json"),
    agent=agent,
)

# --------- 6. Run the Crew ---------
crew = Crew(
    agents=[agent],
    tasks=[task],
    verbose=True,
)

result = crew.kickoff()
print(result)


[1m[95m# Agent:[00m [1m[92mData Type Detection Agent[00m
[95m## Task:[00m [92m
You are given a sample of a dataset including its first 5 rows and column schema.

Your task is to:
1. Accurately determine the overall type of the dataset.
   - Choose one or more of the following: "structured", "unstructured", "time-series", "text", "image", etc.
2. List all column names along with their corresponding data types (such as int64, float64, object, datetime64, etc.).

Do not perform any data cleaning, visualization suggestions, or further analysis.

Respond only in structured JSON format that conforms exactly to the following schema:

{
  "data_type": "<overall_data_type>",
  "columns": [
    { "column_name": "<name1>", "data_type": "<type1>" },
    ...
  ]
}

### Dataset Preview:
[{"orderid":1,"Customer Name":"Muhammed MacIntyre","shipmode":"First Class","sales":825.174,"quantity":9,"discount":0.3,"profit":-117.882,"segment":"Corporate","region":"Central","state":"Illinois","subcateg

🖇 AgentOps: Could not end session - no sessions detected




[1m[95m# Agent:[00m [1m[92mData Type Detection Agent[00m
[95m## Final Answer:[00m [92m
{
  "data_type": "structured",
  "columns": [
    { "column_name": "orderid", "data_type": "int64" },
    { "column_name": "Customer Name", "data_type": "object" },
    { "column_name": "shipmode", "data_type": "object" },
    { "column_name": "sales", "data_type": "float64" },
    { "column_name": "quantity", "data_type": "int64" },
    { "column_name": "discount", "data_type": "float64" },
    { "column_name": "profit", "data_type": "float64" },
    { "column_name": "segment", "data_type": "object" },
    { "column_name": "region", "data_type": "object" },
    { "column_name": "state", "data_type": "object" },
    { "column_name": "subcategory", "data_type": "object" },
    { "column_name": "category", "data_type": "object" },
    { "column_name": "orderdate_day", "data_type": "int64" },
    { "column_name": "orderdate_weekday", "data_type": "object" },
    { "column_name": "orderdate_mo