In [50]:
#Imports
import os
import re
import json
import random
import requests
import openai
from dotenv import load_dotenv

In [59]:
load_dotenv()

MODEL_BACKEND = os.getenv("MODEL_BACKEND", "ollama")
MODEL_NAME = os.getenv("MODEL_NAME", "llama3.2")
OLLAMA_API_BASE = os.getenv("OLLAMA_API_BASE", "http://localhost:11434")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


In [60]:
#confiig
INPUT_FILE = "ML-data.txt"
OUTPUT_FILE = "ML-data.json"

In [61]:
# === CONFIGURE OPENAI ===
if MODEL_BACKEND == "openai":
    openai.api_key = OPENAI_API_KEY

In [62]:
# === READ RAW PROJECTS ===
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    raw_data = f.read()

project_blocks = re.split(r'\n?\s*\d+\.\s+', raw_data.strip())
project_blocks = [p.strip() for p in project_blocks if p.strip()]

In [63]:
def call_llm_ollama(title, description):
    system_prompt = (
        "You are a dataset formatter AI. You must return only valid JSON with two keys: "
        "'tech_stack' and 'objective'. No markdown, no commentary, no extra text."
    )

    user_prompt = f"""
### Example JSON Format
{{
  "tech_stack": "Python, Flask, TensorFlow, HTML, CSS",
  "objective": "To develop a web-based ML platform that predicts or automates a specific domain problem."
}}

### Project Details
Title: {title}
Description: {description}

Now output valid JSON in the same format as the example above.
"""

    try:
        response = requests.post(
            f"{OLLAMA_API_BASE}/api/generate",
            json={
                "model": MODEL_NAME,
                "prompt": f"{system_prompt}\n{user_prompt}",
                "stream": False,
                "options": {"temperature": 0.2}  # More deterministic
            },
            timeout=60
        )
        data = response.json()
        output = data.get("response", "").strip()

        # --- Cleanup ---
        cleaned = (
            output.replace("```json", "")
            .replace("```", "")
            .strip()
        )
        start, end = cleaned.find("{"), cleaned.rfind("}") + 1
        json_str = cleaned[start:end]
        return json.loads(json_str)

    except Exception as e:
        print(f"⚠️ Ollama parse error for {title}: {e}")
        return {
            "tech_stack": "Python, Flask (fallback)",
            "objective": "Automatic generation failed; placeholder added."
        }


In [64]:
def flatten_tech_stack(tech_stack):
    """
    Converts tech_stack (dict, list, or string) into a clean comma-separated string.
    Handles LLM outputs like {"front-end": [...]} or ["Python", "Flask"].
    """
    if isinstance(tech_stack, dict):
        # Merge all list values into one list
        all_items = []
        for v in tech_stack.values():
            if isinstance(v, list):
                all_items.extend(v)
            else:
                all_items.append(str(v))
        return ", ".join(all_items)
    elif isinstance(tech_stack, list):
        return ", ".join(map(str, tech_stack))
    else:
        return str(tech_stack)


In [65]:
# === MAIN PROCESSING LOOP ===
structured_projects = []

for idx, block in enumerate(project_blocks, start=1):  # process all projects
    lines = block.split('\n', 1)
    title = lines[0].strip()
    description = lines[1].strip() if len(lines) > 1 else ""

    # Domain guess
    desc_lower = description.lower()
    if any(x in desc_lower for x in ["ai", "machine learning", "deep learning", "data"]):
        domain = "Artificial Intelligence / Data Science"
    elif any(x in desc_lower for x in ["iot", "sensor", "arduino", "raspberry"]):
        domain = "IoT / Embedded Systems"
    elif any(x in desc_lower for x in ["app", "mobile", "fitness", "health"]):
        domain = "Mobile / HealthTech"
    elif any(x in desc_lower for x in ["web", "ecommerce", "dashboard"]):
        domain = "Web / Software Engineering"
    else:
        domain = "General Computing"

    prompt = f"""
You are a dataset preparation assistant. Given this project:

Title: {title}
Description: {description}

Generate:
1. A realistic tech stack (frameworks, libraries, languages)
2. A concise 2-3 line project objective

Output strictly in JSON with keys: tech_stack, objective
"""

    ai_output = call_llm(prompt)

    # Debug log
    print(f"\n---- Raw Output for {title} ----\n{ai_output}\n-----------------------------\n")

    # Clean & parse JSON
    try:
        cleaned_output = (
            ai_output.strip()
            .replace("```json", "")
            .replace("```", "")
            .strip()
        )
        start = cleaned_output.find("{")
        end = cleaned_output.rfind("}") + 1
        json_str = cleaned_output[start:end]
        parsed = json.loads(json_str)
    except Exception as e:
        print(f"⚠️ JSON parse failed for {title}: {e}")
        parsed = {
            "tech_stack": "Python, Flask (fallback)",
            "objective": "Automatic generation failed; placeholder added."
        }

    year = str(random.randint(2020, 2024))

    structured_projects.append({
        "project_id": f"auto{idx:03}",
        "title": title,
        "description": description,
        "domain": domain,
        "year": year,
        "tech_stack": flatten_tech_stack(parsed.get("tech_stack", "Python, Flask (fallback)")),
        "objective": parsed.get("objective", "Automatic generation failed; placeholder added."),
        "source": "ISE-dept"
    })


    print(f"✅ Processed: {title}")



---- Raw Output for Instagram Reach Analysis ----
Here is the output in JSON format:

```
{
    "tech_stack": {
        "languages": ["Python"],
        "libraries": [
            "Pandas",
            "NumPy",
            "Matplotlib",
            "Seaborn",
            "Plotly"
        ],
        "frameworks": null
    },
    "objective": "Analyze Instagram reach data to identify factors influencing post visibility using Python libraries."
}
```
-----------------------------

✅ Processed: Instagram Reach Analysis

---- Raw Output for Scraping laptop data from Amazon ----
Here is the output in JSON format:

{"tech_stack": "Python 3.9, BeautifulSoup4, requests, pandas, NumPy", "objective": "Scrape laptop data from Amazon to aid in price comparison and market research."}
-----------------------------

✅ Processed: Scraping laptop data from Amazon

---- Raw Output for Video Game Sales Prediction ----
Here is the output in JSON format:

```
{
  "tech_stack": {
    "programming_language":

In [66]:

# === SAVE FINAL JSON ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(structured_projects, f, indent=2, ensure_ascii=False)

print(f"\n✅ Done! {len(structured_projects)} projects saved → {OUTPUT_FILE}")


✅ Done! 15 projects saved → ML-data.json
