In [10]:
from typing import TypedDict, Dict, Literal

In [11]:
class CodeState(TypedDict):
    """Typed dictionary describing the state passed to the code-analysis step.

    NOTE(review): `analyze_code` in this notebook returns the keys "result" and
    "feedback", not `analysis_result` / `analysis_feedback` -- confirm which
    naming is intended before wiring this state into a graph.
    """
    # Maps a relative file path to the original file content.
    # NOTE(review): `extract_to_file_map` stores raw bytes for files that are
    # not valid UTF-8, so values are not always `str` -- confirm the annotation.
    file_map: Dict[str, str]               # Original code
    # Overall verdict for the codebase; exactly one of the three literals.
    analysis_result: Literal["Upgrade", "Rewrite", "No change"] # 'Upgrade', 'Rewrite' or 'No change'
    # Presumably the free-text explanation that accompanies the verdict -- TODO confirm.
    analysis_feedback: str
    # Fields below are disabled (commented out) in the current version:
    # updated_file_map: Dict[str, str]       # Modernized code
    # failed_files: list
    # feedback_result: Dict[str, bool]
    # documentation: str

In [12]:
# import zipfile
# def extract_zip_to_file_map(zip_path):
#     """
#     Extracts all files from a ZIP archive into a dictionary mapping file paths to their contents.
    
#     Returns:
#         file_map (dict): {file_path_in_zip: content (str or bytes)}
#     """
#     file_map = {}
#     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#         for file in zip_ref.namelist():
#             if file.startswith('__MACOSX/') or file.endswith('/'):
#                 continue  # Skip macOS metadata and directories
#             with zip_ref.open(file) as f:
#                 data = f.read()
#                 try:
#                     file_map[file] = data.decode('utf-8')
#                 except UnicodeDecodeError:
#                     file_map[file] = data  # Keep as bytes if not UTF-8 decodable
#     return file_map

In [13]:
import zipfile
import os
from pathlib import Path

def extract_to_file_map(path):
    """
    Collect the files of a ZIP archive or a directory tree into a dictionary.

    Keys are file paths relative to the archive/directory root and always use
    forward slashes, so the same project yields the same keys whether it is
    read from a ZIP or from disk, on any operating system. Directory entries
    are visited in sorted order so the resulting mapping (and anything built
    from its iteration order) is deterministic.

    Args:
        path (str or Path): Path to a ZIP file (``.zip`` suffix, any case)
            or to a directory.

    Returns:
        dict: {relative_path: content}. ``content`` is ``str`` when the file
        decodes as UTF-8 and raw ``bytes`` otherwise. The dict is empty when
        the path does not exist, is neither a ZIP file nor a directory, is an
        invalid ZIP, or contains nothing readable; these situations are
        reported with ``print`` instead of being raised.
    """
    path = Path(path)
    file_map = {}

    if not path.exists():
        print(f"Path does not exist: {path}")
        return file_map

    if path.is_file() and path.suffix.lower() == '.zip':
        print(f"Extracting from ZIP file: {path}")
        try:
            with zipfile.ZipFile(path, 'r') as zip_ref:
                for member in zip_ref.namelist():
                    # Skip macOS resource-fork metadata and directory entries.
                    if member.startswith('__MACOSX/') or member.endswith('/'):
                        continue
                    data = zip_ref.read(member)
                    try:
                        file_map[member] = data.decode('utf-8')
                    except UnicodeDecodeError:
                        file_map[member] = data  # Keep as bytes if not UTF-8 decodable
        except zipfile.BadZipFile:
            print(f"Invalid ZIP file: {path}")
    elif path.is_dir():
        print(f"Scanning directory: {path}")
        # rglob() yields entries in an unspecified order; sort for determinism.
        for file_path in sorted(path.rglob('*')):
            # Only regular files are read; directories and special entries
            # (sockets, FIFOs, dangling symlinks) are skipped.
            if not file_path.is_file():
                continue
            # as_posix() keeps keys identical to ZIP member names ("pkg/mod.py")
            # even on Windows, where str() would produce "pkg\\mod.py".
            relative_path = file_path.relative_to(path).as_posix()
            print(f"Reading file: {relative_path}")
            try:
                try:
                    content = file_path.read_text(encoding='utf-8')
                except UnicodeDecodeError:
                    # Not UTF-8 text: keep the raw bytes instead.
                    content = file_path.read_bytes()
            except OSError as e:
                # Covers failures of both the text read and the binary fallback.
                print(f"Failed to read {relative_path}: {e}")
                continue
            file_map[relative_path] = content
    else:
        print(f"Path is neither a ZIP file nor a directory: {path}")

    if not file_map:
        print("No files found or read.")

    return file_map


In [84]:
# Load every file under ../code into a {relative_path: content} mapping.
# NOTE(review): the name `map` shadows Python's built-in map() for the rest of
# the kernel session; a later cell displays this same variable, so rename both
# together if this is changed.
map = extract_to_file_map("../code")

Scanning directory: ..\code
Reading file: main.py
Reading file: math_utils.py


In [85]:
# Display the mapping returned by extract_to_file_map (relative path -> content).
map

{'main.py': '# main.py\n\nfrom math_utils import add, subtract, multiply, divide\n\ndef main():\n    x = 10\n    y = 5\n\n    print(f"Adding {x} and {y}: {add(x, y)}")\n    print(f"Subtracting {y} from {x}: {subtract(x, y)}")\n    print(f"Multiplying {x} and {y}: {multiply(x, y)}")\n    try:\n        print(f"Dividing {x} by {y}: {divide(x, y)}")\n    except ValueError as e:\n        print(f"Error: {e}")\n\nif __name__ == "__main__":\n    main()\n',
 'math_utils.py': '# math_utils.py\n\ndef add(a, b):\n    return a + b\n\ndef subtract(a, b):\n    return a - b\n\ndef multiply(a, b):\n    return a * b\n\ndef divide(a, b):\n    if b == 0:\n        raise ValueError("Cannot divide by zero.")\n    return a / b'}

In [69]:
from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableLambda
from langchain.chat_models import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
import json
import re

In [70]:
from dotenv import load_dotenv
import os
load_dotenv()

# load_dotenv() places the values from the .env file into os.environ, so the
# keys do not need to be re-assigned. Writing os.getenv(...) back into
# os.environ raised TypeError whenever a variable was unset, which stopped the
# notebook even when only one of the two providers was going to be used.
# Report missing keys instead of failing here.
for _key_name in ("OPENAI_API_KEY", "GEMINI_API_KEY"):
    if not os.getenv(_key_name):
        print(f"Warning: {_key_name} is not set.")


In [74]:
# Chat model instance used by the analysis step; change the model name or
# sampling settings here as needed.
llm = ChatOpenAI(
    model="gpt-4",
    temperature=0,
)

In [81]:
# Read the Gemini key from the environment instead of embedding it in the
# notebook, where it would be shared with every copy of the file.
# NOTE(review): the key that was hardcoded on this line has been exposed and
# should be revoked and replaced.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    api_key=os.environ["GEMINI_API_KEY"],  # KeyError names the variable if it is missing
)

In [None]:
# Verdicts the model is allowed to return (after normalisation with str.capitalize()).
_VALID_ANALYSES = ("Upgrade", "Rewrite", "No change")

# A JSON object wrapped in a Markdown code fence, with or without the "json" tag,
# located anywhere in the reply.
_FENCED_JSON_RE = re.compile(r"```(?:json)?\s*(\{.*?\})\s*```", re.DOTALL)


def _parse_analysis_response(content):
    """Turn the raw model reply into ``{"result": ..., "feedback": ...}``.

    Accepts a JSON object inside a Markdown code fence (anywhere in the reply)
    or a reply that is a bare JSON object, ignoring surrounding whitespace.
    Anything that cannot be parsed or validated yields a "Rewrite" result
    whose feedback explains the failure.
    """
    # Some chat models return non-string content; treat that as unrecognised
    # instead of failing on a missing str method.
    text = content.strip() if isinstance(content, str) else ""

    fenced = _FENCED_JSON_RE.search(text)
    if fenced:
        payload, source = fenced.group(1), "Gemini"
    elif text.startswith("{"):
        payload, source = text, "OpenAI"
    else:
        return {"result": "Rewrite", "feedback": "Unrecognized format or parsing failed."}

    try:
        parsed = json.loads(payload)
        result = parsed.get("analysis", "").capitalize()
        if result not in _VALID_ANALYSES:
            raise ValueError("Invalid 'analysis' value.")
        return {"result": result, "feedback": parsed.get("feedback", "").strip()}
    except Exception as e:
        # Best effort: a malformed reply is reported in the feedback, not raised.
        return {"result": "Rewrite", "feedback": f"Failed to parse {source} output: {e}"}


def analyze_code(state: CodeState):
    """Ask the LLM to classify the whole codebase as Upgrade / Rewrite / No change.

    Args:
        state: Mapping that must contain ``file_map`` ({filename: content}).
            Only ``file_map`` is read.

    Returns:
        dict: ``{"result": <"Upgrade" | "Rewrite" | "No change">, "feedback": <str>}``.
        When the reply cannot be parsed, ``result`` is "Rewrite" and
        ``feedback`` describes the parsing problem.

    NOTE(review): the returned keys are "result"/"feedback", while CodeState
    declares ``analysis_result``/``analysis_feedback`` -- confirm which naming
    callers expect before changing either side.
    """
    file_map = state['file_map']

    def _render(content):
        # Binary files would otherwise be interpolated as a b'...' repr, which
        # bloats the prompt without giving the model anything it can analyse.
        if isinstance(content, (bytes, bytearray)):
            return f"<binary file, {len(content)} bytes omitted>"
        return content

    combined_code = "\n\n".join(
        f"# File: {filename}\n{_render(content)}" for filename, content in file_map.items()
    )

    prompt = f"""
            You are a code modernization expert.

            You will receive a dictionary of code files. Analyze all files together.

            Your job is to classify the codebase into **one** of the following categories:
            - "Upgrade": The code works but would benefit from updates such as using a newer version of a language, improving style, adding types, or using newer APIs.
            - "Rewrite": The code is outdated, poorly structured, or written in a language/framework that is no longer suitable. It should be rewritten entirely.
            - "No change": The code is modern, clean, and follows current best practices.

            Return output in this JSON format:
            {{
            "analysis": "Upgrade" | "Rewrite" | "No change",
            "feedback": "A brief explanation of what needs improvement or why no changes are needed."
            }}

            Analyze this code base:
            {combined_code}
    """
    response = llm.invoke([HumanMessage(content=prompt)])
    return _parse_analysis_response(response.content)


In [87]:
# Step 1: Read the files under ../cobal into a {relative_path: content} mapping
file_map = extract_to_file_map("../cobal")

# Step 2: Call analyze_code node
# Only the "file_map" key is supplied; analyze_code reads nothing else from the state.
result = analyze_code({"file_map": file_map})
print(result)

Scanning directory: ..\cobal
Reading file: main.cob
Reading file: math_utils.cob
{'result': 'Rewrite', 'feedback': 'The codebase is written in COBOL, a legacy programming language.  Maintaining and extending COBOL code is increasingly difficult due to a shrinking pool of skilled developers and a lack of modern tooling and libraries.  A rewrite in a modern language like Python, Java, or C# is recommended to improve maintainability, scalability, and developer productivity.  The functionality is simple, but the underlying technology is severely outdated.'}
