In [None]:

import json
import os
from google import genai

# Paste your Gemini API key between the quotes, or leave it empty to rely on a
# GOOGLE_API_KEY that is already set in the environment. Do not commit a real
# key to version control.
api_key = ""
if api_key:
    # Export only a real value: assigning the empty placeholder would overwrite
    # a key that was configured outside this script.
    os.environ["GOOGLE_API_KEY"] = api_key

# Initialize Gemini client
# NOTE(review): Client() receives no explicit key, so it presumably reads it
# from the environment -- confirm against the google-genai documentation.
client = genai.Client()


def extract_code_from_notebook(notebook_file: str) -> str:
    """Return the source of every non-empty code cell in a notebook file.

    The file is parsed as JSON. Cells whose ``cell_type`` is ``"code"`` have
    their ``source`` fragments concatenated; cells that contain only
    whitespace are skipped. The remaining cells are separated by one blank
    line in the result.

    Args:
        notebook_file: Path to the notebook (.ipynb) file, read as UTF-8.

    Returns:
        The concatenated code, or an empty string when the notebook has no
        non-empty code cells.

    Raises:
        FileNotFoundError: If ``notebook_file`` does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(notebook_file, "r", encoding="utf-8") as handle:
        notebook = json.load(handle)

    # Lazily build the text of each code cell; markdown/raw cells are ignored.
    cell_sources = (
        "".join(entry.get("source", []))
        for entry in notebook.get("cells", [])
        if entry.get("cell_type") == "code"
    )

    # Drop cells that are blank after stripping, then separate with a blank line.
    return "\n\n".join(text for text in cell_sources if text.strip())


def analyze_notebook_with_gemini(prompt: str, notebook_file: str, model: str = "gemini-2.0-flash"):
    """Ask a Gemini model to carry out a task on a notebook's code cells.

    The code is pulled from ``notebook_file`` with
    ``extract_code_from_notebook``, combined with ``prompt`` into a single
    request, and sent as the first message of a new chat created on the
    module-level ``client``.

    Args:
        prompt: The user's task, placed under "User task:" in the request.
        notebook_file: Path to the notebook whose code cells are analyzed.
        model: Name of the model the chat is created with.

    Returns:
        The ``text`` attribute of the model's response, or the string
        "Notebook <notebook_file> not found." when the file does not exist.
    """
    try:
        notebook_code = extract_code_from_notebook(notebook_file)
    except FileNotFoundError:
        # A missing file is reported as a plain message instead of raising.
        return f"Notebook {notebook_file} not found."

    request = f"""
You are analyzing the notebook: {notebook_file}.

Here are the extracted Python code cells:
{notebook_code}

User task:
{prompt}
"""

    # Open a fresh chat for this request and send the whole prompt in one message.
    reply = client.chats.create(model=model).send_message(request)
    return reply.text


# Enter your prompt here: the text between the triple quotes is passed to the
# model as the "User task" section of the request.
my_prompt = """
"""

# Enter the notebook location here: path to the .ipynb file to analyze.
my_notebook = ""

# Run the analysis and print whatever text the function returns.
output = analyze_notebook_with_gemini(my_prompt, my_notebook)
print(output)




Data Wrangling Step	Technique Used	Details
Check for balanced data	No	No explicit balance check performed
Sampling type	Random	train_test_split without stratify
Outliers removal	No	No explicit outlier removal
Check for duplicates	No	No duplicate check performed
Imputation of missing values	ignore	Missing values are present, but no explicit handling is performed
Drop columns	No	'target' column is dropped only to define target/features
Encoding	mixture of encoding	Object columns are converted to categorical type, then OrdinalEncoder is used
Create new columns	No	'Xv_enc' is not used later for training and prediction
Feature selection	Yes	Features are dropped based on model importance
Data scaling/standardisation	No	No standard scaling or other scaling performed
Hyperparameter tuning	Yes	Optuna is used via lgb.train to select parameters

