In [1]:

import json
import os
from google import genai

# The Gemini API key must come from the environment, never from the source:
# a key embedded in a notebook leaks to everyone who can read the file or its
# history. Export GOOGLE_API_KEY before starting the kernel. Any key that was
# previously committed here should be treated as compromised and rotated.
if not os.environ.get("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in your shell (or load it from a "
        "secrets manager) before running this notebook."
    )

# Initialize Gemini client (picks the key up from the environment)
client = genai.Client()


def extract_code_from_notebook(notebook_file: str) -> str:
    """Return the source of every non-blank code cell in a .ipynb file.

    Args:
        notebook_file: Path to the notebook, read as UTF-8 JSON.

    Returns:
        The code cells' sources in notebook order, separated by one blank
        line. Markdown/raw cells and whitespace-only code cells are skipped;
        a notebook without a "cells" key yields an empty string.
    """
    with open(notebook_file, "r", encoding="utf-8") as handle:
        parsed = json.load(handle)

    # Each cell's "source" is joined into one string before the blank check.
    code_sources = (
        "".join(entry.get("source", []))
        for entry in parsed.get("cells", [])
        if entry.get("cell_type") == "code"
    )
    return "\n\n".join(text for text in code_sources if text.strip())


def analyze_notebook_with_gemini(prompt: str, notebook_file: str, model: str = "gemini-2.0-flash"):
    """Ask Gemini to carry out *prompt* against a notebook's code cells.

    Args:
        prompt: Task instructions placed after the extracted code.
        notebook_file: Path to the .ipynb file whose code cells are sent.
        model: Gemini model name used to create the chat.

    Returns:
        The ``text`` of the chat response, or the string
        "Notebook <notebook_file> not found." when the file does not exist.
    """
    try:
        notebook_code = extract_code_from_notebook(notebook_file)
    except FileNotFoundError:
        return f"Notebook {notebook_file} not found."

    # The code goes first and the task last, so the task can refer to the
    # code as appearing "above".
    full_prompt = f"""
You are analyzing the notebook: {notebook_file}.

Here are the extracted Python code cells:
{notebook_code}

User task:
{prompt}
"""

    # One fresh chat per call: a single message is sent and its text returned.
    session = client.chats.create(model=model)
    return session.send_message(full_prompt).text


# Example usage:
# Audit prompt sent as the "User task". It asks for exactly one tab-separated
# table of 11 fixed rows. The few-shot "Expected TSV" block uses real tab
# characters, and its header must match the header required under TASK so the
# model is not shown two conflicting column names.
my_prompt = """ROLE: You are a meticulous code auditor. The extracted Python code from the notebook appears ABOVE. 

TASK: Output ONE tab-separated table ONLY with columns:
Data Wrangling Step<TAB>Technique Used<TAB>Details
No extra text.

ALLOWED LABELS (use exactly):
- Check for balanced data: Yes / No
- Sampling type: Random / Stratified / Oversampling / None
- Outliers removal: Yes / No
- Check for duplicates: Yes / No
- Imputation of missing values: ignore / replace with text / use summary statistics / mixture of imputation techniques / multivariate / drop the missing value rows / drop the missing value columns / none
- Drop columns: Yes / No
- Encoding: Label Encoder / One hot encoding / catboost / mixture of encoding / dummy / none
- Create new columns: Yes / No
- Feature selection: Yes / No
- Data scaling/standardisation: Yes / No
- Hyperparameter tuning: Yes / No

CRITICAL RULES:
1) Create new columns = No when columns are derived from existing ones (pd.get_dummies/OneHotEncoder/arithmetic/parsing/type-casts/renames). Mark Yes only if truly new info not previously present.
2) Drop columns = Yes only if a column is removed WITHOUT replacement (its information is not used to create other columns). If used to derive features and then dropped → Drop columns = No.
3) Columns dropped AFTER EDA/visualisation/correlation/model-importance count as Feature selection = Yes (not Drop columns). Using correlation to prune features is feature selection.
4) Log transforms are NOT scaling. Scaling = StandardScaler/MinMax/Robust/etc.
5) Hyperparameter tuning = Yes only for systematic search (GridSearchCV/RandomizedSearchCV/Optuna/Bayesian). Manually setting parameters ≠ tuning.
6) Sampling: 
   - train_test_split without stratify → Random
   - train_test_split(..., stratify=target) or StratifiedKFold → Stratified
   - SMOTE/undersampling/etc. → Oversampling
   - none → None
7) If multiple attempts appear, report the LAST operation that is actually used downstream.

FEW-SHOT EXAMPLE (format to follow):
Example Code:

y = df["target"]
import seaborn as sns; sns.countplot(x=y)                # balance check
X = df.drop(columns=["target"])
X["amt_log"] = np.log1p(X["amt"])                        # log ≠ scaling
X = pd.get_dummies(X, columns=["cat"])                   # derived dummies
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y)          # Random
# Post-EDA pruning:
X_tr = X_tr.drop(columns=["highly_correlated_col"])      # feature selection

Expected TSV:
Data Wrangling Step	Technique Used	Details
Check for balanced data	Yes	countplot on target
Sampling type	Random	train_test_split without stratify
Outliers removal	No	No explicit filtering in code
Check for duplicates	No	No duplicated/drop_duplicates used
Imputation of missing values	none	No fillna/imputer/dropna
Drop columns	No	Original 'cat' removed only via get_dummies → replacement
Encoding	One hot encoding	pd.get_dummies on 'cat'
Create new columns	No	Derived (log, dummies) per rule
Feature selection	Yes	Post-EDA drop of 'highly_correlated_col'
Data scaling/standardisation	No	No scaler used; log ≠ scaling
Hyperparameter tuning	No	No Grid/Random/Optuna

OUTPUT ORDER (exactly these 11 rows):
Check for balanced data
Sampling type
Outliers removal
Check for duplicates
Imputation of missing values
Drop columns
Encoding
Create new columns
Feature selection
Data scaling/standardisation
Hyperparameter tuning

SELF-CORRECTION (do once, silently): Before output, ensure you did NOT mark
- Create new columns = Yes for derived/log/dummies/parsed/renamed,
- Drop columns = Yes when the column’s info was used elsewhere.
If violated, fix and output only the corrected TSV.



"""

# Notebook to audit; a relative path is resolved against the working directory.
my_notebook = "Dataset 4 Recipe 7.ipynb"

# Run the audit with the default model and show the reply, or the
# "not found" message if the notebook file is missing.
output = analyze_notebook_with_gemini(prompt=my_prompt, notebook_file=my_notebook)
print(output)




Data Wrangling Step	Technique Used	Details
Check for balanced data	No	No balance check performed
Sampling type	Random	train_test_split without stratify
Outliers removal	No	Outliers visualized, but not removed
Check for duplicates	No	No duplicate check performed
Imputation of missing values	none	No missing values handling
Drop columns	Yes	'No' column dropped without replacement
Encoding	none	No encoding performed
Create new columns	No	No new columns created
Feature selection	No	No feature selection performed
Data scaling/standardisation	Yes	StandardScaler used in pipeline
Hyperparameter tuning	No	No hyperparameter tuning performed

