<a href="https://colab.research.google.com/github/Bhawana874/LLM-based-Data-Documentation-Generator/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip -q install pandas numpy python-dotenv gradio transformers accelerate sentencepiece




[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.1/400.1 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.5/296.5 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m679.7/679.7 kB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.4/105.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.3/43.3 kB[0m [31m3.1 MB/s[0m eta [36

In [5]:
# =========================
# 1) Imports & Utilities
# =========================
import os, re, json, textwrap, hashlib
import pandas as pd
import numpy as np
from io import StringIO
from datetime import datetime
from typing import Dict, Any, List

# Optional: richer profiling summary (we'll cherry-pick a few safe stats)
from ydata_profiling import ProfileReport

# LLM (local fallback)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# UI
import gradio as gr

# For Colab file dialogs
try:
    from google.colab import files  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

print("Colab detected:", IN_COLAB)


Colab detected: True


In [6]:
# =========================
# 3) Lightweight PII Detection Helpers
# =========================

EMAIL_RE   = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b")
PHONE_RE   = re.compile(r"\b(\+?\d{1,3}[-.\s]?)?\d{10}\b")
AADHAAR_RE = re.compile(r"\b\d{4}\s?\d{4}\s?\d{4}\b")         # simple heuristic
PAN_RE     = re.compile(r"\b[A-Z]{5}\d{4}[A-Z]\b")
IP_RE      = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")

def detect_pii_series(s: pd.Series, sample_n: int = 200) -> Dict[str, bool]:
    """Heuristic PII scan on a sample of values in a column."""
    flags = {"email": False, "phone": False, "aadhaar": False, "pan": False, "ip": False}
    if s.dtype == object:
        sample_vals = s.dropna().astype(str).head(sample_n)
        text = "\n".join(sample_vals)
        flags["email"] = bool(EMAIL_RE.search(text))
        flags["phone"] = bool(PHONE_RE.search(text))
        flags["aadhaar"] = bool(AADHAAR_RE.search(text))
        flags["pan"]    = bool(PAN_RE.search(text))
        flags["ip"]     = bool(IP_RE.search(text))
    return flags

def dtype_human(d: Any) -> str:
    if pd.api.types.is_integer_dtype(d): return "integer"
    if pd.api.types.is_float_dtype(d):   return "float"
    if pd.api.types.is_bool_dtype(d):    return "boolean"
    if pd.api.types.is_datetime64_any_dtype(d): return "datetime"
    return "string"


In [7]:
# =========================
# 4) Dataset Loaders
# =========================

def load_csv_interactive() -> pd.DataFrame:
    if not IN_COLAB:
        raise RuntimeError("Interactive upload only works in Colab. Set IN_COLAB=True manually if needed.")
    print("Choose a CSV file...")
    uploaded = files.upload()
    if not uploaded:
        raise RuntimeError("No file uploaded")
    fname = list(uploaded.keys())[0]
    df = pd.read_csv(fname)
    print("Loaded:", fname, "shape:", df.shape)
    return df

def load_csv_path(path: str) -> pd.DataFrame:
    return pd.read_csv(path)

# You can extend this to DBs (Postgres, Snowflake) via SQLAlchemy if needed.


In [8]:
# =========================
# 5) Schema & Quality Summary
# =========================

def summarize_dataframe(df: pd.DataFrame, sample_rows: int = 5) -> Dict[str, Any]:
    summary = {
        "n_rows": int(df.shape[0]),
        "n_cols": int(df.shape[1]),
        "columns": []
    }
    for col in df.columns:
        s = df[col]
        info = {
            "name": col,
            "dtype": dtype_human(s.dtype),
            "null_pct": float(s.isna().mean() * 100),
            "unique_pct": float(s.nunique(dropna=True) / max(len(s),1) * 100),
            "example_values": [str(v) for v in s.dropna().unique()[:sample_rows]],
            "pii_flags": detect_pii_series(s),
        }
        # basic stats for numeric
        if pd.api.types.is_numeric_dtype(s):
            desc = s.describe()
            info["min"] = float(desc.get("min", np.nan)) if "min" in desc else None
            info["max"] = float(desc.get("max", np.nan)) if "max" in desc else None
            info["mean"] = float(desc.get("mean", np.nan)) if "mean" in desc else None
        summary["columns"].append(info)
    return summary

def generate_profile_html(df: pd.DataFrame, out_html="profile_report.html"):
    # optional heavy report for your own review
    profile = ProfileReport(df, title="Data Profile", minimal=True)
    profile.to_file(out_html)
    return out_html


In [12]:
# =========================
# 6) LLM Wrappers
# =========================

SYSTEM_INSTRUCTIONS = """You are an expert data documentation assistant.
You produce concise, crystal-clear README documentation for tabular datasets used by data teams.
Use plain language, avoid hype, and include concrete details from the provided schema summary."""

def prompt_for_dataset_docs(dataset_name: str, summary: Dict[str, Any]) -> str:
    """Compose a prompt for the LLM."""
    schema_json = json.dumps(summary, indent=2)
    user_prompt = f"""
Dataset name: {dataset_name}

You are given a JSON schema/quality summary of a tabular dataset.
Write a professional README-style documentation with these sections:

1) Overview (what the dataset represents; typical use cases)
2) Data Fields (field-by-field bullet points: meaning, type, units if any, common ranges, PII flags if detected)
3) Data Quality (null %, uniqueness, basic issues like outliers or schema drift risk)
4) Example Queries / Usage (2-3 practical examples in SQL or pandas)
5) Compliance & Privacy (call out columns that may contain PII and suggest masking/anonymization)
6) Change Management (how to track versioning/lineage; tips for stable downstream use)

Constraints:
- Be accurate and non-speculative; if unknown, say 'not specified'.
- Keep it under ~600 words.
- Keep a crisp, friendly tone for data engineers/analysts.

Schema summary JSON:
{schema_json}
"""
    return user_prompt.strip()

def run_llm(prompt: str) -> str:
    global USE_OPENAI
    if USE_OPENAI:
        resp = openai_client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_INSTRUCTIONS},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
        )
        return resp.choices[0].message.content.strip()
    else:
        # Local FLAN-T5 fallback
        max_in = 1024
        tokens = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_in)
        output = model.generate(
            **tokens,
            max_new_tokens=512,
            temperature=0.2,
            do_sample=False
        )
        text = tokenizer.decode(output[0], skip_special_tokens=True)
        return text.strip()



In [10]:
# =========================
# 7) Markdown Renderer & Saver
# =========================

def render_markdown(dataset_name: str, body: str) -> str:
    header = f"# {dataset_name} — Dataset Documentation\n\n_Last generated: {datetime.utcnow().isoformat()}Z_\n\n"
    return header + body + "\n"

def save_markdown(md_text: str, fname: str = "README.md") -> str:
    with open(fname, "w", encoding="utf-8") as f:
        f.write(md_text)
    print("Saved:", fname)
    return fname


In [14]:
# =========================
# Configuration
# =========================

# Set to True to use OpenAI (requires OPENAI_API_KEY set in Colab secrets)
USE_OPENAI = False

# If using OpenAI, specify the model
OPENAI_MODEL = "gpt-4o-mini" # or gpt-3.5-turbo, etc.

# If using local LLM, load the model and tokenizer
if not USE_OPENAI:
    print("Loading local LLM (FLAN-T5)...")
    try:
        tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
        model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
        print("Local LLM loaded.")
    except Exception as e:
        print(f"Error loading local LLM: {e}")
        print("Please ensure you have enough memory (consider a high-RAM Colab instance).")
        # Fallback or error handling might be needed here
        tokenizer = None
        model = None

Loading local LLM (FLAN-T5)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Local LLM loaded.


In [15]:
# =========================
# 8) End-to-End: Upload → Profile → Docs
# =========================

# A. Try a sample dataset if you don't have one
sample_csv = """customer_id,signup_date,churn_flag,email,plan,monthly_fee
C001,2023-01-05,0,alice@example.com,Basic,9.99
C002,2023-02-10,1,bob@example.com,Pro,19.99
C003,2023-02-12,0,charlie@example.com,Basic,9.99
C004,2023-03-01,0,diana@example.com,Enterprise,49.00
C005,2023-03-15,1,ed@example.com,Pro,19.99
"""

df = pd.read_csv(StringIO(sample_csv))
print("Sample df shape:", df.shape)
display(df.head())

# B. Replace with your own upload if you want
# df = load_csv_interactive()

dataset_name = "Customer_Subscriptions"
summary = summarize_dataframe(df)
doc_prompt = prompt_for_dataset_docs(dataset_name, summary)
doc_text = run_llm(doc_prompt)
md = render_markdown(dataset_name, doc_text)
print(md[:800], "...\n")  # preview

save_markdown(md, "README.md")

# Optional: full profile (HTML) for your own review
# generate_profile_html(df, "profile_report.html")


Sample df shape: (5, 6)


Unnamed: 0,customer_id,signup_date,churn_flag,email,plan,monthly_fee
0,C001,2023-01-05,0,alice@example.com,Basic,9.99
1,C002,2023-02-10,1,bob@example.com,Pro,19.99
2,C003,2023-02-12,0,charlie@example.com,Basic,9.99
3,C004,2023-03-01,0,diana@example.com,Enterprise,49.0
4,C005,2023-03-15,1,ed@example.com,Pro,19.99


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


# Customer_Subscriptions — Dataset Documentation

_Last generated: 2025-08-24T13:53:58.535200Z_


 ...

Saved: README.md


  header = f"# {dataset_name} — Dataset Documentation\n\n_Last generated: {datetime.utcnow().isoformat()}Z_\n\n"


'README.md'

In [16]:
# =========================
# 9) Gradio App: Upload → Auto Doc
# =========================

def generate_docs_from_csv(file_obj) -> str:
    try:
        df = pd.read_csv(file_obj.name)
    except Exception:
        # Fallback: some browsers give bytes-like object
        file_obj.seek(0)
        df = pd.read_csv(file_obj)
    dataset_name = os.path.splitext(os.path.basename(getattr(file_obj, "name", "Uploaded_Dataset")))[0]
    summary = summarize_dataframe(df)
    prompt = prompt_for_dataset_docs(dataset_name, summary)
    body = run_llm(prompt)
    md = render_markdown(dataset_name, body)
    # Persist
    digest = hashlib.sha256(md.encode("utf-8")).hexdigest()[:8]
    fname = f"README_{dataset_name}_{digest}.md"
    save_markdown(md, fname)
    return md

with gr.Blocks() as demo:
    gr.Markdown("# LLM-based Data Documentation Generator")
    with gr.Row():
        file = gr.File(label="Upload CSV")
    btn = gr.Button("Generate Documentation")
    out = gr.Markdown(label="README Preview")
    btn.click(fn=generate_docs_from_csv, inputs=file, outputs=out)

demo.launch(debug=False)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6258ce521cb7c41607.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


