In [None]:

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

print(" Starting Schema Detector Notebook...")


# Check if the notebook is being run after Notebook 01
try:
    df  # Check if df already exists
    print(" Using DataFrame from previous notebook.")
    current_df = df
except NameError:
    print(" DataFrame not found. Loading from CSV as fallback...")
    
    # Fallback: load CSV again
    fallback_path = "../data/sample.csv"
    current_df = pd.read_csv(fallback_path)
    print(f" Loaded fallback dataset from: {fallback_path}")


current_df.head()


 Starting Schema Detector Notebook...
 DataFrame not found. Loading from CSV as fallback...
 Loaded fallback dataset from: ../data/sample.csv


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [9]:
# Detect numeric, categorical, boolean, and datetime columns (fixed logic)

def detect_column_types(df):
    """
    Automatically detects schema types in the dataset.
    Returns a dictionary of column categories.
    """

    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    boolean_cols = df.select_dtypes(include=['bool']).columns.tolist()

    # Datetime detection ONLY for object/string columns
    datetime_cols = []
    for col in categorical_cols:
        try:
            parsed = pd.to_datetime(df[col], errors='raise')
            datetime_cols.append(col)
        except:
            pass  # Not datetime

    # Remove datetime columns from categorical list
    categorical_cols = [c for c in categorical_cols if c not in datetime_cols]

    schema = {
        "numeric": numeric_cols,
        "categorical": categorical_cols,
        "boolean": boolean_cols,
        "datetime": datetime_cols
    }

    print(" Corrected Column Type Detection:")
    print(schema)

    return schema


# Run schema detection again
schema = detect_column_types(current_df)



 Corrected Column Type Detection:
{'numeric': ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], 'categorical': ['Species'], 'boolean': [], 'datetime': []}


In [10]:
# Detect ID-like columns (not useful just causes overfitting , ig treating them like class roll no might help imo , doesnt tell shit about student)

def detect_id_columns(df):
    """
    Detects columns that behave like ID columns:
    - unique for each row
    - or name contains 'id'
    """

    n_rows = len(df)

    id_candidates = []

    for col in df.columns:
        # Rule 1: Unique for every row â†’ strong ID
        if df[col].nunique() == n_rows:
            id_candidates.append(col)
            continue
        
        # Rule 2: Column name contains 'id'
        if "id" in col.lower():
            id_candidates.append(col)

    print(" ID-like Columns Detected:", id_candidates)
    return id_candidates


# Run ID detection
id_cols = detect_id_columns(current_df)


 ID-like Columns Detected: ['Id', 'SepalWidthCm', 'PetalWidthCm']


In [11]:
# Detect potential target column (mostly one with less <30 unique values or last columm -- like in mcq exams u guess it base unique options)

def detect_target_column(df, id_cols):
    """
    Automatically identifies the most likely target column.
    """

    target_candidates = []

    for col in df.columns:

        # Skip ID columns (never targets)
        if col in id_cols:
            continue

        # Rule 1: If unique values are small, good for classification
        if df[col].nunique() <= 30 and df[col].dtype == 'object':
            target_candidates.append(col)

    # Rule 2: fallback - last column
    if len(target_candidates) == 0:
        target_candidates.append(df.columns[-1])

    print(" Possible Target Columns:", target_candidates)
    return target_candidates


# Run target detection
target_cols = detect_target_column(current_df, id_cols)


 Possible Target Columns: ['Species']


In [12]:
# Detect low-variance columns (cols with 99% same values , so just remove them cause nothing to learn)

def detect_low_variance_columns(df, threshold=0.99):
    """
    Identifies columns where one value dominates (low variance).
    threshold = percentage of most common value allowed.
    """

    low_var_cols = []

    for col in df.columns:
        top_frequency = df[col].value_counts(normalize=True).max()
        
        if top_frequency >= threshold:
            low_var_cols.append(col)

    print(f" Low-variance columns (>{threshold*100}% same value): {low_var_cols}")
    return low_var_cols


# Run low-variance detection
low_var_cols = detect_low_variance_columns(current_df)


 Low-variance columns (>99.0% same value): []


In [13]:
# Combine everything into one final schema dictionary

def build_schema_object(df, schema, id_cols, target_cols, low_var_cols):
    """
    Creates a unified schema dictionary containing all detected metadata.
    """

    final_schema = {
        "numeric_columns": schema["numeric"],
        "categorical_columns": schema["categorical"],
        "boolean_columns": schema["boolean"],
        "datetime_columns": schema["datetime"],
        "id_columns": id_cols,
        "target_candidates": target_cols,
        "low_variance_columns": low_var_cols,
        "n_rows": df.shape[0],
        "n_columns": df.shape[1]
    }

    print(" Final Schema Object:")
    for key, value in final_schema.items():
        print(f"{key}: {value}")

    return final_schema


# Build final schema using all detection functions
final_schema = build_schema_object(
    current_df,
    schema,
    id_cols,
    target_cols,
    low_var_cols
)


 Final Schema Object:
numeric_columns: ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
categorical_columns: ['Species']
boolean_columns: []
datetime_columns: []
id_columns: ['Id', 'SepalWidthCm', 'PetalWidthCm']
target_candidates: ['Species']
low_variance_columns: []
n_rows: 150
n_columns: 6


In [14]:
# Master function to run full schema detection pipeline

def generate_schema(df):
    """
    Runs the complete schema detection pipeline and returns a full schema dictionary.
    """

    print(" Step 1: Detecting basic column types...")
    schema = detect_column_types(df)

    print("\n Step 2: Detecting ID-like columns...")
    id_cols = detect_id_columns(df)

    print("\n Step 3: Detecting target column(s)...")
    target_cols = detect_target_column(df, id_cols)

    print("\n Step 4: Detecting low-variance columns...")
    low_var_cols = detect_low_variance_columns(df)

    print("\n Step 5: Building final schema object...")
    final_schema = build_schema_object(df, schema, id_cols, target_cols, low_var_cols)

    print("\n Schema Detection Complete!")
    return final_schema


# Run full schema pipeline on the dataset
full_schema = generate_schema(current_df)
full_schema


 Step 1: Detecting basic column types...
 Corrected Column Type Detection:
{'numeric': ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], 'categorical': ['Species'], 'boolean': [], 'datetime': []}

 Step 2: Detecting ID-like columns...
 ID-like Columns Detected: ['Id', 'SepalWidthCm', 'PetalWidthCm']

 Step 3: Detecting target column(s)...
 Possible Target Columns: ['Species']

 Step 4: Detecting low-variance columns...
 Low-variance columns (>99.0% same value): []

 Step 5: Building final schema object...
 Final Schema Object:
numeric_columns: ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
categorical_columns: ['Species']
boolean_columns: []
datetime_columns: []
id_columns: ['Id', 'SepalWidthCm', 'PetalWidthCm']
target_candidates: ['Species']
low_variance_columns: []
n_rows: 150
n_columns: 6

 Schema Detection Complete!


{'numeric_columns': ['Id',
  'SepalLengthCm',
  'SepalWidthCm',
  'PetalLengthCm',
  'PetalWidthCm'],
 'categorical_columns': ['Species'],
 'boolean_columns': [],
 'datetime_columns': [],
 'id_columns': ['Id', 'SepalWidthCm', 'PetalWidthCm'],
 'target_candidates': ['Species'],
 'low_variance_columns': [],
 'n_rows': 150,
 'n_columns': 6}