In [None]:
# Cell 2: Imports
import os, json, re, warnings
from collections import defaultdict, Counter
from pathlib import Path

import pandas as pd
import numpy as np
from pypdf import PdfReader
import tabula

# ML & XAI
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from scipy.stats import chi2_contingency, spearmanr

import shap
from lime.lime_tabular import LimeTabularExplainer

# UI
from IPython.display import display, Markdown, HTML, clear_output
import ipywidgets as widgets

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation/convergence warnings raised by the
# libraries imported above — consider narrowing the filter.
warnings.filterwarnings('ignore')
print("‚úì Imports complete")

‚úì Imports complete


In [None]:
# Cell 3: Initialize Groq client

import os

# Read the key from the environment only (never hard-code it). os.getenv returns
# None when the variable is unset, so fall back to "" before stripping.
GROQ_API_KEY = (os.getenv("GROQ_API_KEY") or "").strip()
groq_api_key = GROQ_API_KEY  # lower-case name this cell originally assigned; kept as an alias

if not GROQ_API_KEY:
    raise ValueError("‚ùå ERROR: Groq API key cannot be empty")

from groq import Groq
os.environ["GROQ_API_KEY"] = GROQ_API_KEY  # re-export the stripped value
client = Groq(api_key=GROQ_API_KEY)

# Choose best available model
def choose_groq_model(client):
    """Pick the most preferred model id that the account can actually use.

    Args:
        client: object exposing ``client.models.list()``.

    Returns:
        str: the first entry of the preference list that is available; otherwise
        the first available model id; otherwise the top preference when the
        listing is empty or the request fails.
    """
    candidates = ["llama-3.3-70b-versatile", "llama-3.2-90b-vision-preview", "llama-3.1-8b-instant"]
    try:
        listing = client.models.list()
        # Accept every shape the listing may take: a response object with a
        # `.data` list, a dict with a "data" key, or a plain iterable of entries.
        if isinstance(listing, dict):
            entries = listing.get("data", [])
        else:
            entries = getattr(listing, "data", listing)

        available = []
        for entry in entries or []:
            model_id = entry.get("id") if isinstance(entry, dict) else getattr(entry, "id", None)
            if model_id:
                available.append(model_id)
    except Exception as exc:
        # Best effort: an unreachable API must not stop the notebook, but say why
        # the default was chosen instead of failing silently.
        print(f"Could not list Groq models ({exc}); defaulting to {candidates[0]}")
        return candidates[0]

    for candidate in candidates:
        if candidate in available:
            return candidate
    return available[0] if available else candidates[0]

# Resolve the model once; `groq_model` is read later by the rule-extraction cell.
groq_model = choose_groq_model(client)
print(f"‚úì Groq client initialized (model: {groq_model})")

‚úì Groq client initialized (model: llama-3.3-70b-versatile)


In [None]:
# Cell 4: Upload Dataset (VS Code version)
import ipywidgets as widgets
from IPython.display import display, clear_output
import io

# Single-file picker restricted to CSV and Excel extensions.
upload_widget = widgets.FileUpload(
    accept='.csv,.xlsx,.xls',
    multiple=False,
    description='Upload Dataset'
)

# Output area that the on_upload callback prints into.
output = widgets.Output()

def on_upload(change):
    """Load the uploaded CSV/Excel file into the global ``df_raw``.

    Registered as an observer of ``upload_widget.value``. Both shapes of
    ``FileUpload.value`` are supported: a dict keyed by filename (ipywidgets 7)
    and a tuple of per-file dicts that carry a ``name`` entry (ipywidgets 8).

    Args:
        change: change notification from the widget (unused; the widget value
            is read directly).

    Side effects:
        Sets the globals ``df_raw`` and ``dataset_path`` on success and prints
        a summary plus a preview into ``output``. On a read failure the globals
        are left untouched.
    """
    global df_raw, dataset_path

    with output:
        clear_output()
        uploads = upload_widget.value
        if not uploads:
            print("‚ùå No file uploaded")
            return

        # Take the first (only) uploaded file in whichever container shape we got.
        if isinstance(uploads, dict):
            filename, uploaded_file = next(iter(uploads.items()))
        else:
            uploaded_file = uploads[0]
            filename = uploaded_file['name']
        # bytes() normalises a memoryview payload to plain bytes.
        content = bytes(uploaded_file['content'])

        # Load into DataFrame; only publish it once parsing succeeded.
        try:
            if filename.lower().endswith('.csv'):
                loaded = pd.read_csv(io.BytesIO(content))
            else:
                loaded = pd.read_excel(io.BytesIO(content), sheet_name=0)
        except Exception as exc:
            print(f"ERROR: could not read {filename}: {exc}")
            return

        df_raw = loaded
        dataset_path = filename

        print(f"‚úì Dataset uploaded: {filename}")
        print(f"‚úì Loaded: {df_raw.shape[0]} rows, {df_raw.shape[1]} columns")
        print(f"Columns: {df_raw.columns.tolist()}")
        display(df_raw.head())

# Run on_upload whenever the widget's `value` trait changes.
upload_widget.observe(on_upload, names='value')

# Show the picker with its message area stacked underneath.
display(widgets.VBox([upload_widget, output]))


ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Cell 5: Identify ID Columns
def identify_id_columns(df):
    """
    Automatically identify ID columns that should not be predicted.

    A column is reported when any of these holds:
    - its name contains ``id`` as a separate word/token ("id", "user_id",
      "Order ID", "CustomerID", "customerId") — not merely as a substring, so
      "Width" or "Paid" are not matched;
    - more than 95% of its values are unique, unless it is a float column with
      fractional values (continuous measurements are not identifiers);
    - it is an integer column where more than 90% of consecutive differences are 1.

    Args:
        df: DataFrame to inspect.

    Returns:
        list: matching column labels, in ``df.columns`` order.
    """
    id_columns = []
    n_rows = len(df)

    for col in df.columns:
        series = df[col]

        # Check 1: name carries an "id" token
        if _name_has_id_token(col):
            id_columns.append(col)
            continue

        # Check 2: High uniqueness (>95% unique values)
        if n_rows > 0 and _could_hold_identifiers(series):
            if series.nunique() / n_rows > 0.95:
                id_columns.append(col)
                continue

        # Check 3: Sequential integer values (any integer width)
        if pd.api.types.is_integer_dtype(series):
            non_null = series.dropna()
            if len(non_null) > 1:
                diffs = non_null.diff().dropna()
                if len(diffs) > 0 and (diffs == 1).sum() / len(diffs) > 0.9:
                    id_columns.append(col)

    return id_columns


# Name patterns, tried in order: delimited token, camelCase suffix, "ID" prefix.
_ID_NAME_PATTERNS = (
    re.compile(r'(?:^|[^A-Za-z0-9])id(?:$|[^A-Za-z0-9])', re.IGNORECASE),
    re.compile(r'[a-z0-9](?:ID|Id)(?![a-z])'),
    re.compile(r'^ID(?=[A-Z][a-z])'),
)


def _name_has_id_token(name):
    """Return True when the column label contains 'id' as its own token."""
    text = str(name)  # labels may be ints or other non-string objects
    return any(pattern.search(text) for pattern in _ID_NAME_PATTERNS)


def _could_hold_identifiers(series):
    """Return False only for float columns that contain fractional values."""
    if not pd.api.types.is_float_dtype(series):
        return True
    values = series.dropna()
    # Whole-number floats are usually integer ids that were upcast because of NaNs.
    return len(values) > 0 and bool((values % 1 == 0).all())

# Requires `df_raw`, which the upload callback sets once a file has been chosen.
id_columns = identify_id_columns(df_raw)

print("üîç Identified ID columns (will be excluded from predictions):")
if id_columns:
    for col in id_columns:
        print(f"  - {col}")
else:
    print("  - None found")

# Get non-ID columns for analysis
# (kept in df_raw column order; read by the later dependency and rule-extraction cells)
non_id_columns = [col for col in df_raw.columns if col not in id_columns]
print(f"\n‚úì {len(non_id_columns)} columns available for prediction")

üîç Identified ID columns (will be excluded from predictions):
  - CustomerID

‚úì 7 columns available for prediction


In [None]:
# Cell 6: Upload Rules Document (REQUIRED for outlier detection)
print("üìÑ Upload rules document (PDF/CSV/XLSX/TXT)")
print("This is REQUIRED for outlier detection (min/max, allowed values)\n")

try:
    # Colab-only upload helper; it does not exist in VS Code / plain Jupyter.
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    rules_uploaded = files.upload()
    rules_doc_path = list(rules_uploaded.keys())[0] if rules_uploaded else None
else:
    # Outside Colab, ask for the path of a file that is already on disk.
    rules_uploaded = {}
    _typed_path = input("Path to rules document (leave blank to skip): ").strip()
    rules_doc_path = _typed_path if _typed_path and os.path.isfile(_typed_path) else None
    if _typed_path and rules_doc_path is None:
        print(f"File not found: {_typed_path}")

if rules_doc_path:
    print(f"‚úì Rules document uploaded: {rules_doc_path}")
else:
    print("‚ö†Ô∏è No rules document - outlier detection will be skipped")
    print("Only missing values will be detected")

üìÑ Upload rules document (PDF/CSV/XLSX/TXT)
This is REQUIRED for outlier detection (min/max, allowed values)



Saving limitations.csv to limitations.csv
‚úì Rules document uploaded: limitations.csv


In [None]:
# Cell 7: Process Rules Document
def extract_rules_from_file(rules_doc_path, columns):
    """
    Load validation rules from an uploaded document.

    Strategy:
      1. Read the file as a table (CSV, first Excel sheet, or first PDF table
         via tabula) and return it unchanged when it already has all six rule
         columns.
      2. Otherwise send the document text to the LLM (module-level ``client`` /
         ``groq_model``) and parse the CSV it replies with.

    Args:
        rules_doc_path: path of a PDF/CSV/XLSX/TXT file on disk.
        columns: dataset column names the rules may refer to.

    Returns:
        DataFrame with columns Column, dtype, min, max, allowed_values,
        constraints, or None when no rules could be extracted. In the LLM path
        blank fields are stored as NaN (the same way pandas reads blank CSV
        cells) and rows that do not name a known dataset column are dropped.
    """
    expected_cols = ['Column', 'dtype', 'min', 'max', 'allowed_values', 'constraints']

    # Step 1: direct structured read
    try:
        rules_df = _read_rules_table(rules_doc_path)
        if all(col in rules_df.columns for col in expected_cols):
            print("‚úì Parsed rules as direct CSV file")
            return rules_df
    except Exception as e:
        print(f"Direct CSV parsing failed: {e}")

    # Step 2: fallback to LLM extraction
    print("Extracting rules using LLM...")

    doc_text = _read_rules_text(rules_doc_path)
    if not doc_text.strip():
        print("‚ö†Ô∏è No text found in document")
        return None

    prompt = build_extraction_prompt(doc_text, columns)

    try:
        resp = client.chat.completions.create(
            model=groq_model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=2000
        )
        csv_text = (resp.choices[0].message.content or "").strip()
        rules_df = _parse_llm_rules(csv_text, expected_cols, columns)
    except Exception as e:
        print(f"‚ö†Ô∏è Rule extraction failed: {e}")
        return None

    if rules_df is None:
        print("‚ö†Ô∏è No rules extracted")
        return None

    print("‚úì Extracted rules using LLM")
    return rules_df


def _read_rules_table(path):
    """Read the rules document as a table; raises when no table can be read."""
    lowered = path.lower()
    if lowered.endswith('.pdf'):
        # Imported lazily so CSV/Excel rules do not require tabula (and Java).
        import tabula
        tables = tabula.read_pdf(path, pages='all', multiple_tables=True)
        if not tables:
            raise ValueError("No tables found in PDF")
        return tables[0]
    if lowered.endswith(('.xlsx', '.xls')):
        return pd.read_excel(path, sheet_name=0)
    return pd.read_csv(path, on_bad_lines='skip')


def _read_rules_text(path):
    """Return the document as plain text for the LLM prompt ('' if unreadable Excel)."""
    lowered = path.lower()
    if lowered.endswith('.pdf'):
        from pypdf import PdfReader
        # extract_text() may yield nothing for image-only pages; treat that as "".
        return "".join((page.extract_text() or "") + "\n" for page in PdfReader(path).pages)
    if lowered.endswith(('.xlsx', '.xls')):
        # Workbooks are binary; hand the LLM the first sheet rendered as CSV text.
        try:
            return pd.read_excel(path, sheet_name=0).to_csv(index=False)
        except Exception:
            return ""
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()


def _parse_llm_rules(csv_text, expected_cols, columns):
    """Turn the LLM's CSV reply into a rules DataFrame, or None if no usable row."""
    import csv

    width = len(expected_cols)
    # Case-insensitive lookup back to the dataset's own spelling of each column.
    known = {str(name).strip().lower(): name
             for name in ([] if columns is None else list(columns))}

    rows = []
    for line in csv_text.splitlines():
        line = line.strip()
        if not line or line.startswith('```'):
            continue  # blank line or markdown code fence
        try:
            fields = [field.strip() for field in next(csv.reader([line], delimiter=','))]
        except (csv.Error, StopIteration):
            continue
        if [field.lower() for field in fields[:2]] == ['column', 'dtype']:
            continue  # header echoed back despite the instructions
        if known:
            canonical = known.get(fields[0].lower())
            if canonical is None:
                continue  # preamble/prose line or a column not in the dataset
            fields[0] = canonical
        rows.append(fields[:width] + [''] * (width - len(fields)))  # pad to 6 fields

    if not rows:
        return None
    # Blank -> NaN so downstream pd.notna()/float() checks behave as for real CSVs.
    return pd.DataFrame(rows, columns=expected_cols).replace('', np.nan)

# Process the uploaded rules document
# NOTE(review): extract_rules_from_file calls build_extraction_prompt on its LLM
# fallback path, and that helper is defined in the cell after this one — on a
# fresh kernel the fallback needs that cell to have been run first.
rules_df = None
if rules_doc_path:
    rules_df = extract_rules_from_file(rules_doc_path, non_id_columns)
    
    if rules_df is not None:
        print(f"‚úì Rules extracted: {len(rules_df)} rules")
        display(rules_df)
        
        # Save for download
        rules_df.to_csv('extracted_rules.csv', index=False)
    else:
        print("‚ö†Ô∏è No rules extracted - proceeding with statistical outlier detection only")
else:
    print("‚ö†Ô∏è No rules document - proceeding with statistical outlier detection only")

In [None]:
def build_extraction_prompt(doc_text, columns, max_chars=3000):
    """Build prompt for LLM to extract rules.

    Args:
        doc_text: full text of the rules document.
        columns: dataset column labels (any type; rendered with ``str``).
        max_chars: how many leading characters of ``doc_text`` to embed, to
            bound the prompt size.

    Returns:
        str: prompt asking for six-field CSV rows
        (Column,dtype,min,max,allowed_values,constraints). Every example row in
        the prompt has exactly six fields in that order, so the model is not
        taught to put allowed values into the constraints field.
    """
    column_list = ', '.join(str(name) for name in columns)
    return f"""Extract data validation rules from this document and output ONLY a CSV table with one row per column.

Dataset columns: {column_list}

Document:
{doc_text[:max_chars]}

Output CSV format (no other text, no headers, just data):
Column,dtype,min,max,allowed_values,constraints

Instructions:
- Column: exact column name from the dataset columns list above
- dtype: 'integer', 'float', 'categorical', 'text', or 'boolean'
- min: numeric minimum value for integer/float columns (leave blank for categorical/text/boolean)
- max: numeric maximum value for integer/float columns (leave blank for categorical/text/boolean)
- allowed_values: pipe-separated list of valid values for categorical columns only, like "Male|Female|Other" (leave blank for numeric columns)
- constraints: additional constraints for numeric columns, like "positive", "negative", "non-negative", "non-positive" (leave blank if none)

Examples:
- For a numeric column "Age": Age,integer,0,120,,positive
- For a categorical column "Gender": Gender,categorical,,,Male|Female|Other,
- For a text column with no rules: Description,text,,,,

Only include columns that have rules mentioned in the document. Leave fields blank if not specified.
"""

Extracting rules from document...
‚úì Extracted 8 rules


Unnamed: 0,Column,dtype,min,max,allowed_values
0,CustomerID,,,,
1,Gender,categorical,,,Male|Female|Other
2,Age,integer,18.0,70.0,
3,Annual Income ($),float,10000.0,300000.0,
4,Spending Score (1-100),float,1.0,100.0,
5,Profession,categorical,,,Engineer|Doctor|Lawyer|Artist|Other
6,Work Experience,integer,0.0,50.0,
7,Family Size,integer,1.0,10.0,


üíæ Saved to extracted_rules.csv


In [None]:
# Cell 8: Analyze Column Dependencies (excluding ID columns)
def analyze_column_dependencies(df, non_id_cols, threshold=0.1, random_state=42):
    """
    Find which columns depend on which other columns.
    Only analyzes non-ID columns.

    Score used for each (predictor, target) pair:
    - categorical / categorical: Cramér's V (effect size in [0, 1]); a raw
      ``1 - p_value`` would exceed the threshold for almost any pair
    - numeric predictor, categorical target: mutual information (classification)
    - categorical predictor, numeric target: mutual information (regression,
      predictor marked as discrete)
    - numeric / numeric: absolute Spearman correlation

    Args:
        df: source DataFrame.
        non_id_cols: column labels to analyse as both targets and predictors.
        threshold: minimum score for a predictor to count as a dependency.
        random_state: seed for the mutual-information estimators so repeated
            runs return the same result.

    Returns: dict where key=target_column, value=list of dependent columns,
        sorted by score (highest first). Targets with fewer than 10 non-null
        values map to an empty list.
    """
    dependencies = {}

    for target_col in non_id_cols:
        if df[target_col].notna().sum() < 10:
            dependencies[target_col] = []
            continue

        scored = []
        for predictor_col in non_id_cols:
            if predictor_col == target_col:
                continue

            # Rows where both columns are present
            pair = df[[target_col, predictor_col]].dropna()
            if len(pair) < 10:
                continue

            try:
                score = _dependency_score(pair[predictor_col], pair[target_col], random_state)
            except Exception as exc:
                # Best effort: one unscorable pair must not abort the analysis,
                # but report it instead of hiding it.
                print(f"  (skipped {predictor_col} -> {target_col}: {exc})")
                continue

            # NaN scores (e.g. a constant column) compare False and are dropped.
            if score > threshold:
                scored.append((predictor_col, score))

        scored.sort(key=lambda item: item[1], reverse=True)
        dependencies[target_col] = [name for name, _ in scored]

    return dependencies


def _is_categorical(series):
    """True for object, string and category columns."""
    return (
        pd.api.types.is_object_dtype(series)
        or pd.api.types.is_string_dtype(series)
        or isinstance(series.dtype, pd.CategoricalDtype)
    )


def _cramers_v(x, y):
    """Cramér's V association between two categorical series (0 when undefined)."""
    table = pd.crosstab(x, y)
    smaller_dim = min(table.shape) - 1
    if smaller_dim < 1:
        return 0.0  # a single observed category carries no association
    chi2 = chi2_contingency(table, correction=False)[0]
    return float(np.sqrt(chi2 / (table.to_numpy().sum() * smaller_dim)))


def _dependency_score(x, y, random_state):
    """Score how strongly predictor series `x` informs target series `y`."""
    target_is_cat = _is_categorical(y)
    predictor_is_cat = _is_categorical(x)

    if target_is_cat and predictor_is_cat:
        return _cramers_v(x, y)
    if target_is_cat:
        # Class labels are discrete, so use the classification estimator.
        y_codes = LabelEncoder().fit_transform(y)
        return mutual_info_classif(
            x.to_numpy().reshape(-1, 1), y_codes, random_state=random_state
        )[0]
    if predictor_is_cat:
        # Label codes have no order; tell the estimator the feature is discrete.
        x_codes = LabelEncoder().fit_transform(x)
        return mutual_info_regression(
            x_codes.reshape(-1, 1), y.to_numpy(), discrete_features=True, random_state=random_state
        )[0]
    return abs(spearmanr(x, y)[0])

print("Analyzing column dependencies (excluding ID columns)...")
print("This identifies which columns can predict other columns\n")

# Result maps each non-ID column to its predictor columns, strongest first.
column_dependencies = analyze_column_dependencies(df_raw, non_id_columns, threshold=0.1)

print("‚úì Column Dependencies Found:\n")
for target, predictors in column_dependencies.items():
    if predictors:
        # Only the five strongest predictors are listed per target.
        print(f"  {target} ‚Üê depends on: {', '.join(predictors[:5])}")
    else:
        print(f"  {target} ‚Üê no strong dependencies found")

print(f"\n‚úì Analysis complete")

Analyzing column dependencies (excluding ID columns)...
This identifies which columns can predict other columns

‚úì Column Dependencies Found:

  Gender ‚Üê depends on: Profession
  Age ‚Üê no strong dependencies found
  Annual Income ($) ‚Üê no strong dependencies found
  Spending Score (1-100) ‚Üê no strong dependencies found
  Profession ‚Üê depends on: Gender
  Work Experience ‚Üê no strong dependencies found
  Family Size ‚Üê no strong dependencies found

‚úì Analysis complete


In [None]:
# Summary counts by `issue_type`; the "Rule violations" line is the sum of the
# 'outlier' and 'invalid' rows that are broken out on the two lines below it.
# `issues_df` is not created in this fragment — it must exist before this runs.
print(f"\n‚úì Total issues detected: {len(issues_df)}")
print(f"  - Missing values: {len(issues_df[issues_df['issue_type']=='missing'])}")
print(f"  - Rule violations: {len(issues_df[issues_df['issue_type'].isin(['outlier', 'invalid'])])}")
print(f"    - Outliers: {len(issues_df[issues_df['issue_type']=='outlier'])}")
print(f"    - Invalid values: {len(issues_df[issues_df['issue_type']=='invalid'])}")

Detecting missing values...
  Found 35 missing values

Detecting rule violations...
  Found 1777 rule violations

‚úì Total issues detected: 1812
  - Missing values: 35
  - Rule violations (outliers): 1777

Sample issues:


Unnamed: 0,row,column,issue_type,current_value,rule_violated
0,79,Profession,missing,,
1,118,Profession,missing,,
2,219,Profession,missing,,
3,237,Profession,missing,,
4,437,Profession,missing,,
5,440,Profession,missing,,
6,498,Profession,missing,,
7,545,Profession,missing,,
8,601,Profession,missing,,
9,641,Profession,missing,,


In [None]:
    # Component 2: Validity Score (0-40 points)
    outlier_issues = issues_df[issues_df['issue_type'].isin(['outlier', 'invalid'])]
    outlier_count = len(outlier_issues)
    outlier_percentage = (outlier_count / total_cells) * 100

Calculating dataset quality score...




‚úì Initial quality score: 60.0/100 (C (Fair))


In [None]:
# Cell 10: Train ML Models (excluding ID columns)
def train_models_with_dependencies(df, dependencies, id_cols):
    """
    Fit one random-forest pipeline per non-ID column, using only the columns
    listed for it in `dependencies` as features.

    Args:
        df: DataFrame holding targets and predictors.
        dependencies: dict mapping a target column to its predictor columns.
        id_cols: columns that are never used as target or predictor.

    Returns:
        dict: target column -> {'pipeline', 'label_encoder', 'predictor_cols'}.
        Columns that are IDs, have no predictors, have fewer than 10 non-null
        values, or fail to train are left out.
    """
    MIN_TRAIN = 10
    trained = {}

    for target in df.columns:
        # ID columns are never predicted
        if target in id_cols:
            print(f"  ‚è≠Ô∏è {target}: ID column - skipping")
            continue

        # Predictors for this target, with ID columns stripped out
        features = [name for name in dependencies.get(target, []) if name not in id_cols]
        if len(features) == 0:
            print(f"  ‚è≠Ô∏è {target}: No dependencies - skipping ML model")
            continue

        # Rows where the target is known
        labelled_rows = df[target].dropna().index
        n_labelled = len(labelled_rows)
        if n_labelled < MIN_TRAIN:
            print(f"  ‚è≠Ô∏è {target}: Insufficient data ({n_labelled} rows) - skipping")
            continue

        feature_frame = df[features]
        target_values = df.loc[labelled_rows, target]

        # Partition predictors by dtype, keeping their original order
        categorical, numeric = [], []
        for name in features:
            bucket = categorical if df[name].dtype == 'object' else numeric
            bucket.append(name)

        # Median imputation for numbers; mode imputation + one-hot for categories
        steps = []
        if numeric:
            steps.append(('num', SimpleImputer(strategy='median'), numeric))
        if categorical:
            categorical_steps = Pipeline([
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
            ])
            steps.append(('cat', categorical_steps, categorical))
        if not steps:
            continue

        preprocessor = ColumnTransformer(steps, remainder='drop')

        # Classifier (with encoded labels) for text targets, regressor otherwise
        if df[target].dtype == 'object':
            encoder = LabelEncoder()
            fit_target = encoder.fit_transform(target_values)
            estimator = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)
        else:
            encoder = None
            fit_target = target_values
            estimator = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42)

        model_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', estimator)
        ])

        try:
            model_pipeline.fit(feature_frame.loc[labelled_rows], fit_target)
            trained[target] = {
                'pipeline': model_pipeline,
                'label_encoder': encoder,
                'predictor_cols': features
            }
            print(f"  ‚úì {target}: Trained on {len(features)} dependent columns - {', '.join(features[:3])}...")
        except Exception as e:
            print(f"  ‚ö†Ô∏è {target}: Training failed - {e}")

    return trained

print("Training ML models using column dependencies (excluding ID columns)...\n")
# NOTE(review): trains on `df_cleaned`, which is not created in any cell visible
# here — confirm the cell that builds it runs before this one.
column_models = train_models_with_dependencies(df_cleaned, column_dependencies, id_columns)
print(f"\n‚úì Trained {len(column_models)} models")

Training ML models using column dependencies (excluding ID columns)...

  ‚è≠Ô∏è CustomerID: ID column - skipping
  ‚úì Gender: Trained on 1 dependent columns - Profession...
  ‚è≠Ô∏è Age: No dependencies - skipping ML model
  ‚è≠Ô∏è Annual Income ($): No dependencies - skipping ML model
  ‚è≠Ô∏è Spending Score (1-100): No dependencies - skipping ML model
  ‚úì Profession: Trained on 1 dependent columns - Gender...
  ‚è≠Ô∏è Work Experience: No dependencies - skipping ML model
  ‚è≠Ô∏è Family Size: No dependencies - skipping ML model

‚úì Trained 2 models


In [None]:
def ml_predict_with_xai(df, row, col, column_models, rules_df=None, xai_method='both'):
    """
    Predict fix using ML and generate XAI explanation.

    Args:
        df: DataFrame containing the row to repair.
        row: index label of the row.
        col: column to predict.
        column_models: dict of trained models keyed by column; each entry holds
            'pipeline', 'label_encoder' and 'predictor_cols'.
        rules_df: optional rules table; for numeric predictions its
            'constraints' entry for `col` (positive / negative / non-negative /
            non-positive) is enforced on the predicted value.
        xai_method: 'shap', 'lime', or 'both'

    Returns:
        (prediction, explanation_text). The prediction is None when there is no
        model for `col` or when prediction/explanation raises; the text then
        carries the reason.
    """
    if col not in column_models:
        return None, "No ML model available (column has no dependencies or is an ID column)"

    model_info = column_models[col]
    pipeline = model_info['pipeline']
    label_encoder = model_info['label_encoder']
    predictor_cols = model_info['predictor_cols']

    X_row = df.loc[[row], predictor_cols]

    try:
        pred = pipeline.predict(X_row)[0]

        if label_encoder is not None:
            # Classifier output is an encoded class index; map it back to the label.
            pred = label_encoder.inverse_transform([int(pred)])[0]
        elif rules_df is not None and 'constraints' in rules_df.columns:
            rule = rules_df[rules_df['Column'] == col]
            if len(rule) > 0:
                constraints = str(rule.iloc[0].get('constraints', '')).lower()
                pred = _apply_sign_constraint(pred, constraints)

        # Generate XAI explanation based on method
        xai_text = generate_combined_xai_explanation(
            df, row, col, X_row, pipeline, predictor_cols, label_encoder, xai_method
        )

        return pred, xai_text

    except Exception as e:
        return None, f"Prediction failed: {str(e)}"


def _apply_sign_constraint(pred, constraints):
    """Force a numeric prediction to satisfy a sign constraint; others pass through."""
    if not isinstance(pred, (int, float, np.integer, np.floating)):
        return pred
    # The "non-" forms must be tested first: "non-negative" contains "negative"
    # and "non-positive" contains "positive".
    if 'non-negative' in constraints:
        return pred if pred >= 0 else 0
    if 'non-positive' in constraints:
        return pred if pred <= 0 else 0
    if 'positive' in constraints and pred <= 0:
        return abs(pred) + 1 if pred != 0 else 1
    if 'negative' in constraints and pred >= 0:
        return -abs(pred) - 1 if pred != 0 else -1
    return pred

‚úì ML prediction with integrated SHAP & LIME ready


In [None]:
def generate_alternatives_from_rules(df, row, col, rules_df):
    """
    Generate up to three replacement candidates for a cell in `col`.

    Order of preference:
    1. Rule `allowed_values` (pipe-separated): allowed values already present
       in the data, most frequent first, then the remaining allowed values.
    2. Numeric column with a rule `min` and/or `max`: median, 25% and 75%
       quantiles of the values that lie inside that range.
    3. Fallback statistics: mode(s) for non-numeric columns; median, mean and
       mode(s) for numeric ones, filtered by the rule's sign constraint
       (positive / negative / non-negative / non-positive).

    Blank or NaN rule fields are treated as "not specified".

    Args:
        df: DataFrame holding the column.
        row: index label of the cell (not used by the current logic).
        col: column name.
        rules_df: rules table with a 'Column' column, or None.

    Returns:
        list: at most three distinct candidate values (possibly empty).
    """
    alternatives = []
    is_numeric = pd.api.types.is_numeric_dtype(df[col])
    rule = _find_rule(rules_df, col)

    if rule is not None:
        # 1. If allowed values specified, use those
        allowed = [v.strip() for v in _rule_text(rule, 'allowed_values').split('|') if v.strip()]
        if allowed:
            # Most common allowed values from dataset first
            for val in df[col].value_counts().index:
                if val in allowed:
                    alternatives.append(val)
                if len(alternatives) >= 3:
                    break

            # If not enough found, add remaining allowed values
            for val in allowed:
                if len(alternatives) >= 3:
                    break
                if val not in alternatives:
                    alternatives.append(val)

            if alternatives:
                return alternatives[:3]

        # 2. For numeric columns with min/max, suggest values in valid range
        if is_numeric:
            min_val = _rule_bound(rule, 'min')
            max_val = _rule_bound(rule, 'max')

            if min_val is not None or max_val is not None:
                valid_data = df[col].dropna()
                if min_val is not None:
                    valid_data = valid_data[valid_data >= min_val]
                if max_val is not None:
                    valid_data = valid_data[valid_data <= max_val]

                if len(valid_data) > 0:
                    candidates = [
                        valid_data.median(),
                        valid_data.quantile(0.25),
                        valid_data.quantile(0.75),
                    ]
                    # dict.fromkeys removes duplicates while keeping order
                    return list(dict.fromkeys(candidates))[:3]

    # Fallback: use statistical measures (if no rules or rules don't help)
    if not is_numeric:
        alternatives.extend(df[col].mode().tolist()[:3])
    else:
        median_val = df[col].median()
        mean_val = df[col].mean()

        if pd.notna(median_val):
            alternatives.append(median_val)
        if pd.notna(mean_val) and mean_val not in alternatives:
            alternatives.append(mean_val)
        for mv in df[col].mode():
            if mv not in alternatives:
                alternatives.append(mv)
            if len(alternatives) >= 3:
                break

    # Apply sign constraint if available (numeric columns only)
    if rule is not None and is_numeric:
        keep = _sign_predicate(_rule_text(rule, 'constraints').lower())
        if keep is not None:
            alternatives = [a for a in alternatives if keep(a)]

    return list(dict.fromkeys(alternatives))[:3]


def _find_rule(rules_df, col):
    """Return the first rule row for `col` as a Series, or None."""
    if rules_df is None or 'Column' not in rules_df.columns:
        return None
    matches = rules_df[rules_df['Column'] == col]
    return matches.iloc[0] if len(matches) > 0 else None


def _rule_text(rule, key):
    """Return a rule field as stripped text; '' when missing, NaN or blank."""
    value = rule.get(key)
    if value is None or pd.isna(value):
        return ''
    return str(value).strip()


def _rule_bound(rule, key):
    """Return a min/max rule field as float, or None when absent or not numeric."""
    text = _rule_text(rule, key)
    if not text:
        return None
    try:
        return float(text)
    except ValueError:
        return None


def _sign_predicate(constraints):
    """Map a constraint string to a value filter, or None when it has no sign rule."""
    # "non-" forms first: "non-negative" contains "negative", and
    # "non-positive" contains "positive".
    if 'non-negative' in constraints:
        return lambda a: a >= 0
    if 'non-positive' in constraints:
        return lambda a: a <= 0
    if 'positive' in constraints:
        return lambda a: a > 0
    if 'negative' in constraints:
        return lambda a: a < 0
    return None

‚úì Rule-based alternative generation function ready


In [None]:
        # Get ML suggestion with SHAP & LIME
        ml_suggestion, xai_explanation = ml_predict_with_xai(
            df, row, col, column_models, rules_df, xai_method='both'
        )

Building fixes dataframe with SHAP & LIME explanations...
‚úì Generated 1812 fixes


Unnamed: 0,row,column,issue_type,current_value,rule_violated,ml_suggestion,alternatives,xai_explanation
0,79,Profession,missing,,,Artist,"[Engineer, Doctor]",**ü§ñ ML Prediction based on 1 dependent columns...
1,118,Profession,missing,,,Artist,"[Engineer, Doctor]",**ü§ñ ML Prediction based on 1 dependent columns...
2,219,Profession,missing,,,Artist,"[Engineer, Doctor]",**ü§ñ ML Prediction based on 1 dependent columns...
3,237,Profession,missing,,,Artist,"[Engineer, Doctor]",**ü§ñ ML Prediction based on 1 dependent columns...
4,437,Profession,missing,,,Artist,"[Engineer, Doctor]",**ü§ñ ML Prediction based on 1 dependent columns...


In [None]:
        issue_color = "#e74c3c" if issue_type in ["outlier", "invalid"] else "#f39c12"

NameError: name 'fixes_df' is not defined

In [None]:
# Cell 15: Apply Fixes
def apply_fixes(df, user_selections):
    """Return a copy of `df` with the chosen replacement values written in.

    Args:
        df: source DataFrame; it is not modified.
        user_selections: dict mapping (row label, column name) -> new value.

    Returns:
        DataFrame: the patched copy.
    """
    patched = df.copy()
    for cell, replacement in user_selections.items():
        row_label, column = cell
        patched.loc[row_label, column] = replacement
    return patched

# Button that triggers on_apply_click (wired up below).
apply_button = widgets.Button(
    description='‚úÖ Apply All Fixes',
    button_style='success',
    layout=widgets.Layout(width='200px', height='40px')
)

# Output area that on_apply_click prints its summary and preview into.
output_area = widgets.Output()

def on_apply_click(b):
    """Apply the selected fixes, report the result and save the cleaned CSV.

    Click handler for `apply_button`. Reads the module-level `df_cleaned`,
    `user_selections`, `df_raw` and `rules_df`, stores the patched frame in the
    global `df_final`, writes it to 'cleaned_dataset.csv' and shows the first
    10 rows. All output goes into `output_area`.

    Args:
        b: the argument passed by the click callback (unused).
    """
    with output_area:
        clear_output()
        print("Applying fixes...")

        # Published as a global so later cells can use the cleaned frame.
        global df_final
        df_final = apply_fixes(df_cleaned, user_selections)

        print(f"‚úì Applied {len(user_selections)} fixes")
        print("\nüìä Summary:")
        print(f"  - Original rows: {len(df_raw)}")
        print(f"  - Final rows: {len(df_final)}")
        print(f"  - Issues fixed: {len(user_selections)}")

        # Check remaining issues
        # (detect_issues_rules_only is defined outside the cells visible here)
        remaining = detect_issues_rules_only(df_final, rules_df)
        print(f"  - Remaining issues: {len(remaining)}")

        # Save
        df_final.to_csv('cleaned_dataset.csv', index=False)
        print("\nüíæ Saved to 'cleaned_dataset.csv'")

        display(Markdown("### Preview of Cleaned Dataset:"))
        display(df_final.head(10))

# Register the click handler, then render the button above its output area.
apply_button.on_click(on_apply_click)

display(apply_button)
display(output_area)

Button(button_style='success', description='‚úÖ Apply All Fixes', layout=Layout(height='40px', width='200px'), s‚Ä¶

Output()

In [None]:
# Cell 16: Download Files
try:
    # Colab-only download helper; it does not exist in VS Code / plain Jupyter.
    from google.colab import files
except ImportError:
    files = None

print("üì• Download cleaned files:\n")

# Cleaned dataset: only exists after the 'Apply All Fixes' button has been used.
if not os.path.exists('cleaned_dataset.csv'):
    print("‚ö†Ô∏è Run 'Apply All Fixes' button first to generate cleaned_dataset.csv")
elif files is None:
    # No browser download outside Colab; the file is already on local disk.
    print(f"cleaned_dataset.csv is available at {os.path.abspath('cleaned_dataset.csv')}")
else:
    try:
        files.download('cleaned_dataset.csv')
        print("‚úì Downloaded cleaned_dataset.csv")
    except Exception as exc:
        print(f"Download of cleaned_dataset.csv failed: {exc}")

# Extracted rules: offered only when rules were extracted and saved earlier.
# globals().get avoids a NameError when the rules cell was never run.
if globals().get('rules_df') is not None and os.path.exists('extracted_rules.csv'):
    if files is None:
        print(f"extracted_rules.csv is available at {os.path.abspath('extracted_rules.csv')}")
    else:
        try:
            files.download('extracted_rules.csv')
            print("‚úì Downloaded extracted_rules.csv")
        except Exception as exc:
            print(f"Download of extracted_rules.csv failed: {exc}")

üì• Download cleaned files:

‚ö†Ô∏è Run 'Apply All Fixes' button first to generate cleaned_dataset.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úì Downloaded extracted_rules.csv
