In [4]:
# datacrine.py
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from transformers import pipeline

class Datacrine:
    def __init__(self):
        # Sentiment model (used for IMDb checks)
        self.sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
        self.logs = []

    # === Cleaning Functions ===
    def impute_numeric_mean(self, df, cols):
        for col in cols:
            mean_val = df[col].mean()
            df[col].fillna(mean_val, inplace=True)
            self.logs.append(f"Imputed missing values in {col} with mean={mean_val:.2f}")
        return df

    def cap_outliers_iqr(self, df, cols):
        for col in cols:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
            df[col] = np.clip(df[col], lower, upper)
            self.logs.append(f"Capped outliers in {col} to [{lower:.2f}, {upper:.2f}]")
        return df

    def uci_semantic_rules(self, df):
        flags, rationales = [], []
        for _, row in df.iterrows():
            reasons = []
            if row["X1"] > 1_000_000:
                reasons.append("Unrealistic credit limit")
            if row["X5"] < 18:
                reasons.append("Age < 18")
            if reasons:
                flags.append(1)
                rationales.append("; ".join(reasons))
            else:
                flags.append(0)
                rationales.append("Valid")
        df["llm_flag"] = flags
        df["llm_rationale"] = rationales
        self.logs.append("Applied semantic rules to UCI dataset")
        return df

    def imdb_label_check(self, df):
        preds = self.sentiment_model(df["review"].tolist()[:100])  # sample 100 for speed
        flags, rationales = [], []
        for i, pred in enumerate(preds):
            gold = df.iloc[i]["sentiment"]
            if (gold == "positive" and pred["label"] == "NEGATIVE") or \
               (gold == "negative" and pred["label"] == "POSITIVE"):
                flags.append(1)
                rationales.append(f"Mismatch: gold={gold}, pred={pred['label']} ({pred['score']:.2f})")
            else:
                flags.append(0)
                rationales.append("Match")
        df["llm_flag"] = flags + [0]*(len(df)-len(flags))
        df["llm_rationale"] = rationales + ["Not checked"]*(len(df)-len(rationales))
        self.logs.append("Checked IMDb dataset with LLM sentiment model")
        return df

    # === Training Functions ===
    def train_and_evaluate(self, df, target_col, dataset="uci"):
        X = df.drop(columns=[target_col], errors="ignore")
        y = df[target_col]
        results = {}

        # Random Forest
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X, y)
        y_pred = rf.predict(X)
        results["RandomForest"] = {
            "Accuracy": accuracy_score(y, y_pred),
            "F1": f1_score(y, y_pred, average="binary")
        }

        # XGBoost
        dtrain = xgb.DMatrix(X, label=y)
        params = {"objective": "binary:logistic", "eval_metric": "logloss"}
        bst = xgb.train(params, dtrain, num_boost_round=20)
        y_pred = (bst.predict(dtrain) > 0.5).astype(int)
        results["XGBoost"] = {
            "Accuracy": accuracy_score(y, y_pred),
            "F1": f1_score(y, y_pred, average="binary")
        }

        self.logs.append(f"Trained & evaluated models on {dataset} dataset")
        return results

    # === Export Functions ===
    def export_logs(self, filename):
        with open(filename, "w") as f:
            for log in self.logs:
                f.write(log + "\n")

    def save_dataset(self, df, filename):
        df.to_csv(filename, index=False)


In [5]:
from datacrine import Datacrine
dc = Datacrine()


ModuleNotFoundError: No module named 'datacrine'

In [6]:
# Datacrine Demo Notebook
# Author: Aryan Singh
# Thesis: "Datacrine: A Modular, Interpretable AI Framework for Smart Data Cleaning"

# === Setup ===
import pandas as pd
from ucimlrepo import fetch_ucirepo
import kagglehub
from datacrine import Datacrine
import matplotlib.pyplot as plt

# Initialize framework
dc = Datacrine()


ModuleNotFoundError: No module named 'datacrine'

In [7]:
# Load dataset
uci_data = fetch_ucirepo(id=350)
uci_credit = pd.concat([uci_data.data.features, uci_data.data.targets], axis=1)

print("UCI Credit Dataset Shape:", uci_credit.shape)
uci_credit.head()


UCI Credit Dataset Shape: (30000, 24)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [8]:
# Clean dataset
uci_cleaned = dc.impute_numeric_mean(uci_credit.copy(), ["X5"])   # AGE
uci_cleaned = dc.cap_outliers_iqr(uci_cleaned, ["X1"])            # LIMIT_BAL
uci_cleaned = dc.uci_semantic_rules(uci_cleaned)
uci_cleaned["Y"] = uci_credit["Y"].values

# Train + evaluate
uci_results = dc.train_and_evaluate(uci_cleaned, target_col="Y", dataset="uci")
print("UCI Results:", uci_results)


NameError: name 'dc' is not defined

In [9]:
datasets = ["UCI", "IMDb"]
models = ["RandomForest", "XGBoost"]
metrics = ["Accuracy", "F1"]

results = {
    "UCI": uci_results,
    "IMDb": imdb_results
}

for metric in metrics:
    plt.figure(figsize=(6,4))
    vals = []
    labels = []
    for dataset in datasets:
        for model in models:
            vals.append(results[dataset][model][metric])
            labels.append(f"{dataset}-{model}")
    plt.bar(labels, vals, color=["skyblue","orange","skyblue","orange"])
    plt.title(f"{metric} Comparison")
    plt.ylabel(metric)
    plt.xticks(rotation=45)
    plt.show()


NameError: name 'uci_results' is not defined