In [3]:
!pip install kagglehub ucimlrepo


Collecting kagglehub
  Downloading kagglehub-0.3.13-py3-none-any.whl.metadata (38 kB)
Downloading kagglehub-0.3.13-py3-none-any.whl (68 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.13


In [4]:
import kagglehub
import pandas as pd

# Download dataset
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
print("Dataset path:", path)

# Load IMDb data
imdb = pd.read_csv(f"{path}/IMDB Dataset.csv")
print(imdb.head())

Downloading from https://www.kaggle.com/api/v1/datasets/download/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?dataset_version_number=1...


100%|██████████| 25.7M/25.7M [00:02<00:00, 9.39MB/s]


Extracting files...
Dataset path: C:\Users\aryan\.cache\kagglehub\datasets\lakshmi25npathi\imdb-dataset-of-50k-movie-reviews\versions\1
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [5]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset (ID 350 = Credit Default)
credit_data = fetch_ucirepo(id=350)

X = credit_data.data.features
y = credit_data.data.targets

uci_credit = pd.concat([X, y], axis=1)
print(uci_credit.head())

       X1  X2  X3  X4  X5  X6  X7  X8  X9  X10  ...    X15    X16    X17  \
0   20000   2   2   1  24   2   2  -1  -1   -2  ...      0      0      0   
1  120000   2   2   2  26  -1   2   0   0    0  ...   3272   3455   3261   
2   90000   2   2   2  34   0   0   0   0    0  ...  14331  14948  15549   
3   50000   2   2   1  37   0   0   0   0    0  ...  28314  28959  29547   
4   50000   1   2   1  57  -1   0  -1   0    0  ...  20940  19146  19131   

    X18    X19    X20   X21   X22   X23  Y  
0     0    689      0     0     0     0  1  
1     0   1000   1000  1000     0  2000  1  
2  1518   1500   1000  1000  1000  5000  0  
3  2000   2019   1200  1100  1069  1000  0  
4  2000  36681  10000  9000   689   679  0  

[5 rows x 24 columns]


In [6]:
import pandas as pd
import numpy as np
from transformers import pipeline

In [7]:
def impute_numeric_mean(df: pd.DataFrame, cols):
    for c in cols:
        if c in df.columns:
            df[c] = df[c].fillna(df[c].mean())
    return df

def cap_outliers_iqr(df: pd.DataFrame, cols, k=1.5):
    for c in cols:
        if c in df.columns and pd.api.types.is_numeric_dtype(df[c]):
            q1, q3 = df[c].quantile(0.25), df[c].quantile(0.75)
            iqr = q3 - q1
            low, high = q1 - k*iqr, q3 + k*iqr
            df[c] = df[c].clip(lower=low, upper=high)
    return df

def drop_dupes(df: pd.DataFrame, subset=None):
    return df.drop_duplicates(subset=subset)

In [10]:
# small local Hugging Face model for IMDb sentiment
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

def imdb_label_check(df: pd.DataFrame, text_col="review", label_col="sentiment"):
    """
    Compare IMDb dataset labels with DistilBERT local sentiment prediction.
    """
    out = []
    for txt in df[text_col].astype(str).fillna(""):
        out.append(sentiment_pipe(txt[:512])[0]["label"].lower())  # positive / negative
    df["llm_sentiment"] = out

    flags, rats = [], []
    for _, r in df.iterrows():
        gold = str(r[label_col]).lower()
        pred = r["llm_sentiment"]
        if gold.startswith("pos") and pred == "negative":
            flags.append(True)
            rats.append("Mismatch: labeled POS but text looks NEG.")
        elif gold.startswith("neg") and pred == "positive":
            flags.append(True)
            rats.append("Mismatch: labeled NEG but text looks POS.")
        else:
            flags.append(False)
            rats.append("Aligned with sentiment.")
    df["llm_flag"] = flags
    df["llm_rationale"] = rats
    return df

def uci_semantic_rules(df, limit_col="LIMIT_BAL", age_col="AGE"):
    """
    Apply domain rules on UCI Credit dataset for plausibility checks.
    """
    flags, reasons = [], []
    for _, r in df.iterrows():
        msg = []
        if r[limit_col] > 1_000_000:
            msg.append("Unrealistic credit limit (>1,000,000).")
        if r[age_col] < 18:
            msg.append("Invalid age (<18).")
        if msg:
            flags.append(True)
            reasons.append("; ".join(msg))
        else:
            flags.append(False)
            reasons.append("No issues detected.")
    df["llm_flag"] = flags
    df["llm_rationale"] = reasons
    return df

Device set to use cpu


In [11]:
print("=== IMDb sample run ===")
sample_imdb = imdb.sample(5, random_state=42).copy()
sample_imdb = imdb_label_check(sample_imdb, text_col="review", label_col="sentiment")
print(sample_imdb[["review", "sentiment", "llm_sentiment", "llm_flag", "llm_rationale"]])

=== IMDb sample run ===
                                                  review sentiment  \
33553  I really liked this Summerslam due to the look...  positive   
9427   Not many television shows appeal to quite as m...  positive   
199    The film quickly gets to a major chase scene w...  negative   
12447  Jane Austen would definitely approve of this o...  positive   
39489  Expectations were somewhat high for me when I ...  negative   

      llm_sentiment  llm_flag                              llm_rationale  
33553      positive     False                    Aligned with sentiment.  
9427       positive     False                    Aligned with sentiment.  
199        negative     False                    Aligned with sentiment.  
12447      positive     False                    Aligned with sentiment.  
39489      positive      True  Mismatch: labeled NEG but text looks POS.  


In [13]:
print(uci_credit.columns.tolist())

['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'Y']


In [14]:
def uci_semantic_rules(df):
    """
    Apply domain checks to UCI Credit dataset (X1=LIMIT_BAL, X5=AGE).
    """
    flags, reasons = [], []
    for _, r in df.iterrows():
        msg = []
        # Credit limit plausibility
        if pd.notna(r["X1"]) and r["X1"] > 1_000_000:
            msg.append("Unrealistic credit limit (>1,000,000).")
        if pd.notna(r["X1"]) and r["X1"] <= 0:
            msg.append("Invalid credit limit (<=0).")

        # Age plausibility
        if pd.notna(r["X5"]) and r["X5"] < 18:
            msg.append("Invalid age (<18).")
        if pd.notna(r["X5"]) and r["X5"] > 100:
            msg.append("Unrealistic age (>100).")

        if msg:
            flags.append(True)
            reasons.append("; ".join(msg))
        else:
            flags.append(False)
            reasons.append("No issues detected.")
    df["llm_flag"] = flags
    df["llm_rationale"] = reasons
    return df

In [15]:
print("\n=== UCI Credit sample run ===")
sample_uci = uci_credit.sample(5, random_state=42).copy()

# Traditional cleaning
sample_uci = impute_numeric_mean(sample_uci, ["X5"])   # AGE
sample_uci = cap_outliers_iqr(sample_uci, ["X1"])      # LIMIT_BAL

# Semantic checks
sample_uci = uci_semantic_rules(sample_uci)

print(sample_uci[["X1", "X5", "llm_flag", "llm_rationale"]])


=== UCI Credit sample run ===
           X1  X5  llm_flag        llm_rationale
2308    30000  25     False  No issues detected.
22404  150000  26     False  No issues detected.
23397   70000  32     False  No issues detected.
25058  130000  49     False  No issues detected.
2664    50000  36     False  No issues detected.


In [17]:
def run_pipeline(dataset="uci", n=5):
    """
    Run Datacrine pipeline on either IMDb or UCI dataset.
    dataset: "uci" or "imdb"
    n: number of sample rows to show
    """
    if dataset.lower() == "imdb":
        print("\n=== Running Datacrine on IMDb Dataset ===")
        df = imdb.sample(n, random_state=42).copy()
        df = imdb_label_check(df, text_col="review", label_col="sentiment")
        display_cols = ["review", "sentiment", "llm_sentiment", "llm_flag", "llm_rationale"]
    
    elif dataset.lower() == "uci":
        print("\n=== Running Datacrine on UCI Credit Dataset ===")
        df = uci_credit.sample(n, random_state=42).copy()
        df = impute_numeric_mean(df, ["X5"])   # AGE
        df = cap_outliers_iqr(df, ["X1"])      # LIMIT_BAL
        df = uci_semantic_rules(df)            # semantic checks
        display_cols = ["X1", "X5", "llm_flag", "llm_rationale"]
    
    else:
        raise ValueError("Dataset must be 'uci' or 'imdb'")
    
    return df[display_cols]


In [18]:
run_pipeline("imdb")


=== Running Datacrine on IMDb Dataset ===


Unnamed: 0,review,sentiment,llm_sentiment,llm_flag,llm_rationale
33553,I really liked this Summerslam due to the look...,positive,positive,False,Aligned with sentiment.
9427,Not many television shows appeal to quite as m...,positive,positive,False,Aligned with sentiment.
199,The film quickly gets to a major chase scene w...,negative,negative,False,Aligned with sentiment.
12447,Jane Austen would definitely approve of this o...,positive,positive,False,Aligned with sentiment.
39489,Expectations were somewhat high for me when I ...,negative,positive,True,Mismatch: labeled NEG but text looks POS.


In [19]:
run_pipeline("uci")


=== Running Datacrine on UCI Credit Dataset ===


Unnamed: 0,X1,X5,llm_flag,llm_rationale
2308,30000,25,False,No issues detected.
22404,150000,26,False,No issues detected.
23397,70000,32,False,No issues detected.
25058,130000,49,False,No issues detected.
2664,50000,36,False,No issues detected.


In [21]:
def summarize_flags(df):
    if "llm_flag" not in df.columns:
        print("⚠️ No llm_flag column found. Did you run the pipeline?")
        return
    total = len(df)
    flagged = df["llm_flag"].sum()
    pct = (flagged / total) * 100 if total > 0 else 0
    print(f"{total} rows checked, {flagged} flagged ({pct:.1f}% anomalies detected)")

In [22]:
result_uci = run_pipeline("uci", n=20)
print(result_uci)
summarize_flags(result_uci)

result_imdb = run_pipeline("imdb", n=20)
print(result_imdb)
summarize_flags(result_imdb)


=== Running Datacrine on UCI Credit Dataset ===
           X1  X5  llm_flag        llm_rationale
2308    30000  25     False  No issues detected.
22404  150000  26     False  No issues detected.
23397   70000  32     False  No issues detected.
25058  130000  49     False  No issues detected.
2664    50000  36     False  No issues detected.
8511    50000  29     False  No issues detected.
5148   110000  33     False  No issues detected.
7790   140000  32     False  No issues detected.
11311  256250  38     False  No issues detected.
19043   80000  23     False  No issues detected.
10784   80000  33     False  No issues detected.
22246   90000  25     False  No issues detected.
3268    80000  34     False  No issues detected.
15947   30000  30     False  No issues detected.
14977  200000  29     False  No issues detected.
14474  110000  39     False  No issues detected.
28694  100000  33     False  No issues detected.
14054   30000  51     False  No issues detected.
26545  200000  50   

In [23]:
import os

def save_cleaned_data(df, dataset="uci"):
    """
    Save cleaned dataset to disk in data/cleaned folder.
    """
    os.makedirs("data/cleaned", exist_ok=True)
    path = f"data/cleaned/{dataset}_cleaned.csv"
    df.to_csv(path, index=False)
    print(f"✅ Cleaned {dataset} dataset saved at: {path}")

In [24]:
# Run on UCI and save
uci_result = run_pipeline("uci", n=2000)  # larger sample, or use full dataset later
summarize_flags(uci_result)
save_cleaned_data(uci_result, "uci")

# Run on IMDb and save
imdb_result = run_pipeline("imdb", n=2000)
summarize_flags(imdb_result)
save_cleaned_data(imdb_result, "imdb")


=== Running Datacrine on UCI Credit Dataset ===
2000 rows checked, 0 flagged (0.0% anomalies detected)
✅ Cleaned uci dataset saved at: data/cleaned/uci_cleaned.csv

=== Running Datacrine on IMDb Dataset ===
2000 rows checked, 370 flagged (18.5% anomalies detected)
✅ Cleaned imdb dataset saved at: data/cleaned/imdb_cleaned.csv


In [3]:
!pip install xgboost



In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

def train_and_evaluate(df, dataset="uci", target_col="Y"):
    """
    Train Random Forest & XGBoost on dataset and report accuracy/F1.
    Works for UCI (classification). For IMDb, we'll map sentiment to 0/1.
    """
    # Prepare data
    if dataset == "uci":
        X = df.drop(columns=[target_col, "llm_flag", "llm_rationale"], errors="ignore")
        y = df[target_col]
    elif dataset == "imdb":
        df = df.copy()
        df["label_num"] = df["sentiment"].map({"positive": 1, "negative": 0})
        X = df["review"]  # text input
        y = df["label_num"]
        # Convert text → TF-IDF for classical models
        from sklearn.feature_extraction.text import TfidfVectorizer
        vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
        X = vectorizer.fit_transform(X)
    else:
        raise ValueError("Dataset must be 'uci' or 'imdb'")

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    results = {}

    # Random Forest
    rf = RandomForestClassifier(n_estimators=200, random_state=42)
    rf.fit(X_train, y_train)
    preds_rf = rf.predict(X_test)
    results["RandomForest"] = {
        "Accuracy": accuracy_score(y_test, preds_rf),
        "F1": f1_score(y_test, preds_rf)
    }

    # XGBoost
    xgb_model = xgb.XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    )
    xgb_model.fit(X_train, y_train)
    preds_xgb = xgb_model.predict(X_test)
    results["XGBoost"] = {
        "Accuracy": accuracy_score(y_test, preds_xgb),
        "F1": f1_score(y_test, preds_xgb)
    }

    return results

In [7]:
import pandas as pd
from ucimlrepo import fetch_ucirepo 

# fetch dataset (ID 350 = Credit Default)
credit_data = fetch_ucirepo(id=350)

X = credit_data.data.features
y = credit_data.data.targets

uci_credit = pd.concat([X, y], axis=1)
print(uci_credit.head())

       X1  X2  X3  X4  X5  X6  X7  X8  X9  X10  ...    X15    X16    X17  \
0   20000   2   2   1  24   2   2  -1  -1   -2  ...      0      0      0   
1  120000   2   2   2  26  -1   2   0   0    0  ...   3272   3455   3261   
2   90000   2   2   2  34   0   0   0   0    0  ...  14331  14948  15549   
3   50000   2   2   1  37   0   0   0   0    0  ...  28314  28959  29547   
4   50000   1   2   1  57  -1   0  -1   0    0  ...  20940  19146  19131   

    X18    X19    X20   X21   X22   X23  Y  
0     0    689      0     0     0     0  1  
1     0   1000   1000  1000     0  2000  1  
2  1518   1500   1000  1000  1000  5000  0  
3  2000   2019   1200  1100  1069  1000  0  
4  2000  36681  10000  9000   689   679  0  

[5 rows x 24 columns]


In [8]:
uci_raw_results = train_and_evaluate(uci_credit, dataset="uci", target_col="Y")
print("UCI Raw Results:", uci_raw_results)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


UCI Raw Results: {'RandomForest': {'Accuracy': 0.8123333333333334, 'F1': 0.46227316141356256}, 'XGBoost': {'Accuracy': 0.8136666666666666, 'F1': 0.4599033816425121}}


In [10]:
# === MASTER INIT FOR DATACRINE ===
import os
import pandas as pd
import numpy as np
from transformers import pipeline
from ucimlrepo import fetch_ucirepo
import kagglehub

# -------------------------------
# Load datasets
# -------------------------------
# IMDb
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
imdb = pd.read_csv(f"{path}/IMDB Dataset.csv")

# UCI Credit
credit_data = fetch_ucirepo(id=350)
X = credit_data.data.features
y = credit_data.data.targets
uci_credit = pd.concat([X, y], axis=1)

# -------------------------------
# Cleaning functions
# -------------------------------
def impute_numeric_mean(df: pd.DataFrame, cols):
    for c in cols:
        if c in df.columns:
            df[c] = df[c].fillna(df[c].mean())
    return df

def cap_outliers_iqr(df: pd.DataFrame, cols, k=1.5):
    for c in cols:
        if c in df.columns and pd.api.types.is_numeric_dtype(df[c]):
            q1, q3 = df[c].quantile(0.25), df[c].quantile(0.75)
            iqr = q3 - q1
            low, high = q1 - k*iqr, q3 + k*iqr
            df[c] = df[c].clip(lower=low, upper=high)
    return df

# -------------------------------
# Semantic / LLM functions
# -------------------------------
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

def imdb_label_check(df: pd.DataFrame, text_col="review", label_col="sentiment"):
    out = []
    for txt in df[text_col].astype(str).fillna(""):
        out.append(sentiment_pipe(txt[:512])[0]["label"].lower())
    df["llm_sentiment"] = out

    flags, rats = [], []
    for _, r in df.iterrows():
        gold = str(r[label_col]).lower()
        pred = r["llm_sentiment"]
        if gold.startswith("pos") and pred == "negative":
            flags.append(True); rats.append("Mismatch: labeled POS but looks NEG.")
        elif gold.startswith("neg") and pred == "positive":
            flags.append(True); rats.append("Mismatch: labeled NEG but looks POS.")
        else:
            flags.append(False); rats.append("Aligned with sentiment.")
    df["llm_flag"] = flags
    df["llm_rationale"] = rats
    return df

def uci_semantic_rules(df):
    flags, reasons = [], []
    for _, r in df.iterrows():
        msg = []
        if pd.notna(r["X1"]) and r["X1"] > 1_000_000:
            msg.append("Unrealistic credit limit (>1,000,000).")
        if pd.notna(r["X1"]) and r["X1"] <= 0:
            msg.append("Invalid credit limit (<=0).")
        if pd.notna(r["X5"]) and r["X5"] < 18:
            msg.append("Invalid age (<18).")
        if pd.notna(r["X5"]) and r["X5"] > 100:
            msg.append("Unrealistic age (>100).")
        if msg:
            flags.append(True); reasons.append("; ".join(msg))
        else:
            flags.append(False); reasons.append("No issues detected.")
    df["llm_flag"] = flags
    df["llm_rationale"] = reasons
    return df

# -------------------------------
# Unified pipeline
# -------------------------------
def run_pipeline(dataset="uci", n=5):
    if dataset.lower() == "imdb":
        print("\n=== Running Datacrine on IMDb Dataset ===")
        df = imdb.sample(n, random_state=42).copy()
        df = imdb_label_check(df, text_col="review", label_col="sentiment")
        display_cols = ["review", "sentiment", "llm_sentiment", "llm_flag", "llm_rationale"]
    elif dataset.lower() == "uci":
        print("\n=== Running Datacrine on UCI Credit Dataset ===")
        df = uci_credit.sample(n, random_state=42).copy()
        df = impute_numeric_mean(df, ["X5"])
        df = cap_outliers_iqr(df, ["X1"])
        df = uci_semantic_rules(df)
        display_cols = ["X1", "X5", "llm_flag", "llm_rationale"]
    else:
        raise ValueError("Dataset must be 'uci' or 'imdb'")
    return df[display_cols]

# -------------------------------
# Summary helper
# -------------------------------
def summarize_flags(df):
    if "llm_flag" not in df.columns:
        print("⚠️ No llm_flag column found. Did you run the pipeline?")
        return
    total = len(df)
    flagged = df["llm_flag"].sum()
    pct = (flagged / total) * 100 if total > 0 else 0
    print(f"{total} rows checked, {flagged} flagged ({pct:.1f}% anomalies detected)")

Device set to use cpu


In [11]:
uci_cleaned = run_pipeline("uci", n=len(uci_credit))
uci_cleaned["Y"] = uci_credit["Y"].values
uci_clean_results = train_and_evaluate(uci_cleaned, dataset="uci", target_col="Y")
print("UCI Cleaned Results:", uci_clean_results)


=== Running Datacrine on UCI Credit Dataset ===
UCI Cleaned Results: {'RandomForest': {'Accuracy': 0.7656666666666667, 'F1': 0.03566529492455418}, 'XGBoost': {'Accuracy': 0.7768333333333334, 'F1': 0.0014914243102162564}}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# === IMDb Raw vs Cleaned Evaluation ===

# Convert sentiment to binary (pos=1, neg=0)
imdb_bin = imdb.copy()
imdb_bin["label"] = imdb_bin["sentiment"].map({"positive": 1, "negative": 0})

# Raw IMDb evaluation
imdb_raw_results = train_and_evaluate(
    imdb_bin.rename(columns={"label": "Y"}), 
    dataset="imdb", 
    target_col="Y"
)
print("IMDb Raw Results:", imdb_raw_results)

# Clean IMDb via pipeline
imdb_cleaned = run_pipeline("imdb", n=len(imdb_bin))
imdb_cleaned["Y"] = imdb_bin["label"].values

# Evaluate cleaned IMDb
imdb_clean_results = train_and_evaluate(
    imdb_cleaned.rename(columns={"llm_sentiment": "Y"}), 
    dataset="imdb", 
    target_col="Y"
)
print("IMDb Cleaned Results:", imdb_clean_results)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


IMDb Raw Results: {'RandomForest': {'Accuracy': 0.8586, 'F1': 0.8571428571428571}, 'XGBoost': {'Accuracy': 0.8623, 'F1': 0.8653827353602502}}

=== Running Datacrine on IMDb Dataset ===


In [None]:
import matplotlib.pyplot as plt

# Collect results
datasets = ["UCI", "IMDb"]
models = ["RandomForest", "XGBoost"]
metrics = ["Accuracy", "F1"]

results = {
    "UCI Raw": uci_raw_results,
    "UCI Cleaned": uci_clean_results,
    "IMDb Raw": imdb_raw_results,
    "IMDb Cleaned": imdb_clean_results
}

# Plot
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for i, metric in enumerate(metrics):
    ax = axes[i]
    for model in models:
        raw_vals = [results[f"{ds} Raw"][model][metric] for ds in datasets]
        clean_vals = [results[f"{ds} Cleaned"][model][metric] for ds in datasets]
        
        x = range(len(datasets))
        ax.plot(x, raw_vals, marker="o", label=f"{model} Raw")
        ax.plot(x, clean_vals, marker="x", linestyle="--", label=f"{model} Cleaned")
    
    ax.set_title(metric)
    ax.set_xticks(x)
    ax.set_xticklabels(datasets)
    ax.set_ylabel(metric)
    ax.legend()

plt.suptitle("Datacrine: Raw vs Cleaned Performance (UCI & IMDb)")
plt.show()


In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score

class Datacrine:
    def __init__(self):
        # Load LLM model once
        self.sentiment_pipe = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english"
        )
        self.logs = []  # to store cleaning/audit logs

    def log(self, message):
        """Keep track of what Datacrine is doing."""
        print(message)
        self.logs.append(message)

    # --- Cleaning Functions ---
    def impute_numeric_mean(self, df, cols):
        for c in cols:
            if c in df.columns:
                mean_val = df[c].mean()
                df[c] = df[c].fillna(mean_val)
                self.log(f"Imputed missing values in {c} with mean={mean_val:.2f}")
        return df

    def cap_outliers_iqr(self, df, cols, k=1.5):
        for c in cols:
            if c in df.columns and pd.api.types.is_numeric_dtype(df[c]):
                q1, q3 = df[c].quantile(0.25), df[c].quantile(0.75)
                iqr = q3 - q1
                low, high = q1 - k*iqr, q3 + k*iqr
                df[c] = df[c].clip(lower=low, upper=high)
                self.log(f"Capped outliers in {c} to [{low:.2f}, {high:.2f}]")
        return df

    def uci_semantic_rules(self, df):
        flags, reasons = [], []
        for _, r in df.iterrows():
            msg = []
            if pd.notna(r["X1"]) and r["X1"] > 1_000_000:
                msg.append("Unrealistic credit limit (>1,000,000).")
            if pd.notna(r["X1"]) and r["X1"] <= 0:
                msg.append("Invalid credit limit (<=0).")
            if pd.notna(r["X5"]) and r["X5"] < 18:
                msg.append("Invalid age (<18).")
            if pd.notna(r["X5"]) and r["X5"] > 100:
                msg.append("Unrealistic age (>100).")
            if msg:
                flags.append(True); reasons.append("; ".join(msg))
            else:
                flags.append(False); reasons.append("No issues detected.")
        df["llm_flag"] = flags
        df["llm_rationale"] = reasons
        self.log("Applied semantic rules to UCI Credit dataset.")
        return df

    def imdb_label_check(self, df, text_col="review", label_col="sentiment"):
        out = []
        for txt in df[text_col].astype(str).fillna(""):
            out.append(self.sentiment_pipe(txt[:512])[0]["label"].lower())
        df["llm_sentiment"] = out

        flags, rats = [], []
        for _, r in df.iterrows():
            gold = str(r[label_col]).lower()
            pred = r["llm_sentiment"]
            if gold.startswith("pos") and pred == "negative":
                flags.append(True); rats.append("Mismatch: labeled POS but looks NEG.")
            elif gold.startswith("neg") and pred == "positive":
                flags.append(True); rats.append("Mismatch: labeled NEG but looks POS.")
            else:
                flags.append(False); rats.append("Aligned with sentiment.")
        df["llm_flag"] = flags
        df["llm_rationale"] = rats
        self.log("Checked IMDb dataset with LLM sentiment model.")
        return df

    # --- Evaluation Functions ---
    def train_and_evaluate(self, df, target_col="Y", dataset="uci"):
        X = df.drop(columns=[target_col])
        y = df[target_col]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        results = {}

        # RandomForest
        rf = RandomForestClassifier(random_state=42)
        rf.fit(X_train, y_train)
        y_pred_rf = rf.predict(X_test)
        results["RandomForest"] = {
            "Accuracy": accuracy_score(y_test, y_pred_rf),
            "F1": f1_score(y_test, y_pred_rf, average="weighted")
        }

        # XGBoost
        xgb_model = xgb.XGBClassifier(eval_metric="logloss", use_label_encoder=False)
        xgb_model.fit(X_train, y_train)
        y_pred_xgb = xgb_model.predict(X_test)
        results["XGBoost"] = {
            "Accuracy": accuracy_score(y_test, y_pred_xgb),
            "F1": f1_score(y_test, y_pred_xgb, average="weighted")
        }

        self.log(f"Trained & evaluated models on {dataset} dataset.")
        return results

    # --- Export ---
    def export_logs(self, filename="datacrine_log.txt"):
        with open(filename, "w") as f:
            for line in self.logs:
                f.write(line + "\n")
        self.log(f"Logs exported to {filename}")

    def save_dataset(self, df, filename="datacrine_cleaned.csv"):
        df.to_csv(filename, index=False)
        self.log(f"Cleaned dataset saved to {filename}")

In [None]:
dc = Datacrine()

# UCI Cleaning
uci_cleaned = dc.impute_numeric_mean(uci_credit.copy(), ["X5"])
uci_cleaned = dc.cap_outliers_iqr(uci_cleaned, ["X1"])
uci_cleaned = dc.uci_semantic_rules(uci_cleaned)
uci_cleaned["Y"] = uci_credit["Y"].values
uci_results = dc.train_and_evaluate(uci_cleaned, target_col="Y", dataset="uci")
print("UCI Results:", uci_results)

# IMDb Cleaning (subset for speed!)
imdb_subset = imdb.sample(2000, random_state=42).copy()
imdb_cleaned = dc.imdb_label_check(imdb_subset)
imdb_cleaned["Y"] = imdb_subset["sentiment"].map({"positive": 1, "negative": 0})
imdb_results = dc.train_and_evaluate(imdb_cleaned.rename(columns={"llm_sentiment":"Y"}), target_col="Y", dataset="imdb")
print("IMDb Results:", imdb_results)

# Export logs & cleaned dataset
dc.export_logs("datacrine_audit.txt")
dc.save_dataset(uci_cleaned, "uci_credit_cleaned.csv")