<a href="https://colab.research.google.com/github/Ak4nksha/ai-generated-text-detector/blob/main/notebooks/03_feature_engineering_and_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q textstat spacy tqdm
!python -m spacy download en_core_web_sm

In [None]:
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import textstat
import spacy

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Register tqdm with pandas so `.progress_apply` (used when computing features)
# is available on Series/DataFrame objects.
tqdm.pandas()  # enable progress bar on apply

# Load the small English spaCy model with the "ner" and "parser" components
# disabled; the feature code in this notebook only reads `token.pos_` and
# `token.is_space` from the resulting docs.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Single seed shared by NumPy's global RNG and the train/val/test splits.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# path used by the following cells lives under this mount point.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Location of the merged dataset on the mounted Drive.
CSV_PATH = "/content/drive/MyDrive/final_merged_dataset.csv"

# Quick look at the raw file before any cleaning. The Python parsing engine is
# used with backslash as the escape character, and lines the parser rejects
# are skipped (on_bad_lines='skip') instead of raising.
df = pd.read_csv(CSV_PATH, engine="python", escapechar='\\',on_bad_lines='skip')
print("Columns:", df.columns)
print(len(df))  # number of rows that were loaded
df.head()

In [None]:
CSV_PATH = "/content/drive/MyDrive/final_merged_dataset.csv"
TEXT_COL = "text"
LABEL_COL = "label"

# Robust CSV load: Python engine, backslash escapes, malformed lines skipped.
df = pd.read_csv(
    CSV_PATH,
    engine="python",
    escapechar="\\",
    on_bad_lines="skip",
)
rows_loaded = len(df)

# Parse labels BEFORE dropping missing values. Coercing afterwards would turn
# unparseable labels into NaN that then had to be filled, which silently
# relabelled corrupt rows as class 0 (human) and polluted the training data.
df[LABEL_COL] = pd.to_numeric(df[LABEL_COL], errors="coerce")

# Keep only rows that have text and a valid binary label (0 = human, 1 = AI).
df = df.dropna(subset=[TEXT_COL, LABEL_COL])
df = df[df[LABEL_COL].isin([0, 1])].copy()
df[LABEL_COL] = df[LABEL_COL].astype(int)
print(f"Dropped {rows_loaded - len(df)} rows with missing text or an invalid label")

# Additional fixes for NaN values in the metadata columns: rows without a
# source are attributed to "kaggle", and rows without a doc_id get a
# synthetic id derived from their row index.
df["source"] = df["source"].fillna("kaggle")
nan_rows = df["doc_id"].isna()
df.loc[nan_rows, "doc_id"] = "kaggle_" + df.loc[nan_rows].index.astype(str)


print("Columns:", df.columns)
print("First few rows:")
display(df.head())  # `display` is provided by the IPython/Colab runtime
print(f"\nDataset shape: {df.shape}")

print("\nLabel value counts:")
# Class distribution, ordered by label value so bar 0 is label 0, bar 1 is label 1
counts = df[LABEL_COL].value_counts().sort_index()
print(counts)


import matplotlib.pyplot as plt
# Bar chart of the class balance
plt.figure(figsize=(6,4))
counts.plot(kind='bar', color=['steelblue', 'darkorange'])
plt.title("Distribution of Human (0) vs AI (1) Texts")
plt.xlabel("Label")
plt.ylabel("Number of Samples")

# Annotate bars with counts. The label is anchored to the top of each bar
# (va='bottom') rather than placed at a fixed "+500 samples" offset, so it
# sits directly above the bar whatever the size of the dataset.
for i, v in enumerate(counts):
    plt.text(i, v, str(v), ha='center', va='bottom', fontsize=12)

plt.show()

In [None]:
def basic_counts(text: str):
    """Return character, word and sentence counts for *text*.

    Sentences are the non-blank pieces obtained by splitting on runs of
    ``.``, ``!`` or ``?``; words are ``\\w+`` matches.

    Args:
        text: The document. ``None`` is treated as an empty string and any
            other non-string value is converted with ``str()``.

    Returns:
        dict with keys ``num_chars``, ``num_words``, ``num_sentences`` and
        ``avg_sentence_length`` (words per sentence, ``0.0`` when the text
        contains no sentence).
    """
    if not isinstance(text, str):
        text = "" if text is None else str(text)

    pieces = (piece.strip() for piece in re.split(r"[.!?]+", text))
    num_sent = sum(1 for piece in pieces if piece)
    num_words = len(re.findall(r"\w+", text))

    # Guard only the division: the reported counts stay truthful (0 for empty
    # text) instead of being bumped to 1 just to avoid dividing by zero.
    avg_sent_len = num_words / num_sent if num_sent else 0.0

    return {
        "num_chars": len(text),
        "num_words": num_words,
        "num_sentences": num_sent,
        "avg_sentence_length": avg_sent_len,
    }


def lexical_diversity(text: str):
    """Measure vocabulary richness of *text*.

    The text is stringified, lower-cased and tokenised with ``\\w+``.

    Returns:
        dict with ``type_token_ratio`` (distinct tokens / total tokens, or
        ``0.0`` when there are no tokens) and ``unique_words`` (number of
        distinct tokens).
    """
    tokens = re.findall(r"\w+", str(text).lower())
    vocab_size = len(set(tokens))
    ratio = vocab_size / len(tokens) if tokens else 0.0
    return {
        "type_token_ratio": ratio,
        "unique_words": vocab_size,
    }


def punctuation_stats(text: str):
    """Return the share of punctuation, upper-case and digit characters.

    Each value is a fraction of ``len(text)``. A falsy *text* (``None`` or
    the empty string) yields ``0.0`` for every ratio.

    Returns:
        dict with keys ``pct_punct``, ``pct_upper`` and ``pct_digit``.
    """
    if not text:
        return {
            "pct_punct": 0.0,
            "pct_upper": 0.0,
            "pct_digit": 0.0,
        }

    # Single pass over the characters; booleans add as 0/1.
    n_punct = n_upper = n_digit = 0
    for ch in text:
        n_punct += ch in string.punctuation
        n_upper += ch.isupper()
        n_digit += ch.isdigit()

    length = len(text)
    return {
        "pct_punct": n_punct / length,
        "pct_upper": n_upper / length,
        "pct_digit": n_digit / length,
    }


def readability_features(text: str):
    """Compute three textstat readability scores for *text*.

    Non-string input is treated as empty text. Texts with fewer than three
    whitespace-separated tokens, and texts for which textstat raises, get
    ``0.0`` for every score.

    Returns:
        dict with keys ``flesch_reading_ease``, ``flesch_kincaid_grade`` and
        ``gunning_fog``.
    """
    body = text if isinstance(text, str) else ""

    scores = (0.0, 0.0, 0.0)
    if len(body.split()) >= 3:
        try:
            scores = (
                textstat.flesch_reading_ease(body),
                textstat.flesch_kincaid_grade(body),
                textstat.gunning_fog(body),
            )
        except Exception:
            # Best-effort: a failure in any metric zeroes all three.
            scores = (0.0, 0.0, 0.0)

    reading_ease, kincaid_grade, fog_index = scores
    return {
        "flesch_reading_ease": reading_ease,
        "flesch_kincaid_grade": kincaid_grade,
        "gunning_fog": fog_index,
    }


def repetition_features(text: str):
    """Return the fraction of word bigrams that belong to a repeated bigram.

    The text is stringified, lower-cased and tokenised with ``\\w+``. Every
    occurrence of a bigram that appears more than once counts as repeated.
    Texts with fewer than four tokens return ``0.0``.

    Returns:
        dict with the single key ``bigram_repetition_ratio``.
    """
    words = re.findall(r"\w+", str(text).lower())
    if len(words) < 4:
        return {"bigram_repetition_ratio": 0.0}

    pair_freq = Counter(zip(words, words[1:]))
    n_pairs = len(words) - 1  # number of adjacent word pairs
    n_repeated = sum(freq for freq in pair_freq.values() if freq > 1)
    return {
        "bigram_repetition_ratio": n_repeated / n_pairs
    }


def pos_features_spacy(text: str):
    """Return the share of selected coarse POS tags among the tokens of *text*.

    The text is stringified and run through the module-level spaCy pipeline
    ``nlp``; whitespace tokens are ignored. Each ratio is the tag's count
    divided by the number of remaining tokens, or ``0.0`` for every tag when
    no tokens remain.

    Returns:
        dict with keys ``pos_ratio_NOUN``, ``pos_ratio_VERB``,
        ``pos_ratio_ADJ``, ``pos_ratio_ADV``, ``pos_ratio_PRON``,
        ``pos_ratio_ADP`` and ``pos_ratio_DET`` (in that order).
    """
    # Output key -> spaCy coarse tag, in the order the features are emitted.
    feature_tags = {
        "pos_ratio_NOUN": "NOUN",
        "pos_ratio_VERB": "VERB",
        "pos_ratio_ADJ": "ADJ",
        "pos_ratio_ADV": "ADV",
        "pos_ratio_PRON": "PRON",
        "pos_ratio_ADP": "ADP",
        "pos_ratio_DET": "DET",
    }

    kept = [tok for tok in nlp(str(text)) if not tok.is_space]
    if not kept:
        return dict.fromkeys(feature_tags, 0.0)

    tag_freq = Counter(tok.pos_ for tok in kept)
    n_tokens = len(kept)
    # Counter lookups return 0 for tags that never occurred.
    return {name: tag_freq[tag] / n_tokens for name, tag in feature_tags.items()}


def sentence_length_stats(text: str):
    """Return the mean and standard deviation of sentence lengths in words.

    Sentences are the non-blank pieces left after splitting the stringified
    text on runs of ``.``, ``!`` or ``?``; a sentence's length is its number
    of whitespace-separated tokens. With fewer than two sentences the
    deviation is ``0.0`` and the mean is the whitespace token count of the
    whole text.

    Returns:
        dict with keys ``sentence_length_std`` and ``sentence_length_mean``.
    """
    raw = str(text)
    stripped = (piece.strip() for piece in re.split(r"[.!?]+", raw))
    word_counts = [len(piece.split()) for piece in stripped if piece]

    if len(word_counts) >= 2:
        return {
            "sentence_length_std": float(np.std(word_counts)),
            "sentence_length_mean": float(np.mean(word_counts)),
        }

    # Zero or one sentence: no spread to measure.
    return {
        "sentence_length_std": 0.0,
        "sentence_length_mean": len(raw.split()),
    }


def compute_all_features(text: str):
    """Compute every stylometric feature for one document.

    The input is normalised once so that all extractors see the same string.
    Left to themselves they disagree on non-string input: some substitute an
    empty string, some analyse ``str(text)`` (e.g. the literal ``"None"``),
    and one raises.

    Args:
        text: The document. ``None`` and float NaN are treated as empty text;
            any other non-string value is converted with ``str()``.

    Returns:
        A single flat dict merging the outputs of all feature extractors, in
        the order they are listed below.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    extractors = (
        basic_counts,
        lexical_diversity,
        punctuation_stats,
        readability_features,
        repetition_features,
        pos_features_spacy,
        sentence_length_stats,
    )

    feats = {}
    for extract in extractors:
        feats.update(extract(text))
    return feats

In [None]:
print("Computing features for each text")

# Run the full feature extractor on every document, with a tqdm progress bar.
# Each element of the resulting Series is one dict of feature name -> value.
features_series = df[TEXT_COL].progress_apply(compute_all_features)
# Expand the dicts into one column per feature (one row per document).
features_df = pd.DataFrame(list(features_series))

print("New feature columns:", features_df.columns.tolist())

# Place the original columns and the new feature columns side by side. Both
# frames are re-indexed 0..n-1 first so the column-wise concat pairs rows by
# position rather than by the index left over from the cleaning step.
df_aug = pd.concat(
    [df.reset_index(drop=True), features_df.reset_index(drop=True)],
    axis=1
)

print("\nAugmented dataframe shape:", df_aug.shape)
df_aug.head()

In [None]:
# First split: 70% train, 30% held out, stratified on the label so both parts
# keep the same class proportions.
train_df, temp_df = train_test_split(
    df_aug,
    test_size=0.3,
    stratify=df_aug[LABEL_COL],
    random_state=RANDOM_SEED
)

# Second split: halve the held-out 30% into validation and test (15% each of
# the full data), again stratified on the label.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df[LABEL_COL],
    random_state=RANDOM_SEED
)

# Sizes of train / val / test
print(len(train_df), len(val_df), len(test_df))

In [None]:
# ---- Check that there is no overlap between train/val/test ----
# The comparison is made on DataFrame index labels only.
# NOTE(review): identical texts stored under different index labels would not
# be detected by this check — confirm whether the dataset is de-duplicated.
train_idx = set(train_df.index)
val_idx   = set(val_df.index)
test_idx  = set(test_df.index)

# Report the size of each pairwise intersection before asserting on it.
print("Overlap train ∩ val:", len(train_idx & val_idx))
print("Overlap train ∩ test:", len(train_idx & test_idx))
print("Overlap val ∩ test:", len(val_idx & test_idx))

# Fail the cell if any index label appears in more than one split.
assert len(train_idx & val_idx) == 0, "Train and val sets overlap!"
assert len(train_idx & test_idx) == 0, "Train and test sets overlap!"
assert len(val_idx & test_idx) == 0, "Val and test sets overlap!"

print("\n✅ No index overlap between train, val, and test.")


In [None]:
# Only the engineered stylometric features are numeric model inputs. Taking
# "every column except text and label" would also pick up string metadata
# columns such as "source" and "doc_id", which StandardScaler cannot scale
# and which should not be visible to the classifier anyway.
numeric_cols = [
    c for c in features_df.columns
    if c not in [TEXT_COL, LABEL_COL]
]

# Two feature blocks, concatenated in this order: TF-IDF n-grams of the raw
# text, then the standardised numeric features. Columns not named here
# (including the label) are dropped by the ColumnTransformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("tfidf", TfidfVectorizer(
            max_features=20000,
            ngram_range=(1,3),
            sublinear_tf=True
        ), TEXT_COL),
        ("num", StandardScaler(), numeric_cols),
    ]
)

# Preprocessing and classifier in one pipeline, so the same transformations
# are applied at fit and predict time.
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("logreg", LogisticRegression(
        max_iter=3000,
        class_weight="balanced",
        n_jobs=-1
    ))
])

print("Training model...")
clf.fit(train_df, train_df[LABEL_COL])

val_pred = clf.predict(val_df)
test_pred = clf.predict(test_df)

print("\n=== VALIDATION REPORT ===")
print(classification_report(val_df[LABEL_COL], val_pred, digits=4))

print("\n=== TEST REPORT ===")
print(classification_report(test_df[LABEL_COL], test_pred, digits=4))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---- Pull the fitted steps back out of the pipeline ----
logreg = clf.named_steps["logreg"]
preproc = clf.named_steps["preprocess"]

# Vocabulary learned by the fitted TF-IDF transformer
tfidf = preproc.named_transformers_["tfidf"]
tfidf_features = tfidf.get_feature_names_out()

# Names of the numeric block are simply the numeric_cols list
num_features = np.array(numeric_cols)

# Same order as the ColumnTransformer output: TF-IDF block, then numeric block
all_feature_names = np.concatenate([tfidf_features, num_features])

# Binary problem: coef_ has a single row, holding the weights for class 1
coefs = logreg.coef_[0]

coef_df = pd.DataFrame({"feature": all_feature_names, "coef": coefs})

# ---- 20 strongest weights in each direction ----
top_pos = coef_df.nlargest(20, "coef")   # pushes towards class 1 (AI)
top_neg = coef_df.nsmallest(20, "coef")  # pushes towards class 0 (Human)

fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# One horizontal bar chart per direction, drawn with identical styling
panels = (
    (top_pos, "Top Features Pushing Towards AI (class 1)"),
    (top_neg, "Top Features Pushing Towards Human (class 0)"),
)
for ax, (ranked, title) in zip(axes, panels):
    ax.barh(ranked["feature"], ranked["coef"])
    ax.set_title(title)
    ax.invert_yaxis()  # strongest feature at the top
    ax.set_xlabel("Coefficient value")

plt.tight_layout()
plt.show()

top_pos, top_neg.head()

In [None]:
import pandas as pd

def predict_text_is_ai(text: str):
    """
    Classify one raw text string with the trained ``clf`` pipeline.

    The stylometric features are computed with ``compute_all_features``, the
    same function used to build the training data, and combined with the raw
    text into a one-row DataFrame for the pipeline.

    Returns a dict with ``predicted_class`` (int), ``label`` (display string),
    ``confidence_ai`` and ``confidence_human`` (floats taken from
    ``predict_proba``).
    """
    # Raw text plus its engineered features, as a single record
    record = {TEXT_COL: text, **compute_all_features(text)}

    # Any expected numeric column the extractor did not produce falls back
    # to 0.0 (should not happen if the functions are consistent)
    for col in numeric_cols:
        record.setdefault(col, 0.0)

    frame = pd.DataFrame([record])

    predicted = clf.predict(frame)[0]          # 0 or 1
    probabilities = clf.predict_proba(frame)[0]  # [P(human), P(AI)]

    if predicted == 1:
        label = "AI-generated"
    else:
        label = "Human-written"

    return {
        "predicted_class": int(predicted),
        "label": label,
        "confidence_ai": float(probabilities[1]),
        "confidence_human": float(probabilities[0]),
    }

# Simple interactive loop: read a text, classify it, print both probabilities.
while True:
    try:
        user_text = input("\nPaste text to test (or type 'quit' to stop):\n> ")
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the user interrupted the prompt: leave the loop
        # cleanly instead of ending the cell with a traceback.
        print("\nExiting.")
        break

    command = user_text.strip().lower()
    if command in {"quit", "exit", "q"}:
        print("Exiting.")
        break

    # A blank line contains nothing to classify; ask again rather than
    # reporting a prediction for empty text.
    if not command:
        print("No text entered; please paste some text.")
        continue

    result = predict_text_is_ai(user_text)
    print(f"\nPrediction: {result['label']}")
    print(f"  → P(Human) = {result['confidence_human']:.3f}")
    print(f"  → P(AI)    = {result['confidence_ai']:.3f}")