In [6]:
import numpy as np
import json

# ========== STEP 1.1: Load train .npy files ==========
human_data = np.load('data/train/train_human.npy')  # shape: (N1, 100, 768)
ai_data = np.load('data/train/train_ai.npy')        # shape: (N2, 100, 768)

# ========== STEP 1.2: Create labels ==========
human_labels = np.zeros(len(human_data), dtype=int)  # Label 0 = Human
ai_labels = np.ones(len(ai_data), dtype=int)         # Label 1 = AI

# ========== STEP 1.3: Combine train data ==========
X_train = np.concatenate([human_data, ai_data], axis=0)   # shape: (N_total, 100, 768)
y_train = np.concatenate([human_labels, ai_labels], axis=0)

print("Train Data Shape:", X_train.shape)
print("Train Labels Shape:", y_train.shape)


# ========== STEP 1.4: Robust JSONL loader (flattened) ==========
def load_jsonl_embeddings(path, expected_shape=(100, 768)):
    """
    Load and flatten valid embeddings from a .jsonl file.
    Each line may contain a list of (100, 768) items.
    Returns: (N_total, 100, 768)
    """
    embeddings = []
    skipped = 0
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            try:
                item = json.loads(line)
                features = item['features']
                for j, feat in enumerate(features):
                    arr = np.array(feat)
                    if arr.shape == expected_shape:
                        embeddings.append(arr)
                    else:
                        skipped += 1
                        print(f"Skipped sample {i}.{j}: wrong shape {arr.shape}")
            except Exception as e:
                skipped += 1
                print(f"Error parsing line {i}: {e}")
    print(f"Loaded {len(embeddings)} samples from {path} | Skipped: {skipped}")
    return np.stack(embeddings)


# ========== STEP 1.5: Load validation embeddings ==========
val_path = 'data/val/validation.jsonl'
X_val = load_jsonl_embeddings(val_path)
print("Validation shape:", X_val.shape)

# ========== STEP 1.6: Load test embeddings ==========
test_path = 'data/test/test_features.jsonl'
X_test = load_jsonl_embeddings(test_path)
print("Test shape:", X_test.shape)


Train Data Shape: (16322, 100, 768)
Train Labels Shape: (16322,)
Loaded 220 samples from data/val/validation.jsonl | Skipped: 0
Validation shape: (220, 100, 768)
Loaded 1686 samples from data/test/test_features.jsonl | Skipped: 0
Test shape: (1686, 100, 768)


#Step 2: Feature Engineering — prepare_features()
We’ll implement:

Mean pooling (→ 768)

Segment norms (early/mid/late)

Positional entropy

Token embedding variance

Cosine similarity to class centroids

Final DataFrame output

In [7]:
import numpy as np
import pandas as pd
from scipy.stats import entropy
from sklearn.metrics.pairwise import cosine_similarity
from numpy.linalg import norm


In [8]:
# Mean pooling (N, 100, 768) → (N, 768)
def mean_pool(X):
    return np.mean(X, axis=1)

# Compute centroids using mean-pooled train data
human_centroid = mean_pool(human_data).mean(axis=0)
ai_centroid = mean_pool(ai_data).mean(axis=0)


Step 2: Full Feature Engineering Function

In [9]:
def prepare_features(X, human_centroid, ai_centroid):
    """
    Extracts advanced features from (N, 100, 768) embeddings.
    Returns: pd.DataFrame of shape (N, num_features)
    """
    N = X.shape[0]
    
    # Feature 1: Mean pooling (N, 768)
    mean_pooled = np.mean(X, axis=1)  # shape: (N, 768)

    # Feature 2: Segment L2 norms (early, mid, late)
    early_norm = norm(X[:, :33, :], axis=2).mean(axis=1)
    mid_norm   = norm(X[:, 33:66, :], axis=2).mean(axis=1)
    late_norm  = norm(X[:, 66:, :], axis=2).mean(axis=1)

    # Feature 3: Positional entropy
    l2_norms = norm(X, axis=2)  # (N, 100)
    norm_probs = l2_norms / l2_norms.sum(axis=1, keepdims=True)
    entropy_vals = entropy(norm_probs + 1e-8, axis=1)  # prevent log(0)

    # Feature 4: Token-wise cosine similarity std (sharpness)
    cosine_sim = []
    for i in range(N):
        token_vecs = X[i]                   # (100, 768)
        mean_vec = mean_pooled[i]          # (768,)
        sims = cosine_similarity(token_vecs, mean_vec.reshape(1, -1)).flatten()
        cosine_sim.append(np.std(sims))    # sharpness = std of similarities
    sharpness = np.array(cosine_sim)

    # Feature 5: Cosine similarity to centroids
    sim_to_human = cosine_similarity(mean_pooled, human_centroid.reshape(1, -1)).flatten()
    sim_to_ai    = cosine_similarity(mean_pooled, ai_centroid.reshape(1, -1)).flatten()

    # Final DataFrame
    df = pd.DataFrame({
        'early_norm': early_norm,
        'mid_norm': mid_norm,
        'late_norm': late_norm,
        'entropy': entropy_vals,
        'sharpness': sharpness,
        'sim_to_human': sim_to_human,
        'sim_to_ai': sim_to_ai
    })

    # Add mean pooled vectors (768D)
    for i in range(mean_pooled.shape[1]):
        df[f'mean_{i}'] = mean_pooled[:, i]

    return df


In [10]:
X_train_feats = prepare_features(X_train, human_centroid, ai_centroid)
X_val_feats = prepare_features(X_val, human_centroid, ai_centroid)
X_test_feats = prepare_features(X_test, human_centroid, ai_centroid)


  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_{i}'] = mean_pooled[:, i]
  df[f'mean_

#step 3: Feature Scaling & Saving (Pre-Model)
Before we train models, we'll:

 Step 3.1: Scale the features (MinMaxScaler or StandardScaler)
Neural networks and SVMs require scaled inputs.

Tree-based models like XGBoost can optionally use raw features.

In [11]:
from sklearn.preprocessing import StandardScaler
import joblib

# Step 3.1: Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_feats)
X_val_scaled   = scaler.transform(X_val_feats)
X_test_scaled  = scaler.transform(X_test_feats)

# Step 3.2: Save scaled features and scaler (optional)
np.save('X_train_scaled.npy', X_train_scaled)
np.save('X_val_scaled.npy', X_val_scaled)
np.save('X_test_scaled.npy', X_test_scaled)
np.save('y_train.npy', y_train)
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [25]:
# ==============================
#  FULL ADVANCED PREPROCESSING PIPELINE (MEMORY-EFFICIENT + ROBUST)
# ==============================

import numpy as np
import pandas as pd
import json, os
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import entropy, kurtosis
from numpy.linalg import norm
from sklearn.cluster import KMeans
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import trustworthiness
import umap

# === Utility ===
def mean_pool(X):
    return np.mean(X, axis=1)  # Mean pooling along token axis (100 → 768)

def load_jsonl_embeddings(path, expected_shape=(100, 768)):
    embeddings = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                item = json.loads(line)
                feats = np.array(item['features'])
                if feats.shape == expected_shape:
                    embeddings.append(feats)
            except Exception:
                continue
    return np.stack(embeddings) if embeddings else None  # Return stacked array or None

# === Feature Engineering ===
def prepare_features(X, human_centroid, ai_centroid):
    N, seq_len, dim = X.shape
    mean_embed = mean_pool(X)  # Mean embedding for each sample
    token_norms = norm(X, axis=2)  # L2 norm of each token

    # Basic statistical features on token norms
    df = pd.DataFrame({
        'avg_token_norm': token_norms.mean(axis=1),
        'std_token_norm': token_norms.std(axis=1),
        'entropy_norm': entropy((token_norms + 1e-8)/ (token_norms.sum(axis=1, keepdims=True)+1e-8), axis=1),
        'kurtosis_norm': kurtosis(token_norms, axis=1),
        'sim_to_human': cosine_similarity(mean_embed, human_centroid.reshape(1, -1)).ravel(),
        'sim_to_ai': cosine_similarity(mean_embed, ai_centroid.reshape(1, -1)).ravel()
    })
    df['sim_ratio'] = df['sim_to_ai'] / (df['sim_to_human'] + 1e-8)  # Ratio feature

    # PCA for semantic compression
    pca = PCA(n_components=50, random_state=42)
    df_pca = pd.DataFrame(pca.fit_transform(mean_embed), columns=[f'pca_{i}' for i in range(50)])
    df = pd.concat([df, df_pca], axis=1)

    # Clustering over semantic space
    cluster_model = KMeans(n_clusters=8, random_state=42, n_init='auto')
    df['cluster_id'] = cluster_model.fit_predict(mean_embed)

    # UMAP projection trustworthiness as a manifold quality score
    reducer = umap.UMAP(n_neighbors=15, n_components=2, random_state=None, n_jobs=-1)
    umap_embed = reducer.fit_transform(mean_embed)
    trust_scores = np.full(N, trustworthiness(mean_embed, umap_embed, n_neighbors=10))
    df['trustworthiness'] = trust_scores

    return df

# === Feature Selection ===
def feature_selection(df, y):
    selector = VarianceThreshold(threshold=0.01)  # Remove low-variance features
    df = pd.DataFrame(selector.fit_transform(df))
    top_k = SelectKBest(score_func=f_classif, k=min(100, df.shape[1]))  # Keep top features
    df = pd.DataFrame(top_k.fit_transform(df, y))
    return df

# === Main Pipeline ===
def full_preprocessing_pipeline(human_npy_path, ai_npy_path, val_jsonl_path=None, scaler_type='standard'):
    # Load data
    human_data = np.load(human_npy_path)
    ai_data = np.load(ai_npy_path)

    # Compute centroids for cosine-based features
    human_centroid = mean_pool(human_data).mean(axis=0)
    ai_centroid = mean_pool(ai_data).mean(axis=0)

    # Extract features
    human_feats = prepare_features(human_data, human_centroid, ai_centroid)
    ai_feats = prepare_features(ai_data, human_centroid, ai_centroid)

    # Combine and label data
    X_df = pd.concat([human_feats, ai_feats], axis=0).reset_index(drop=True)
    y = np.concatenate([np.zeros(len(human_feats)), np.ones(len(ai_feats))])

    # Select top features
    X_selected = feature_selection(X_df, y)

    # Scale features
    scaler = StandardScaler() if scaler_type == 'standard' else RobustScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Handle imbalance with SMOTE
    sm = SMOTE(random_state=42)
    X_balanced, y_balanced = sm.fit_resample(X_scaled, y)

    # Basic model for pseudo-labeling
    model = LogisticRegression()
    model.fit(X_balanced, y_balanced)

    # Pseudo-label validation set
    if val_jsonl_path:
        val_data = load_jsonl_embeddings(val_jsonl_path)
        if val_data is not None:
            val_feats = prepare_features(val_data, human_centroid, ai_centroid)
            val_selected = feature_selection(val_feats, model.predict(X_balanced))
            val_scaled = scaler.transform(val_selected)
            probs = model.predict_proba(val_scaled)
            confident = (probs.max(axis=1) > 0.98)  # High-confidence threshold
            X_pseudo = val_scaled[confident]
            y_pseudo = model.predict(val_scaled[confident])
            X_balanced = np.vstack([X_balanced, X_pseudo])
            y_balanced = np.hstack([y_balanced, y_pseudo])

    # Save final processed dataset
    os.makedirs("processed_final", exist_ok=True)
    np.save("processed_final/X_balanced.npy", X_balanced)
    np.save("processed_final/y_balanced.npy", y_balanced)

    return X_balanced, y_balanced


In [26]:
data = full_preprocessing_pipeline(
    human_npy_path='data/train/train_human.npy',
    ai_npy_path='data/train/train_ai.npy',
    val_jsonl_path='data/val/validation.jsonl',
   # test_jsonl_path='data/test/test_features.jsonl'
)
