# Method2 converted to Method1 structure

This notebook contains Method2 logic restructured to follow the Method1 cell layout.

**Notes:**
- GLCM features are computed first, then LBP, then intensity stats.
- Pipelines and GroupKFold usage are included as in Method1 structure.
- Run cells sequentially.


In [13]:
pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   --- ------------------------------------ 0.8/8.7 MB 4.6 MB/s eta 0:00:02
   -------- ------------------------------- 1.8/8.7 MB 4.9 MB/s eta 0:00:02
   -------------- ------------------------- 3.1/8.7 MB 5.6 MB/s eta 0:00:01
   -------------------- ------------------- 4.5/8.7 MB 5.8 MB/s eta 0:00:01
   -------------------------- ------------- 5.8/8.7 MB 5.8 MB/s eta 0:00:01
   ------------------------------- -------- 6.8/8.7 MB 6.0 MB/s eta 0:00:01
   ------------------------------------ --- 7.9/8.7 MB 5.8 MB/s eta 0:00:01
   --


[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:

# Cell: imports
import os, math, time, pickle, json
from collections import Counter

import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt

from scipy.stats import skew
from skimage.feature import local_binary_pattern, graycomatrix, graycoprops

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GroupKFold, GridSearchCV, GroupShuffleSplit
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, confusion_matrix, classification_report


In [15]:

# Cell: Metadata & helpers
def read_metadata(meta_path):
    # robust whitespace-delimited read; uppercase column names
    df = pd.read_csv(meta_path, delim_whitespace=True, dtype=str)
    df.columns = [c.strip().upper() for c in df.columns]
    # try known label columns
    if 'SEVERITY' in df.columns:
        df['CANCER'] = df['SEVERITY'].map({'B':1, 'M':1}).fillna(0).astype(int)
    elif 'CLASS' in df.columns:
        df['CANCER'] = df['CLASS'].map({'B':1, 'M':1}).fillna(0).astype(int)
    else:
        # fallback: if a numeric label column exists, try to use it
        for col in ['LABEL', 'DIAGNOSIS', 'PATHOLOGY']:
            if col in df.columns:
                try:
                    df['CANCER'] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
                    break
                except Exception:
                    pass
        if 'CANCER' not in df.columns:
            raise ValueError("Cannot find SEVERITY/CLASS or equivalent label column in metadata.")
    # numeric conversions
    for c in ['RADIUS','X','Y']:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')
        else:
            df[c] = pd.NA
    # patient id inference from REFNUM (e.g., mdb001/mdb002 -> patient grouping)
    def _ref_to_patient(ref):
        try:
            s = str(ref)
            digits = ''.join(ch for ch in s if ch.isdigit())
            if digits == '':
                return None
            n = int(digits)
            return ((n-1)//2) + 1
        except Exception:
            return None
    if 'REFNUM' in df.columns:
        df['patient_id'] = df['REFNUM'].map(_ref_to_patient)
    else:
        df['patient_id'] = None
    return df

def preprocess_img(img):
    # CLAHE equalization; expects grayscale uint8 input
    if img is None:
        return None
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    return clahe.apply(img)

def roi_from_row(img_eq, row, image_size=1024, median_radius=48, min_side=32):
    # Return square ROI; if lesion coords available use them, else center crop using median_radius
    label = int(row['CANCER']) if 'CANCER' in row else 0
    x = row.get('X', None); y = row.get('Y', None); r = row.get('RADIUS', None)
    H, W = img_eq.shape
    if label == 1 and pd.notna(x) and pd.notna(y) and pd.notna(r):
        cx = int(x)
        cy = int(image_size - float(y))  # convert MIAS bottom-left -> top-left
        radius = int(r)
        x0 = max(0, cx-radius); x1 = min(W, cx+radius)
        y0 = max(0, cy-radius); y1 = min(H, cy+radius)
        roi = img_eq[y0:y1, x0:x1]
        if roi.size == 0:
            roi = img_eq.copy()
    else:
        radius = int(median_radius)
        cx, cy = W//2, H//2
        x0 = max(0, cx-radius); x1 = min(W, cx+radius)
        y0 = max(0, cy-radius); y1 = min(H, cy+radius)
        roi = img_eq[y0:y1, x0:x1]
    # pad if too small
    h,w = roi.shape
    if h < min_side or w < min_side:
        top = max(0, (min_side-h)//2); bottom = max(0, min_side-h-top)
        left = max(0, (min_side-w)//2); right = max(0, min_side-w-left)
        roi = cv2.copyMakeBorder(roi, top, bottom, left, right, cv2.BORDER_REFLECT)
    return roi


In [17]:

# Cell: Feature extraction functions (GLCM first, then LBP, then intensity stats)
def compute_glcm_features(roi, distances=(1,), angles=(0, np.pi/4, np.pi/2, 3*np.pi/4), levels=32):
    roi_q = (roi / (256.0/levels)).astype(np.uint8)
    glcm = graycomatrix(roi_q, distances=list(distances), angles=list(angles), levels=levels, symmetric=True, normed=True)
    props = ['contrast','dissimilarity','homogeneity','energy','correlation']
    feats = [float(graycoprops(glcm, p).mean()) for p in props]
    return np.array(feats, dtype=float)

def compute_lbp_hist(roi, P=8, radii=(1,3), n_bins=59):
    feats = []
    for R in radii:
        lbp = local_binary_pattern(roi, P, R, method='uniform')
        hist, _ = np.histogram(lbp.ravel(), bins=n_bins, range=(0, n_bins))
        s = hist.sum()
        if s == 0:
            feats.extend([0.0]*n_bins)
        else:
            feats.extend((hist.astype(float)/s).tolist())
    return np.array(feats, dtype=float)

def intensity_stats(roi):
    return np.array([float(roi.mean()), float(roi.std()), float(roi.min()), float(roi.max())], dtype=float)


In [18]:

# Cell: convert helper and sample plotting utility
def convert(o):
    if isinstance(o, (np.integer,)):
        return int(o)
    if isinstance(o, (np.floating,)):
        return float(o)
    if isinstance(o, (np.ndarray,)):
        return o.tolist()
    return str(o)

def plot_sample_rois(sample_rois, outpath):
    import matplotlib.pyplot as plt
    fig = plt.figure(figsize=(10,4))
    for idx, (ref, lab, grp, roi_img) in enumerate(sample_rois):
        ax = fig.add_subplot(2,3, idx+1)
        ax.imshow(roi_img, cmap='gray'); ax.set_title(f"{ref} L={lab} G={grp}"); ax.axis('off')
    fig.tight_layout()
    fig.savefig(outpath)
    plt.close(fig)


In [19]:

# Cell: Feature extraction loop (run this cell to extract features and save features.pkl)
# Edit args below for paths and sizes if needed (these match Method1 CLI names)
class Args:
    images = "all-mias"
    meta = "data2.txt"
    outdir = "results_method2"
    target_size = 128
    min_side = 32
    image_size = 1024
    P = 8
    lbp_radii = [1,3]
    lbp_bins = 59
    glcm_distances = [1]
    glcm_levels = 32
    select_k = 100
    rf_estimators = 300
    n_permutations = 200
    perm_test_holdout_fraction = 0.2
    n_jobs = -1
    random_state = 42

args = Args()

os.makedirs(args.outdir, exist_ok=True)

df = read_metadata(args.meta)
print("[INFO] Metadata rows:", len(df))
radii = pd.to_numeric(df['RADIUS'], errors='coerce').dropna()
median_radius = int(radii.median()) if radii.size>0 else 48
print("[INFO] median_radius:", median_radius)

images = sorted([f for f in os.listdir(args.images) if f.lower().endswith('.pgm')])
print("[INFO] Found images:", len(images))

features = []
labels = []
groups = []
sample_rois = []

for fname in images:
    ref = os.path.splitext(fname)[0]
    row = df[df['REFNUM'] == ref]
    if row.empty:
        # debug info: no metadata row
        # print(f"[DEBUG] No metadata for {ref}")
        continue
    row = row.iloc[0]
    img_path = os.path.join(args.images, fname)
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        print(f"[WARN] cannot read {img_path}, skipping")
        continue
    img_eq = preprocess_img(img)
    if img_eq is None:
        print(f"[WARN] preprocessed image too small: {img_path}")
        continue
    roi = roi_from_row(img_eq, row, image_size=args.image_size, median_radius=median_radius, min_side=args.min_side)
    roi_resized = cv2.resize(roi, (args.target_size, args.target_size), interpolation=cv2.INTER_AREA)
    # compute features: GLCM first, then LBP, then intensity
    glcm_feats = compute_glcm_features(roi_resized, distances=tuple(args.glcm_distances), angles=(0, math.pi/4, math.pi/2, 3*math.pi/4), levels=args.glcm_levels)
    lbp_feats = compute_lbp_hist(roi_resized, P=args.P, radii=tuple(args.lbp_radii), n_bins=args.lbp_bins)
    int_feats = intensity_stats(roi_resized)
    feat_vec = np.concatenate([glcm_feats, lbp_feats, int_feats]).astype(float)
    features.append(feat_vec)
    labels.append(int(row['CANCER']))
    groups.append(row['patient_id'])
    if len(sample_rois) < 6:
        sample_rois.append((ref, int(row['CANCER']), row['patient_id'], roi_resized))

X = np.vstack(features) if len(features)>0 else np.zeros((0, args.lbp_bins*len(args.lbp_radii) + 5))
y = np.array(labels, dtype=int)
groups_arr = np.array(groups)
print("[INFO] Extracted features shape:", X.shape)
print("Label distribution:", Counter(y))

with open(os.path.join(args.outdir, "features.pkl"), "wb") as f:
    pickle.dump({"X": X, "y": y, "groups": groups_arr, "cfg": vars(args)}, f)
print("[INFO] Saved features.pkl")

# save sample rois preview
plot_sample_rois(sample_rois, os.path.join(args.outdir, "sample_rois.png"))
print("[INFO] Saved sample_rois.png")


  df = pd.read_csv(meta_path, delim_whitespace=True, dtype=str)


[INFO] Metadata rows: 324
[INFO] median_radius: 43
[INFO] Found images: 324
[INFO] Extracted features shape: (324, 127)
Label distribution: Counter({np.int64(0): 207, np.int64(1): 117})
[INFO] Saved features.pkl
[INFO] Saved sample_rois.png


In [20]:

# Cell: Training, nested group-aware CV and final model fit
# Assumes features.pkl produced by previous cell
with open(os.path.join(args.outdir, "features.pkl"), "rb") as f:
    dd = pickle.load(f)
X = dd['X']; y = dd['y']; groups_arr = dd['groups']

n_groups = len(set([g for g in groups_arr if g is not None]))
n_splits = min(5, max(2, n_groups))
print("[INFO] Running GroupKFold with n_splits =", n_splits)

outer_cv = GroupKFold(n_splits=n_splits)
inner_cv = GroupKFold(n_splits=max(2, n_splits-1))

selector = SelectKBest(mutual_info_classif, k=min(args.select_k, X.shape[1]))
scaler = StandardScaler()
rf_clf = RandomForestClassifier(n_estimators=args.rf_estimators, class_weight='balanced', random_state=args.random_state)
svm_clf = SVC(probability=True, class_weight='balanced', random_state=args.random_state)

pipelines = {
    'rf': Pipeline([('scaler', scaler), ('select', selector), ('clf', rf_clf)]),
    'svm': Pipeline([('scaler', scaler), ('select', selector), ('clf', svm_clf)])
}

rf_grid = {'clf__n_estimators': [args.rf_estimators]}
svm_grid = {'clf__C': [1.0], 'clf__gamma': ['scale']}

results = {k: [] for k in pipelines.keys()}
best_models = {}

fold = 0
for train_idx, test_idx in outer_cv.split(X, y, groups_arr):
    fold += 1
    Xtr, Xte = X[train_idx], X[test_idx]
    ytr, yte = y[train_idx], y[test_idx]
    gtr, gte = groups_arr[train_idx], groups_arr[test_idx]
    print(f"[INFO] Outer fold {fold}: train={len(train_idx)} test={len(test_idx)}")

    for name, pipe in pipelines.items():
        param_grid = rf_grid if name=='rf' else svm_grid
        gs = GridSearchCV(pipe, param_grid, cv=inner_cv, scoring='roc_auc', n_jobs=args.n_jobs, refit=True)
        gs.fit(Xtr, ytr, groups=gtr)
        best = gs.best_estimator_
        ypred = best.predict(Xte)
        yprob = best.predict_proba(Xte)[:,1] if hasattr(best, "predict_proba") else best.decision_function(Xte)
        acc = accuracy_score(yte, ypred)
        rec = recall_score(yte, ypred, zero_division=0)
        auc = roc_auc_score(yte, yprob) if len(np.unique(yte))>1 else float('nan')
        cm = confusion_matrix(yte, ypred)
        results[name].append({'fold': fold, 'acc': acc, 'recall': rec, 'auc': auc, 'cm': cm, 'best_params': gs.best_params_})
        best_models[name] = gs.best_estimator_
        print(f"[FOLD {fold}][{name}] acc={acc:.3f} rec={rec:.3f} auc={auc:.3f}")

# summarize and save report
summary = {'n_samples': int(X.shape[0]), 'n_features': int(X.shape[1]), 'label_counts': dict(pd.Series(y).value_counts()), 'unique_groups': int(len(set(groups_arr)))}
for name in results:
    arr_acc = np.array([r['acc'] for r in results[name]])
    arr_rec = np.array([r['recall'] for r in results[name]])
    arr_auc = np.array([r['auc'] for r in results[name] if not np.isnan(r['auc'])])
    summary[f'cv_{name}'] = {'mean_acc': float(np.nanmean(arr_acc)), 'std_acc': float(np.nanstd(arr_acc)), 'mean_rec': float(np.nanmean(arr_rec)), 'std_rec': float(np.nanstd(arr_rec)), 'mean_auc': float(np.nanmean(arr_auc)) if arr_auc.size>0 else None}

# refit final models on full data and save
for name, model in best_models.items():
    try:
        model.fit(X, y)
        with open(os.path.join(args.outdir, f"final_model_{name}.pkl"), "wb") as f:
            pickle.dump(model, f)
    except Exception as e:
        print("Could not refit/save", name, e)

with open(os.path.join(args.outdir, "report.json"), "w") as f:
    json.dump(summary, f, indent=2, default=convert)

print("[INFO] saved report.json and final models")
print(json.dumps(summary, indent=2, default=convert))


[INFO] Running GroupKFold with n_splits = 5
[INFO] Outer fold 1: train=258 test=66
[FOLD 1][rf] acc=0.909 rec=0.739 auc=0.935
[FOLD 1][svm] acc=0.909 rec=0.739 auc=0.913
[INFO] Outer fold 2: train=258 test=66
[FOLD 2][rf] acc=0.894 rec=0.750 auc=0.928
[FOLD 2][svm] acc=0.879 rec=0.714 auc=0.896
[INFO] Outer fold 3: train=260 test=64
[FOLD 3][rf] acc=0.859 rec=0.667 auc=0.965
[FOLD 3][svm] acc=0.891 rec=0.708 auc=0.931
[INFO] Outer fold 4: train=260 test=64
[FOLD 4][rf] acc=0.969 rec=0.895 auc=0.953
[FOLD 4][svm] acc=0.922 rec=0.789 auc=0.958
[INFO] Outer fold 5: train=260 test=64
[FOLD 5][rf] acc=0.891 rec=0.739 auc=0.918
[FOLD 5][svm] acc=0.844 rec=0.739 auc=0.870
[INFO] saved report.json and final models
{
  "n_samples": 324,
  "n_features": 127,
  "label_counts": {
    "0": 207,
    "1": 117
  },
  "unique_groups": 162,
  "cv_rf": {
    "mean_acc": 0.9043560606060606,
    "std_acc": 0.03602594344053369,
    "mean_rec": 0.7579328756674295,
    "std_rec": 0.07458530716073254,
    "mea

In [21]:

# Cell: Permutation testing (group-wise). This uses a held-out group split.
with open(os.path.join(args.outdir, "features.pkl"), "rb") as f:
    dd = pickle.load(f)
X = dd['X']; y = dd['y']; groups_arr = dd['groups']

gss = GroupShuffleSplit(n_splits=1, test_size=args.perm_test_holdout_fraction, random_state=args.random_state)
train_idx_perm, test_idx_perm = next(gss.split(X, y, groups_arr))
Xtr_perm, Xte_perm = X[train_idx_perm], X[test_idx_perm]
ytr_perm, yte_perm = y[train_idx_perm], y[test_idx_perm]
gtr_perm, gte_perm = groups_arr[train_idx_perm], groups_arr[test_idx_perm]

# Load final RF (if saved) or train fresh
rf_path = os.path.join(args.outdir, "final_model_rf.pkl")
if os.path.exists(rf_path):
    with open(rf_path, "rb") as f:
        rf_model = pickle.load(f)
else:
    rf_model = RandomForestClassifier(n_estimators=args.rf_estimators, class_weight='balanced', random_state=args.random_state)
    rf_model.fit(Xtr_perm, ytr_perm)

if hasattr(rf_model, "predict_proba"):
    yprob = rf_model.predict_proba(Xte_perm)[:,1]
else:
    yprob = rf_model.decision_function(Xte_perm)

real_auc = roc_auc_score(yte_perm, yprob) if len(np.unique(yte_perm))>1 else float('nan')
perm_scores = []
n_perm = args.n_permutations
for i in range(n_perm):
    # permute labels at the group level
    perm_groups = np.unique(gtr_perm)
    perm_map = np.random.permutation(perm_groups)
    group_to_label = {g: int(pd.Series(ytr_perm[gtr_perm==g]).mode().iloc[0]) for g in perm_groups}
    # create a permuted label vector for training by shuffling group labels
    permuted_group_labels = np.random.permutation([group_to_label[g] for g in perm_groups])
    ytr_p = ytr_perm.copy()
    for g, lab in zip(perm_groups, permuted_group_labels):
        ytr_p[gtr_perm==g] = lab
    # fit clone
    rf_clone = RandomForestClassifier(n_estimators=args.rf_estimators, class_weight='balanced', random_state=args.random_state)
    rf_clone.fit(Xtr_perm, ytr_p)
    try:
        prob_p = rf_clone.predict_proba(Xte_perm)[:,1]
        auc_p = roc_auc_score(yte_perm, prob_p) if len(np.unique(yte_perm))>1 else float('nan')
    except:
        auc_p = float('nan')
    perm_scores.append(auc_p)

pvalue = (np.sum(np.array(perm_scores) >= real_auc) + 1) / (len(perm_scores) + 1)
print("[INFO] Permutation real_auc:", real_auc, "pvalue:", pvalue)
# Save sample of permutation scores
with open(os.path.join(args.outdir, "perm_scores_sample.pkl"), "wb") as f:
    pickle.dump(perm_scores[:min(1000,len(perm_scores))], f)
print("[INFO] saved perm_scores_sample.pkl")


[INFO] Permutation real_auc: 1.0 pvalue: 0.004975124378109453
[INFO] saved perm_scores_sample.pkl


In [22]:
# --- Held-out evaluation (robust) ---
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, recall_score, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Ensure X, y, groups (or groups_arr) exist in the notebook
# Use existing variable names if different (e.g., groups_arr)
try:
    # prefer existing groups_arr if present
    groups_arr = globals().get('groups_arr', globals().get('groups', None))
    if groups_arr is None:
        raise NameError("No 'groups' or 'groups_arr' variable found. Please run feature extraction cell first.")
    # convert to numpy array if needed
    if isinstance(groups_arr, list):
        groups_arr = np.array(groups_arr)
    elif hasattr(groups_arr, "values"):  # pandas Series
        groups_arr = groups_arr.values
except Exception as e:
    raise RuntimeError(f"Error preparing groups array: {e}")

print(f"[INFO] groups_arr type: {type(groups_arr)}, shape: {getattr(groups_arr,'shape',None)}")

# Create a patient-wise held-out split
gss = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups_arr))

Xtr, Xte = X[train_idx], X[test_idx]
ytr, yte = y[train_idx], y[test_idx]
gtr, gte = groups_arr[train_idx], groups_arr[test_idx]

print(f"[INFO] Held-out train samples: {len(train_idx)}, test samples: {len(test_idx)}")
print(f"[INFO] Unique train groups: {len(set(gtr))}, Unique test groups: {len(set(gte))}")

# Select a model from best_models (robust)
chosen_model = None
if 'best_models' in globals():
    bm = globals()['best_models']
    # dict (e.g., {'rf': model, 'svm': model})
    if isinstance(bm, dict):
        if 'rf' in bm:
            chosen_model = bm['rf']
            print("[INFO] Selected best_models['rf']")
        else:
            # pick first model in dict
            first_key = next(iter(bm.keys()))
            chosen_model = bm[first_key]
            print(f"[INFO] Selected best_models['{first_key}']")
    elif isinstance(bm, list):
        if len(bm) > 0:
            chosen_model = bm[-1]
            print("[INFO] Selected last model from best_models list")
# If nothing found, fallback to fresh RF
if chosen_model is None:
    print("[WARN] best_models not found or empty — falling back to a fresh RandomForest.")
    chosen_model = RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=42)

# Fit chosen model on training partition (refit to be safe)
chosen_model.fit(Xtr, ytr)

# Evaluate
ypred = chosen_model.predict(Xte)
if hasattr(chosen_model, "predict_proba"):
    yprob = chosen_model.predict_proba(Xte)[:,1]
elif hasattr(chosen_model, "decision_function"):
    yprob = chosen_model.decision_function(Xte)
else:
    yprob = None

acc = accuracy_score(yte, ypred)
rec = recall_score(yte, ypred, zero_division=0)
cm = confusion_matrix(yte, ypred)
auc = float('nan')
if yprob is not None and len(np.unique(yte)) > 1:
    try:
        auc = roc_auc_score(yte, yprob)
    except Exception as e:
        print("[WARN] Could not compute AUC:", e)

print("\n=== Held-out evaluation: Selected Model ===")
print(f"Accuracy: {acc:.4f}")
print(f"Recall: {rec:.4f}")
print(f"AUC: {auc}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(yte, ypred))


[INFO] groups_arr type: <class 'numpy.ndarray'>, shape: (324,)
[INFO] Held-out train samples: 258, test samples: 66
[INFO] Unique train groups: 129, Unique test groups: 33
[INFO] Selected best_models['rf']

=== Held-out evaluation: Selected Model ===
Accuracy: 0.8788
Recall: 0.7917
AUC: 0.9330357142857143
Confusion Matrix:
 [[39  3]
 [ 5 19]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.93      0.91        42
           1       0.86      0.79      0.83        24

    accuracy                           0.88        66
   macro avg       0.88      0.86      0.87        66
weighted avg       0.88      0.88      0.88        66



In [23]:
print(type(groups_arr), getattr(groups_arr,'shape',None))
print("train_idx type:", type(train_idx), "len:", len(train_idx))


<class 'numpy.ndarray'> (324,)
train_idx type: <class 'numpy.ndarray'> len: 258
