<a href="https://colab.research.google.com/github/Dikchik9100/genome/blob/main/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# leukemia_logreg_openml.py
import openml
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import sys
import warnings
warnings.filterwarnings("ignore")

def _as_feature_frame(X, attribute_names):
    """Return the feature matrix as a DataFrame, naming columns if X is a bare array."""
    if isinstance(X, pd.DataFrame):
        return X.copy()

    n_cols = X.shape[1]
    if attribute_names is None:
        # No names supplied: fall back to synthetic f0, f1, ... column names.
        cols = [f"f{i}" for i in range(n_cols)]
    elif len(attribute_names) == n_cols + 1:
        # Exactly one extra name: assume the trailing entry is the target and drop it.
        cols = list(attribute_names[:-1])
    else:
        cols = list(attribute_names)
    return pd.DataFrame(X, columns=cols)


def _encode_target(y, index):
    """Integer-encode the labels and attach them positionally to *index*."""
    y_ser = pd.Series(y)
    if pd.api.types.is_numeric_dtype(y_ser):
        codes = y_ser.astype(int).to_numpy()
    else:
        # Covers object, categorical and pandas string dtypes (e.g. 'ALL'/'AML' -> 0/1).
        codes = LabelEncoder().fit_transform(y_ser)
    # `codes` is a plain array, so the pairing with `index` is by position, not by label.
    return pd.Series(codes, index=index, name='target')


def fetch_leukemia_dataset(min_instances=60, min_features=1000):
    """
    Search OpenML for a leukemia-like gene-expression dataset (binary) and load it.

    The listing is filtered to datasets whose name contains 'leukemia'
    (case-insensitive), that have exactly two classes, at least
    `min_instances` instances and at least `min_features` features. Of the
    matches, the one with the most instances is downloaded.

    After loading, features are coerced to numeric (unparseable values become
    NaN), columns that are entirely NaN are dropped, and then every row that
    still contains a NaN is dropped together with its label.

    Returns: X (DataFrame), y (integer-encoded Series named 'target', sharing
    X's index), ds_meta (OpenML dataset object)

    Raises: RuntimeError if no dataset matches the filters or the selected
    dataset yields no target values.
    """
    print("Searching OpenML for candidate leukemia datasets (this may take a few seconds)...")
    datasets = openml.datasets.list_datasets(output_format="dataframe")

    # Filter heuristics: name contains 'leukemia', binary class, enough instances and many features
    is_candidate = (
        datasets['name'].str.contains('leukemia', case=False, na=False)
        & (datasets['NumberOfClasses'] == 2)
        & (datasets['NumberOfInstances'] >= min_instances)
        & (datasets['NumberOfFeatures'] >= min_features)
    )
    candidates = datasets[is_candidate].sort_values(by='NumberOfInstances', ascending=False)

    if candidates.empty:
        raise RuntimeError(
            "No suitable candidate found on OpenML with those filters. "
            "You can relax min_instances/min_features or search manually on openml.org."
        )

    best = candidates.iloc[0]
    did = int(best['did'])
    ds = openml.datasets.get_dataset(did)
    print(f"Selected dataset: {best['name']} (did={did})")

    # ds.get_data may return (X, y, categorical_indicator, attribute_names)
    X, y, _categorical_indicator, attribute_names = ds.get_data(
        target=ds.default_target_attribute
    )
    if y is None:
        # Without labels every row would otherwise be silently discarded further down.
        raise RuntimeError(
            f"Dataset did={did} returned no target values; "
            "it may not define a default target attribute."
        )

    X_df = _as_feature_frame(X, attribute_names)
    y_enc = _encode_target(y, X_df.index)

    # Ensure numeric features (coerce non-numeric to numeric, drop columns that are all NaN)
    X_df = X_df.apply(pd.to_numeric, errors='coerce')
    nan_cols = X_df.columns[X_df.isna().all()].tolist()
    if nan_cols:
        print(f"Warning: dropping {len(nan_cols)} columns that are entirely NaN.")
        X_df = X_df.drop(columns=nan_cols)

    # Drop rows with any NaNs. A positional boolean mask is applied to both X
    # and y so they stay paired without an index join and without reserving
    # the column name 'target' inside the feature frame.
    complete = X_df.notna().all(axis=1).to_numpy()
    n_dropped = int((~complete).sum())
    if n_dropped:
        print(f"Dropping {n_dropped} rows due to NaNs.")
    X_df = X_df[complete]
    y_enc = y_enc[complete]

    return X_df, y_enc, ds

def train_and_evaluate(X, y, random_state=42):
    """
    Evaluate a standardize-then-logistic-regression pipeline and return it fitted.

    Steps, in order:
      1. 5-fold stratified, shuffled cross-validation on all of (X, y);
         mean and standard deviation of accuracy are printed.
      2. A stratified 75/25 train/test split; the pipeline is fitted on the
         training part and accuracy plus a classification report for the
         test part are printed.

    `random_state` seeds the classifier, the CV shuffling and the split.

    Returns: (pipeline, feature_names, class_labels) where the pipeline is
    fitted on the training part of the split only, feature_names is the list
    of X's column labels and class_labels is the sorted unique values of y.
    """
    scaler = StandardScaler(with_mean=True, with_std=True)
    classifier = LogisticRegression(
        penalty='l2',
        solver='liblinear',    # liblinear works well for small datasets
        max_iter=5000,
        class_weight='balanced',
        random_state=random_state,
    )
    pipeline = Pipeline([('scaler', scaler), ('clf', classifier)])

    n_samples, n_features = X.shape[0], X.shape[1]
    print(f"Data shape: samples={n_samples}, features={n_features}")

    # --- Cross-validation on the full data set ---
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    print("Running 5-fold stratified cross-validation...")
    fold_acc = cross_val_score(pipeline, X, y, cv=folds, scoring='accuracy', n_jobs=-1)
    print(f"CV accuracy: mean={fold_acc.mean():.4f}, std={fold_acc.std():.4f}")

    # --- Single stratified holdout ---
    split = train_test_split(X, y, test_size=0.25, stratify=y, random_state=random_state)
    X_train, X_test, y_train, y_test = split
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    holdout_acc = accuracy_score(y_test, y_pred)
    print("\nHoldout accuracy:", holdout_acc)
    report = classification_report(y_test, y_pred)
    print("\nClassification report:\n", report)

    feature_names = X.columns.tolist()
    class_labels = np.unique(y)
    return pipeline, feature_names, class_labels

def predict_for_person(model_pipeline, feature_names, class_labels, person_features_dict):
    """
    Predict the class for one sample described by a partial feature dict.

    model_pipeline: object with `predict(frame)`; `predict_proba(frame)` is
        used as well when the object has that attribute.
    feature_names: column names, in order, for the single-row input frame.
    class_labels: labels used as keys of the returned probability dict;
        entry i is paired with probability column i.
    person_features_dict: {feature_name: value, ...}. Values must be
        convertible with float(). Keys not in `feature_names` are reported
        with a printed warning and ignored.

    Missing features are filled with 0. Returns (pred_label, prob_dict or None)

    Raises: ValueError if a supplied value cannot be converted to float; the
    underlying conversion error is chained as `__cause__`.
    """
    # Build DataFrame with a single all-zero row, then overwrite the supplied features
    x_row = pd.DataFrame(np.zeros((1, len(feature_names))), columns=feature_names)
    for name, raw in person_features_dict.items():
        if name not in x_row.columns:
            print(f"Warning: feature '{name}' not found in model feature list; ignoring it.")
            continue
        # Only the conversion is guarded, so unrelated failures are not
        # mislabelled as "not numeric".
        try:
            value = float(raw)
        except (TypeError, ValueError, OverflowError) as err:
            raise ValueError(f"Value for feature {name} is not numeric: {raw}") from err
        x_row.at[0, name] = value

    pred = model_pipeline.predict(x_row)[0]
    proba = None
    if hasattr(model_pipeline, "predict_proba"):
        p_arr = model_pipeline.predict_proba(x_row)[0]
        proba = {str(label): float(p_arr[i]) for i, label in enumerate(class_labels)}
    return pred, proba

if __name__ == "__main__":
    # Script entry point: fetch the data, train/evaluate, then run one demo prediction.
    try:
        print("Downloading leukemia dataset from OpenML...")
        X, y, ds_meta = fetch_leukemia_dataset()
    except Exception as err:
        # Top-level boundary: report the failure and exit with a non-zero status.
        print("Failed to fetch dataset from OpenML:", str(err))
        print(
            "If you are behind a firewall or OpenML is unavailable, "
            "please ensure internet access or provide a local CSV."
        )
        raise SystemExit(1)

    print(f"\nLoaded dataset: {ds_meta.name} (did={ds_meta.dataset_id}).")
    model, feature_names, class_labels = train_and_evaluate(X, y)

    # Demo input: values for the first three features only; every other
    # feature is left at the 0 default applied by predict_for_person.
    example_input = {}
    example_input[feature_names[0]] = 0.5
    example_input[feature_names[1]] = -1.2
    example_input[feature_names[2]] = 2.3

    pred_label, proba = predict_for_person(model, feature_names, class_labels, example_input)
    print("\nPredicted disease class (encoded):", pred_label)
    if proba is not None:
        print("Class probabilities:", proba)


Downloading leukemia dataset from OpenML...
Searching OpenML for candidate leukemia datasets (this may take a few seconds)...
Selected dataset: leukemia (did=1104)

Loaded dataset: leukemia (did=1104).
Data shape: samples=72, features=7129
Running 5-fold stratified cross-validation...
CV accuracy: mean=0.9448, std=0.0504

Holdout accuracy: 0.8333333333333334

Classification report:
               precision    recall  f1-score   support

           0       0.91      0.83      0.87        12
           1       0.71      0.83      0.77         6

    accuracy                           0.83        18
   macro avg       0.81      0.83      0.82        18
weighted avg       0.84      0.83      0.84        18


Predicted disease class (encoded): 0
Class probabilities: {'0': 0.9999906630663873, '1': 9.33693361266593e-06}
