<a href="https://colab.research.google.com/github/Dikchik9100/genome/blob/main/Untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

!pip install scikit-learn openml pandas numpy

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-1.0.2-py3-none-any.whl.metadata (15 kB)
Collecting minio (from openml)
  Downloading minio-7.2.18-py3-none-any.whl.metadata (6.5 kB)
Collecting pycryptodome (from minio->openml)
  Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.18-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-1.0.2-py3-none-any.whl (13 kB)
Downloading pycryptodome-3.23

In [2]:

import openml
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

def fetch_leukemia_dataset():
    """Locate and download a binary leukemia gene-expression dataset from OpenML.

    The OpenML dataset listing is filtered to entries whose name contains
    "leukemia" (case-insensitive) and that have exactly 2 classes, at least
    60 instances and more than 1000 features. Among the matches, the one with
    the most instances is downloaded.

    Returns:
        tuple: ``(X, y, ds)`` where ``X`` is a ``pandas.DataFrame`` of features,
        ``y`` is a ``pandas.Series`` named ``'target'`` holding the labels, and
        ``ds`` is the OpenML dataset object returned by ``get_dataset``.

    Raises:
        RuntimeError: If no dataset matches the filter, or if the selected
            dataset declares no default target attribute (labels could not be
            separated from the features).
    """
    # Find a leukemia gene-expression dataset (binary ALL vs AML)
    datasets = openml.datasets.list_datasets(output_format="dataframe")
    candidates = datasets[
        datasets['name'].str.contains('leukemia', case=False, na=False)
        & (datasets['NumberOfClasses'] == 2)
        & (datasets['NumberOfInstances'] >= 60)
        & (datasets['NumberOfFeatures'] > 1000)  # gene-expression typically thousands of probes
    ].sort_values(by='NumberOfInstances', ascending=False)

    if candidates.empty:
        raise RuntimeError("Could not find a suitable leukemia dataset on OpenML.")

    did = int(candidates.iloc[0]['did'])
    ds = openml.datasets.get_dataset(did)

    # Without a target name, get_data() cannot split labels from features and
    # y would come back empty; fail here with a clear message instead of
    # letting the error surface later during training.
    target_name = ds.default_target_attribute
    if not target_name:
        raise RuntimeError(
            f"OpenML dataset {did} has no default target attribute; cannot build labels."
        )

    X, y, _, attribute_names = ds.get_data(target=target_name)

    if not isinstance(X, pd.DataFrame):
        # Array-like result: attach column names. If the name list is one longer
        # than the number of columns, the extra trailing entry is assumed to be
        # the target and is dropped (heuristic kept from the original code).
        if len(attribute_names) == X.shape[1] + 1:
            column_names = attribute_names[:-1]
        else:
            column_names = attribute_names
        X = pd.DataFrame(X, columns=column_names)
    # An already-labeled DataFrame is used as-is: re-wrapping it with `columns=`
    # would select/reindex columns by name rather than rename them.

    y = pd.Series(y, name='target')
    return X, y, ds

def train_and_evaluate(X, y, n_splits=5, test_size=0.25, random_state=42):
    """Cross-validate and fit a StandardScaler + L2 logistic-regression baseline.

    Prints the stratified k-fold CV accuracy (mean and std), then fits the
    pipeline on a stratified train split and prints holdout accuracy and a
    classification report for the remaining test split.

    Args:
        X: Feature table; must expose ``.columns`` (e.g. a ``pandas.DataFrame``).
        y: Class labels aligned with the rows of ``X``.
        n_splits: Number of stratified CV folds.
        test_size: Fraction of samples held out for the final report.
        random_state: Seed shared by the CV shuffling, the train/test split
            and the classifier.

    Returns:
        tuple: ``(pipeline, feature_names, class_labels)`` where ``pipeline`` is
        the model fitted on the *train split only*, ``feature_names`` is the
        list of training column names, and ``class_labels`` is the fitted
        classifier's ``classes_`` array, i.e. the labels in the same order as
        the columns of ``pipeline.predict_proba``.
    """
    # Many gene-expression datasets are high-dimensional and low-sample
    # A simple, strong baseline: StandardScaler + L2 Logistic Regression
    pipeline = Pipeline([
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
        ('clf', LogisticRegression(
            penalty='l2',
            solver='liblinear',
            max_iter=5000,
            class_weight='balanced',  # handle any imbalance
            random_state=random_state
        ))
    ])

    # Stratified CV for robust estimate. The scaler lives inside the pipeline,
    # so it is re-fitted on each training fold only.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')
    print(f"CV accuracy: mean={cv_scores.mean():.3f}, std={cv_scores.std():.3f}")

    # Fit final model on train split for a report and prediction demo
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print("\nHoldout accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification report:\n", classification_report(y_test, y_pred))

    # Take the labels from the fitted classifier rather than recomputing them
    # from y: classes_ is what defines the predict_proba column order.
    class_labels = pipeline.named_steps['clf'].classes_
    return pipeline, X_train.columns.tolist(), class_labels

def predict_for_person(model, feature_names, class_labels, person_features_dict):
    """Predict the class, and class probabilities if available, for one person.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        feature_names: Ordered feature names defining the input column layout.
        class_labels: Labels for the ``predict_proba`` columns, in column order.
        person_features_dict: Mapping ``feature_name -> value``. Features that
            are absent from the mapping are left at a raw value of 0 before any
            preprocessing the model applies, so 0 is not necessarily a neutral
            value. Keys that are not in ``feature_names`` are ignored with a
            warning.

    Returns:
        tuple: ``(pred, proba)`` where ``pred`` is the predicted label and
        ``proba`` is a dict ``str(label) -> probability``, or ``None`` when the
        model has no ``predict_proba``.

    Raises:
        ValueError: If the number of ``class_labels`` differs from the number
            of probability columns returned by the model (or if a value cannot
            be converted to ``float``).
    """
    import warnings  # function-scope import: the file's import cell is left untouched

    name_to_idx = {n: i for i, n in enumerate(feature_names)}
    x_row = np.zeros((1, len(feature_names)), dtype=float)
    unknown = []
    for k, v in person_features_dict.items():
        if k in name_to_idx:
            x_row[0, name_to_idx[k]] = float(v)
        else:
            unknown.append(k)

    if unknown:
        # A misspelled probe name would otherwise silently fall back to 0 and
        # still yield a confident-looking prediction.
        warnings.warn(
            f"Ignoring {len(unknown)} feature(s) not known to the model: {unknown[:5]}",
            stacklevel=2,
        )

    # A model fitted on a named DataFrame records feature_names_in_; give it the
    # same named layout so the names can be validated instead of triggering a
    # feature-name mismatch warning.
    if getattr(model, "feature_names_in_", None) is not None:
        x_input = pd.DataFrame(x_row, columns=list(feature_names))
    else:
        x_input = x_row

    pred = model.predict(x_input)[0]
    proba = None
    if hasattr(model, "predict_proba"):
        proba_arr = model.predict_proba(x_input)[0]
        if len(class_labels) != len(proba_arr):
            raise ValueError(
                f"Got {len(class_labels)} class label(s) for {len(proba_arr)} probability column(s)."
            )
        proba = {str(label): float(p) for label, p in zip(class_labels, proba_arr)}

    return pred, proba

if __name__ == "__main__":
    # --- 1. Fetch the data -------------------------------------------------
    print("Downloading leukemia dataset from OpenML...")
    X, y, ds_meta = fetch_leukemia_dataset()
    print(f"Dataset: {ds_meta.name} (did={ds_meta.dataset_id}), samples={X.shape[0]}, features={X.shape[1]}")

    # --- 2. Cross-validate, fit, and report --------------------------------
    model, feature_names, class_labels = train_and_evaluate(X, y)

    # --- 3. Demo prediction for one synthetic person -----------------------
    # Only three probe values are supplied; every other feature keeps the
    # default of 0 used by predict_for_person. With real data, each measured
    # gene-expression value must be keyed by the exact training feature name.
    example_input = {}
    example_input[feature_names[0]] = 0.5
    example_input[feature_names[1]] = -1.2
    example_input[feature_names[2]] = 2.3

    pred_label, proba = predict_for_person(
        model,
        feature_names,
        class_labels,
        example_input,
    )
    print("\nPredicted disease class:", pred_label)
    # proba is None when the model exposes no predict_proba.
    if proba is not None:
        print("Class probabilities:", proba)

Downloading leukemia dataset from OpenML...
Dataset: leukemia (did=1104), samples=72, features=7129
CV accuracy: mean=0.945, std=0.050

Holdout accuracy: 0.8333333333333334

Classification report:
               precision    recall  f1-score   support

         ALL       0.91      0.83      0.87        12
         AML       0.71      0.83      0.77         6

    accuracy                           0.83        18
   macro avg       0.81      0.83      0.82        18
weighted avg       0.84      0.83      0.84        18


Predicted disease class: ALL
Class probabilities: {'ALL': 0.9999906630663873, 'AML': 9.33693361266593e-06}


