In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error
import pandas as pd
import numpy as np
import joblib

In [None]:
# Read the genomic CSV into a DataFrame; every later cell derives from `df`.
DATA_PATH = "/data/genomic_data_500.csv"
df = pd.read_csv(DATA_PATH)


In [None]:
# One independent LabelEncoder per categorical column, so each column keeps
# its own string -> integer mapping.
gene_enc, geno_enc, id_enc, disease_enc = (LabelEncoder() for _ in range(4))

In [None]:
# Fit every encoder on its source column and write the integer codes into a
# sibling "*_ENC" column. The tuple order fixes the order in which the new
# columns are appended to `df`.
for source_col, encoded_col, encoder in (
    ("GENE", "GENE_ENC", gene_enc),
    ("GENOTYPE", "GENOTYPE_ENC", geno_enc),
    ("ID", "ID_ENC", id_enc),
    ("DISEASE", "DISEASE_ENC", disease_enc),
):
    df[encoded_col] = encoder.fit_transform(df[source_col])


In [None]:
# Feature matrix: the two raw columns plus the three encoded categoricals.
# NOTE(review): ID_ENC looks like a per-record identifier - confirm it is a
# meaningful predictor rather than a leakage/noise column.
feature_columns = ["CHROM", "POS", "GENE_ENC", "GENOTYPE_ENC", "ID_ENC"]
X = df[feature_columns]

# Targets: encoded disease label (classification) and risk score (regression).
y_class = df["DISEASE_ENC"]
y_reg = df["RISK_SCORE"]


In [None]:
# Split the features and BOTH targets in a single call so that all six
# outputs share exactly the same row partition.
#
# The previous version made a second, non-stratified train_test_split call
# for y_reg. A stratified and a non-stratified split shuffle rows differently
# even with the same random_state, so y_train_reg / y_test_reg did not line
# up with X_train / X_test and the regressor was trained and scored against
# targets belonging to other rows.
X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg = train_test_split(
    X, y_class, y_reg, test_size=0.2, random_state=42, stratify=y_class
)

# Classification pipeline: standardise the features, then fit a random forest
# on the encoded disease label.
clf_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    min_samples_split=4,
    class_weight="balanced",
    random_state=42,
)
clf_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", clf_forest),
    ]
)


In [None]:
# Regression pipeline: same scaler + random-forest layout as the classifier,
# but predicting the continuous risk score.
reg_forest = RandomForestRegressor(
    n_estimators=300,
    max_depth=12,
    random_state=42,
)
reg_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", reg_forest),
    ]
)


In [None]:
# --- Classifier: fit on the training split, score accuracy on the test split
clf_pipeline.fit(X_train, y_train_cls)
y_pred_cls = clf_pipeline.predict(X_test)
cls_acc = accuracy_score(y_test_cls, y_pred_cls)

# --- Regressor: fit on the training split, score RMSE on the test split
reg_pipeline.fit(X_train, y_train_reg)
y_pred_reg = reg_pipeline.predict(X_test)
reg_mse = mean_squared_error(y_test_reg, y_pred_reg)
reg_rmse = np.sqrt(reg_mse)  # RMSE is in the same units as RISK_SCORE

# Report both hold-out metrics
print(f"🎯 Classification Accuracy: {cls_acc:.3f}")
print(f"📈 Regression RMSE: {reg_rmse:.3f}")


In [None]:
# Persist the two fitted pipelines and the label encoders to the current
# working directory. The encoders are bundled into one dict so they can be
# loaded together with a single joblib.load call.
encoder_bundle = {
    "gene_enc": gene_enc,
    "geno_enc": geno_enc,
    "id_enc": id_enc,
    "disease_enc": disease_enc,
}

for artifact, filename in (
    (clf_pipeline, "disease_classifier.pkl"),
    (reg_pipeline, "risk_regressor.pkl"),
    (encoder_bundle, "encoders.pkl"),
):
    joblib.dump(artifact, filename)

print("✅ Models and encoders saved successfully!")
