In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib

In [2]:
df = pd.read_csv("cleaned_genetic_disorder_data.csv")


In [3]:
# Split into features and target
X = df.drop(columns=["Genetic Disorder"])
y = df["Genetic Disorder"]

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on validation data
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print(f"✅ Model Accuracy: {accuracy:.2f}")
print("\n📊 Classification Report:\n", report)



✅ Model Accuracy: 0.57

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.85      0.70      2048
           1       0.42      0.21      0.28       409
           2       0.54      0.31      0.40      1531

    accuracy                           0.57      3988
   macro avg       0.52      0.45      0.46      3988
weighted avg       0.56      0.57      0.54      3988



In [4]:
# Save the trained model
joblib.dump(model, "genetic_disease_rf_model.pkl")
print("\n💾 Model saved as genetic_disease_rf_model.pkl")


💾 Model saved as genetic_disease_rf_model.pkl


In [6]:
import pandas as pd

# Load your cleaned training dataset
df = pd.read_csv("cleaned_train_data.csv")

# Display unique labels in the target column
print(df["Genetic Disorder"].value_counts())
print("\nUnique labels:", df["Genetic Disorder"].unique())


Genetic Disorder
0    10202
2     7664
1     2071
Name: count, dtype: int64

Unique labels: [0 1 2]
