In [1]:
import pandas as pd
import numpy as np
# Load the dataset into `df`; the next cell adds its derived columns to this frame.
# NOTE(review): the location is an absolute path inside one Windows user profile,
# so this cell only runs on a machine where that exact file exists.
df = pd.read_csv(
    filepath_or_buffer=r'C:\Users\falih\Documents\Coding\Python\PASD_KostPutraMR\smoking_drinkin_100k.csv',
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pickle

# --- Feature Engineering: Define Disease Risk Columns ---
# Each label is 1 when its rule matches and 0 otherwise. A comparison against a
# missing value evaluates to False, so rows with missing inputs end up labelled 0.

# Sex masks reused by the sex-specific rules; the raw column is compared against
# the literals 'Male' and 'Female'.
is_male = df['sex'].eq('Male')
is_female = df['sex'].eq('Female')

# 1. Hypertension Risk: systolic pressure >= 140 or diastolic pressure >= 90.
high_systolic = df['SBP'].ge(140)
high_diastolic = df['DBP'].ge(90)
df['Hypertension_Risk'] = (high_systolic | high_diastolic).astype(int)

# 2. Diabetes Risk: fasting glucose (BLDS) >= 126 (mg/dL per the original notes).
df['Diabetes_Risk'] = df['BLDS'].ge(126).astype(int)

# 3. High Cholesterol Risk (dyslipidemia): any one of
#    - total cholesterol >= 193, LDL >= 116 or triglyceride >= 150
#    - HDL below 40 for men / below 50 for women
abnormal_lipids = (
    df['tot_chole'].ge(193)
    | df['LDL_chole'].ge(116)
    | df['triglyceride'].ge(150)
)
low_hdl = (is_male & df['HDL_chole'].lt(40)) | (is_female & df['HDL_chole'].lt(50))
df['High_Cholesterol_Risk'] = (abnormal_lipids | low_hdl).astype('int64')

# 4. Anemia Risk: hemoglobin below 13 for men / below 12 for women.
low_hemoglobin = (
    (is_male & df['hemoglobin'].lt(13))
    | (is_female & df['hemoglobin'].lt(12))
)
df['Anemia_Risk'] = low_hemoglobin.astype('int64')


# 5. Fatty Liver Risk (simplified NAFLD rule built from the available columns)
# A row is flagged when BOTH hold:
#   a) at least one liver enzyme is elevated: SGOT_AST > 40, SGOT_ALT > 40 or
#      gamma_GTP > 60 (generic upper limits; reference ranges vary by lab), and
#   b) at least two of these metabolic risk factors are present:
#      - overweight or obese (BMI >= 25; BMI >= 30 is a subset of that)
#      - waistline >= 94 for men / >= 80 for women
#      - Hypertension_Risk, Diabetes_Risk, High_Cholesterol_Risk (computed earlier)

# BMI = weight / height^2; height is divided by 100 before squaring.
df['BMI'] = df['weight'] / (df['height'] / 100) ** 2

# Placeholder so the column is created at this position; the real values are
# assigned once the metabolic count is known.
df['Fatty_Liver_Risk'] = 0

# (a) elevated liver enzymes
elevated_liver_enzymes = (
    df['SGOT_AST'].gt(40)
    | df['SGOT_ALT'].gt(40)
    | df['gamma_GTP'].gt(60)
)

# (b) metabolic risk factors
df['Obesity_Risk'] = df['BMI'].ge(30).astype(int)
df['Overweight_Risk'] = df['BMI'].ge(25).astype(int)

large_waist = (
    (df['sex'].eq('Male') & df['waistline'].ge(94))
    | (df['sex'].eq('Female') & df['waistline'].ge(80))
)
df['High_Waistline_Risk'] = large_waist.astype('int64')

# Obesity and overweight count as a single factor, hence the OR before summing.
excess_weight = (df['Obesity_Risk'] | df['Overweight_Risk']).astype(int)
df['Metabolic_Risk_Count'] = (
    excess_weight
    + df['High_Waistline_Risk']
    + df['Hypertension_Risk']
    + df['Diabetes_Risk']
    + df['High_Cholesterol_Risk']
)

# Final rule: elevated enzymes AND at least two metabolic risk factors.
has_two_factors = df['Metabolic_Risk_Count'].ge(2)
df['Fatty_Liver_Risk'] = (elevated_liver_enzymes & has_two_factors).astype('int64')


# --- Data Preprocessing ---
# Encode the categorical features as 0/1:
#   sex:      Male -> 0, Female -> 1
#   drinking: N -> 0,    Y -> 1
category_codes = {
    'sex': {'Male': 0, 'Female': 1},
    'drinking': {'N': 0, 'Y': 1},
}

# Series.map turns every value missing from the mapping into NaN without
# complaint. That happens for unexpected spellings, and also when this cell is
# re-run after the columns were already encoded (0/1 are not keys of the
# mapping). Validate every column before assigning any of them so a failure
# leaves `df` untouched. Values that were already missing are passed through
# unchanged, as before.
encoded_columns = {}
for column, codes in category_codes.items():
    encoded = df[column].map(codes)
    unexpected = df.loc[encoded.isna() & df[column].notna(), column].unique()
    if len(unexpected):
        raise ValueError(
            f"Column '{column}' contains values outside {list(codes)}: "
            f"{list(unexpected[:5])}. If this cell was already run, reload the "
            f"data and recompute the risk columns before encoding again."
        )
    encoded_columns[column] = encoded

for column, encoded in encoded_columns.items():
    df[column] = encoded

# Drop BMI and the helper columns that only existed to build Fatty_Liver_Risk
df = df.drop(columns=['BMI', 'Obesity_Risk', 'Overweight_Risk', 'High_Waistline_Risk', 'Metabolic_Risk_Count'])

# Target column -> file the trained model for that target is pickled to
disease_risks = {
    'Hypertension_Risk': 'hypertension_risk_model.pkl',
    'Diabetes_Risk': 'diabetes_risk_model.pkl',
    'High_Cholesterol_Risk': 'high_cholesterol_risk_model.pkl',
    'Anemia_Risk': 'anemia_risk_model.pkl',
    'Fatty_Liver_Risk': 'fatty_liver_risk_model.pkl'
}

# Features (X) are every remaining column that is not one of the targets. The
# exclusion list is taken from `disease_risks` so the two cannot drift apart.
feature_columns = [col for col in df.columns if col not in disease_risks]

# Train and save a model for each disease risk
#
# NOTE(review): target leakage. Every label is a fixed threshold rule on columns
# that are still part of `feature_columns` (for example Hypertension_Risk is
# computed from SBP and DBP). A model trained on the full feature set can simply
# re-learn that rule, so the scores printed below measure how well the forest
# reproduces the rule, not how well it predicts risk from independent data.
#
# Set EXCLUDE_LABEL_SOURCES = True to train each model without the measurements
# its own label was derived from. The default (False) keeps the original
# behaviour, so the saved .pkl files keep expecting the full feature set and
# existing consumers of those files are unaffected.
EXCLUDE_LABEL_SOURCES = False

# Measurements each label is computed from. `sex` is deliberately not listed:
# it only selects which cut-off applies and never determines a label by itself.
LABEL_SOURCE_COLUMNS = {
    'Hypertension_Risk': ['SBP', 'DBP'],
    'Diabetes_Risk': ['BLDS'],
    'High_Cholesterol_Risk': ['tot_chole', 'LDL_chole', 'triglyceride', 'HDL_chole'],
    'Anemia_Risk': ['hemoglobin'],
    'Fatty_Liver_Risk': [
        'SGOT_AST', 'SGOT_ALT', 'gamma_GTP',                   # liver enzymes
        'weight', 'height', 'waistline',                       # BMI / waistline factors
        'SBP', 'DBP', 'BLDS',                                  # hypertension / diabetes factors
        'tot_chole', 'LDL_chole', 'triglyceride', 'HDL_chole'  # dyslipidemia factor
    ],
}

for disease, model_filename in disease_risks.items():
    # Columns withheld from this model; empty unless the switch above is on.
    withheld = set(LABEL_SOURCE_COLUMNS.get(disease, ())) if EXCLUDE_LABEL_SOURCES else set()
    model_features = [col for col in feature_columns if col not in withheld]

    X = df[model_features]
    y = df[disease]

    # 70/30 train/test split, stratified on the label so both parts keep the
    # same class balance; fixed seed for a repeatable split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Random forest with default hyper-parameters and a fixed seed
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Report hold-out performance (see the leakage note above before trusting it)
    y_pred = model.predict(X_test)
    print(f"--- {disease} ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))

    # Save the trained model as a .pkl file (path is relative to the working directory)
    with open(model_filename, 'wb') as file:
        pickle.dump(model, file)
    print(f"Model for {disease} saved as {model_filename}")


--- Hypertension_Risk ---
Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     26185
           1       1.00      1.00      1.00      3815

    accuracy                           1.00     30000
   macro avg       1.00      1.00      1.00     30000
weighted avg       1.00      1.00      1.00     30000

Model for Hypertension_Risk saved as hypertension_risk_model.pkl
--- Diabetes_Risk ---
Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27672
           1       1.00      1.00      1.00      2328

    accuracy                           1.00     30000
   macro avg       1.00      1.00      1.00     30000
weighted avg       1.00      1.00      1.00     30000

Model for Diabetes_Risk saved as diabetes_risk_model.pkl
--- High_Cholesterol_Risk ---
Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1