In [257]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import pickle

# Load the CSV
df = pd.read_csv(r'C:/Users/Amani/FASTAPI+ML/patients_data.csv')
df_feat = df.copy()
print(df.head(10))
df_feat.info()

# Feature 1: BMI (correct height to meters)
df_feat["bmi"] = df_feat["weight_kg"] / ((df_feat["height_cm"] / 100) ** 2)

# Feature 2: Age Group
def age_group(age):
    if age < 25:
        return "young"
    elif age < 45:
        return "adult"
    elif age < 60:
        return "middle_aged"
    return "senior"

df_feat["age_group"] = df_feat["age"].apply(age_group)

# Feature 3: Lifestyle Risk (using BMI, smoker, and condition with NaN handling)
def lifestyle_risk(row):
    condition = str(row["condition"]).lower() if pd.notna(row["condition"]) else ""
    if row["smoker"] and row["bmi"] > 30:
        return "high"
    elif row["bmi"] > 27 or "diabetes" in condition:
        return "medium"
    else:
        return "low"

df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

# Feature 4: Region Tier (adjusted to match your data's regions)
region_1_regions = ["Dar-es-salaam", "Miami", "Seattle"]
region_2_regions = ["Springfield"]
region_3_regions = ["Anytown"]  # Default others to 3

def region_tier(region):
    if region in region_1_regions:
        return 1
    elif region in region_2_regions:
        return 2
    else:
        return 3

df_feat["region_tier"] = df_feat["region"].apply(region_tier)

# Assign premium category (target)
def assign_premium(row):
    if row['bmi'] >= 30 or row['lifestyle_risk'] == 'medium' or row['smoker']:
        return 'High'
    elif row['bmi'] >= 25:
        return 'Medium'
    else:
        return 'Low'

df_feat['premium_category'] = df_feat.apply(assign_premium, axis=1)

# Print some analytics
print(df_feat[['patient_id', 'gender', 'condition', 'bmi', 'age']])
print("\n--- Patients by Condition ---")
print(df_feat['condition'].value_counts())
print("\n--- Average BMI by Condition ---")
print(df_feat.groupby('condition')['bmi'].mean().round(2))
print("\n--- Average Age by Gender ---")
print(df_feat.groupby('gender')['age'].mean().round(1))

# Prepare features and target
X = df_feat.drop(columns=['patient_id', 'premium_category'])
y = df_feat['premium_category']
print(X.columns)

# Define categorical and numeric features
categorical_features = ['gender', 'region', 'area', 'condition', 'occupation', 'lifestyle_risk', 'age_group']
numeric_features = ['age', 'height_cm', 'weight_kg', 'income_lpa', 'bmi', 'region_tier']

print(set(categorical_features).issubset(X.columns))
print("X columns:\n", list(X.columns))
missing = set(categorical_features) - set(X.columns)
extra = set(X.columns) - set(categorical_features)
print("\nMissing categorical columns:", missing)
print("\nExtra columns in X:", extra)

# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Optional: View some rows from X_test
if len(X_test) >= 5:
    print(X_test.sample(5))
else:
    print(X_test)

# Save the trained pipeline using pickle
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)

    patient_id  age  gender  height_cm  weight_kg         region        area  \
0  patient-001   40    male      180.0       85.0  Dar-es-salaam     Mbagala   
1  patient-002   35  female      165.0       70.0  Dar-es-salaam     Tandika   
2  patient-003   53  female      160.0       65.0  Dar-es-salaam     Msasani   
3  patient-004   60    male      175.0       80.0  Dar-es-salaam   Kigamboni   
4  patient-005   25  female      170.0       60.0  Dar-es-salaam        Kawe   
5  patient-006   25    male       56.0       71.0  Dar-es-salaam    Magomeni   
6  PATIENT-007   34    male      178.0       71.0  Dar-Es-Salaam  Vingunguti   

                      condition  income_lpa  smoker   occupation     bmi  \
0                           NaN        50.0   False  private_job   26.23   
1      Type 2 diabetes mellitus        50.0   False  private_job   25.71   
2                        Asthma        50.0   False  private_job   25.39   
3                 Low back pain        50.0   False    

In [258]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Load the CSV
df = pd.read_csv(r'C:/Users/Amani/FASTAPI+ML/patients_data.csv')
df_feat = df.copy()


df.head(10)
df_feat.info()

# Feature 1: BMI (correct height to meters)
df_feat["bmi"] = df_feat["weight_kg"] / ((df_feat["height_cm"] / 100) ** 2)

# Feature 2: Age Group
def age_group(age):
    if age < 25:
        return "young"
    elif age < 45:
        return "adult"
    elif age < 60:
        return "middle_aged"
    return "senior"

df_feat["age_group"] = df_feat["age"].apply(age_group)

# Feature 3: Lifestyle Risk (adapted – no 'smoker', use BMI and condition)
def lifestyle_risk(row):
    if row["bmi"] > 30:
        return "high"
    elif row["bmi"] > 27 or "diabetes" in row["condition"].lower():
        return "medium"
    else:
        return "low"
    

def lifestyle_risk(row):
    condition = str(row["condition"]).lower() if pd.notna(row["condition"]) else ""
    if row["smoker"] and row["bmi"] > 30:
        return "high"
    elif row["bmi"] > 27 or "diabetes" in condition:
        return "medium"
    else:
        return "low"
    
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1) 

# Feature 4: Region Tier (adjusted to match your data's regions)
region_1_regions = ["Dar-es-salaam", "Miami", "Seattle"]
region_2_regions = ["Springfield"]
region_3_regions = ["Anytown"]  # Default others to 3


def region_tier(region):
    if region in region_1_regions:
        return 1
    elif region in region_2_regions:
        return 2
    else:
        return 3
    
df_feat["region_tier"] = df_feat["region"].apply(region_tier)

def assign_premium(row):
    if row['bmi'] >= 30 or row['lifestyle_risk'] == 'medium' or row['smoker']:
        return 'High'
    elif row['bmi'] >= 25:
        return 'Medium'
    else:
        return 'Low'

df_feat['premium_category'] = df_feat.apply(assign_premium, axis=1)

# Print some analytics (as in your code)
print(df_feat[['patient_id', 'gender', 'condition', 'bmi', 'age']])
print("\n--- Patients by Condition ---")
print(df_feat['condition'].value_counts())
print("\n--- Average BMI by Condition ---")
print(df_feat.groupby('condition')['bmi'].mean().round(2))
print("\n--- Average Age by Gender ---")
print(df_feat.groupby('gender')['age'].mean().round(1))

# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# Create target if needed
# df_feat['premium_category'] = ...

X = df_feat.drop(columns=['patient_id', 'premium_category'])
y = df_feat['premium_category']

#X = pd.get_dummies(X, drop_first=True)

print(X.columns)
# Define categorical and numeric features (adjusted to existing columns)
categorical_features =['gender', 'region', 'area','condition', 'occupation','lifestyle_risk','age_group']
numeric_features =  ['age', 'height_cm', 'weight_kg','income_lpa', 'bmi', 'region_tier']

print(set(categorical_features).issubset(X.columns))

print("X columns:\n", list(X.columns))

missing = set(categorical_features) - set(X.columns)
extra = set(X.columns) - set(categorical_features)

print("\nMissing categorical columns:", missing)
print("\nExtra columns in X:", extra)

# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)


# Predict and evaluate
y_pred = pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



# Optional: Safely sample or view some rows from X_test (replace sample(5) to avoid weights issue)
# if len(X_test) >= 5:
#     print(X_test.sample(5))  # Ensure no 'weights' arg here!
# else:
#     print(X_test)  # Or use head(5) for non-random view: print(X_test.head(5))
import pickle

# Save the trained pipeline using pickle
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   patient_id      7 non-null      object 
 1   age             7 non-null      int64  
 2   gender          7 non-null      object 
 3   height_cm       7 non-null      float64
 4   weight_kg       7 non-null      float64
 5   region          7 non-null      object 
 6   area            7 non-null      object 
 7   condition       5 non-null      object 
 8   income_lpa      7 non-null      float64
 9   smoker          7 non-null      bool   
 10  occupation      7 non-null      object 
 11  bmi             7 non-null      float64
 12  lifestyle_risk  7 non-null      object 
 13  age_group       7 non-null      object 
 14  region_tier     7 non-null      int64  
dtypes: bool(1), float64(4), int64(2), object(8)
memory usage: 923.0+ bytes
    patient_id  gender                     condition      

In [259]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [260]:
# Load the CSV
df = pd.read_csv(r'C:/Users/Amani/FASTAPI+ML/patients_data.csv')
df_feat = df.copy()

In [261]:
df.head(10)

Unnamed: 0,patient_id,age,gender,height_cm,weight_kg,region,area,condition,income_lpa,smoker,occupation,bmi,lifestyle_risk,age_group,region_tier
0,patient-001,40,male,180.0,85.0,Dar-es-salaam,Mbagala,,50.0,False,private_job,26.23,low,adult,3
1,patient-002,35,female,165.0,70.0,Dar-es-salaam,Tandika,Type 2 diabetes mellitus,50.0,False,private_job,25.71,low,adult,3
2,patient-003,53,female,160.0,65.0,Dar-es-salaam,Msasani,Asthma,50.0,False,private_job,25.39,low,middle_aged,3
3,patient-004,60,male,175.0,80.0,Dar-es-salaam,Kigamboni,Low back pain,50.0,False,retired,26.12,low,senior,3
4,patient-005,25,female,170.0,60.0,Dar-es-salaam,Kawe,Generalized anxiety disorder,50.0,False,private_job,20.76,low,adult,3
5,patient-006,25,male,56.0,71.0,Dar-es-salaam,Magomeni,,2.5,False,private_job,226.4,medium,adult,3
6,PATIENT-007,34,male,178.0,71.0,Dar-Es-Salaam,Vingunguti,Hypertension,62.4,True,retired,22.41,medium,adult,2


In [262]:
df_feat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   patient_id      7 non-null      object 
 1   age             7 non-null      int64  
 2   gender          7 non-null      object 
 3   height_cm       7 non-null      float64
 4   weight_kg       7 non-null      float64
 5   region          7 non-null      object 
 6   area            7 non-null      object 
 7   condition       5 non-null      object 
 8   income_lpa      7 non-null      float64
 9   smoker          7 non-null      bool   
 10  occupation      7 non-null      object 
 11  bmi             7 non-null      float64
 12  lifestyle_risk  7 non-null      object 
 13  age_group       7 non-null      object 
 14  region_tier     7 non-null      int64  
dtypes: bool(1), float64(4), int64(2), object(8)
memory usage: 923.0+ bytes


In [263]:
# Feature 1: BMI (correct height to meters)
df_feat["bmi"] = df_feat["weight_kg"] / ((df_feat["height_cm"] / 100) ** 2)

In [264]:
# Feature 2: Age Group
def age_group(age):
    if age < 25:
        return "young"
    elif age < 45:
        return "adult"
    elif age < 60:
        return "middle_aged"
    return "senior"

In [265]:
df_feat["age_group"] = df_feat["age"].apply(age_group)

In [266]:
# Feature 3: Lifestyle Risk (adapted – no 'smoker', use BMI and condition)
def lifestyle_risk(row):
    if row["bmi"] > 30:
        return "high"
    elif row["bmi"] > 27 or "diabetes" in row["condition"].lower():
        return "medium"
    else:
        return "low"

In [267]:
def lifestyle_risk(row):
    condition = str(row["condition"]).lower() if pd.notna(row["condition"]) else ""
    if row["smoker"] and row["bmi"] > 30:
        return "high"
    elif row["bmi"] > 27 or "diabetes" in condition:
        return "medium"
    else:
        return "low"

In [268]:
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

In [269]:
# Feature 4: Region Tier (adjusted to match your data's regions)
region_1_regions = ["Dar-es-salaam", "Miami", "Seattle"]
region_2_regions = ["Springfield"]
region_3_regions = ["Anytown"]  # Default others to 3

In [270]:
def region_tier(region):
    if region in region_1_regions:
        return 1
    elif region in region_2_regions:
        return 2
    else:
        return 3

In [271]:
df_feat["region_tier"] = df_feat["region"].apply(region_tier)

def assign_premium(row):
    if row['bmi'] >= 30 or row['lifestyle_risk'] == 'medium' or row['smoker']:
        return 'High'
    elif row['bmi'] >= 25:
        return 'Medium'
    else:
        return 'Low'

df_feat['premium_category'] = df_feat.apply(assign_premium, axis=1)

# Print some analytics (as in your code)
print(df_feat[['patient_id', 'gender', 'condition', 'bmi', 'age']])
print("\n--- Patients by Condition ---")
print(df_feat['condition'].value_counts())
print("\n--- Average BMI by Condition ---")
print(df_feat.groupby('condition')['bmi'].mean().round(2))
print("\n--- Average Age by Gender ---")
print(df_feat.groupby('gender')['age'].mean().round(1))

# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# Create target if needed
# df_feat['premium_category'] = ...

X = df_feat.drop(columns=['patient_id', 'premium_category'])
y = df_feat['premium_category']

#X = pd.get_dummies(X, drop_first=True)

print(X.columns)
# Define categorical and numeric features (adjusted to existing columns)
categorical_features =['gender', 'region', 'area','condition', 'occupation','lifestyle_risk','age_group']
numeric_features =  ['age', 'height_cm', 'weight_kg','income_lpa', 'bmi', 'region_tier']

print(set(categorical_features).issubset(X.columns))

print("X columns:\n", list(X.columns))

missing = set(categorical_features) - set(X.columns)
extra = set(X.columns) - set(categorical_features)

print("\nMissing categorical columns:", missing)
print("\nExtra columns in X:", extra)

# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)


# Predict and evaluate
y_pred = pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



# Optional: Safely sample or view some rows from X_test (replace sample(5) to avoid weights issue)
# if len(X_test) >= 5:
#     print(X_test.sample(5))  # Ensure no 'weights' arg here!
# else:
#     print(X_test)  # Or use head(5) for non-random view: print(X_test.head(5))
import pickle

# Save the trained pipeline using pickle
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)



    patient_id  gender                     condition         bmi  age
0  patient-001    male                           NaN   26.234568   40
1  patient-002  female      Type 2 diabetes mellitus   25.711662   35
2  patient-003  female                        Asthma   25.390625   53
3  patient-004    male                 Low back pain   26.122449   60
4  patient-005  female  Generalized anxiety disorder   20.761246   25
5  patient-006    male                           NaN  226.403061   25
6  PATIENT-007    male                  Hypertension   22.408787   34

--- Patients by Condition ---
condition
Type 2 diabetes mellitus        1
Asthma                          1
Low back pain                   1
Generalized anxiety disorder    1
Hypertension                    1
Name: count, dtype: int64

--- Average BMI by Condition ---
condition
Asthma                          25.39
Generalized anxiety disorder    20.76
Hypertension                    22.41
Low back pain                   26.12
Type 2 

0.0
              precision    recall  f1-score   support

        High       0.00      0.00      0.00       1.0
      Medium       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



In [272]:
df_feat["region_tier"] = df_feat["region"].apply(region_tier)

In [273]:
def assign_premium(row):
    if row['bmi'] >= 30 or row['lifestyle_risk'] == 'medium' or row['smoker']:
        return 'High'
    elif row['bmi'] >= 25:
        return 'Medium'
    else:
        return 'Low'

df_feat['premium_category'] = df_feat.apply(assign_premium, axis=1)


In [274]:
# Print some analytics (as in your code)
print(df_feat[['patient_id', 'gender', 'condition', 'bmi', 'age']])
print("\n--- Patients by Condition ---")
print(df_feat['condition'].value_counts())
print("\n--- Average BMI by Condition ---")
print(df_feat.groupby('condition')['bmi'].mean().round(2))
print("\n--- Average Age by Gender ---")
print(df_feat.groupby('gender')['age'].mean().round(1))

    patient_id  gender                     condition         bmi  age
0  patient-001    male                           NaN   26.234568   40
1  patient-002  female      Type 2 diabetes mellitus   25.711662   35
2  patient-003  female                        Asthma   25.390625   53
3  patient-004    male                 Low back pain   26.122449   60
4  patient-005  female  Generalized anxiety disorder   20.761246   25
5  patient-006    male                           NaN  226.403061   25
6  PATIENT-007    male                  Hypertension   22.408787   34

--- Patients by Condition ---
condition
Type 2 diabetes mellitus        1
Asthma                          1
Low back pain                   1
Generalized anxiety disorder    1
Hypertension                    1
Name: count, dtype: int64

--- Average BMI by Condition ---
condition
Asthma                          25.39
Generalized anxiety disorder    20.76
Hypertension                    22.41
Low back pain                   26.12
Type 2 

In [275]:
# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

In [276]:
# Create target if needed
# df_feat['premium_category'] = ...

X = df_feat.drop(columns=['patient_id', 'premium_category'])
y = df_feat['premium_category']

#X = pd.get_dummies(X, drop_first=True)


In [277]:
print(X.columns)

Index(['age', 'gender', 'height_cm', 'weight_kg', 'region', 'area',
       'condition', 'income_lpa', 'smoker', 'occupation', 'bmi',
       'lifestyle_risk', 'age_group', 'region_tier'],
      dtype='object')


In [278]:
# Define categorical and numeric features (adjusted to existing columns)
categorical_features =['gender', 'region', 'area','condition', 'occupation','lifestyle_risk','age_group']
numeric_features =  ['age', 'height_cm', 'weight_kg','income_lpa', 'bmi', 'region_tier']


In [279]:
print(set(categorical_features).issubset(X.columns))

True


In [280]:
print("X columns:\n", list(X.columns))

missing = set(categorical_features) - set(X.columns)
extra = set(X.columns) - set(categorical_features)

print("\nMissing categorical columns:", missing)
print("\nExtra columns in X:", extra)


X columns:
 ['age', 'gender', 'height_cm', 'weight_kg', 'region', 'area', 'condition', 'income_lpa', 'smoker', 'occupation', 'bmi', 'lifestyle_risk', 'age_group', 'region_tier']

Missing categorical columns: set()

Extra columns in X: {'region_tier', 'smoker', 'weight_kg', 'bmi', 'income_lpa', 'age', 'height_cm'}


In [281]:
# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)


# Predict and evaluate
y_pred = pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.0
              precision    recall  f1-score   support

        High       0.00      0.00      0.00       1.0
      Medium       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



In [282]:

# Optional: Safely sample or view some rows from X_test (replace sample(5) to avoid weights issue)
# if len(X_test) >= 5:
#     print(X_test.sample(5))  # Ensure no 'weights' arg here!
# else:
#     print(X_test)  # Or use head(5) for non-random view: print(X_test.head(5))
import pickle

# Save the trained pipeline using pickle
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)