In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Load the CSV
df = pd.read_csv(r'C:/Users/Amani/FASTAPI+ML/patients_data.csv')
df_feat = df.copy()

# Feature 1: BMI (correct height to meters)
df_feat["bmi"] = df_feat["weight_kg"] / ((df_feat["height_cm"] / 100) ** 2)

# Feature 2: Age Group
def age_group(age):
    if age < 25:
        return "young"
    elif age < 45:
        return "adult"
    elif age < 60:
        return "middle_aged"
    return "senior"

df_feat["age_group"] = df_feat["age"].apply(age_group)

# Feature 3: Lifestyle Risk (adapted – no 'smoker', use BMI and condition)
def lifestyle_risk(row):
    if row["bmi"] > 30:
        return "high"
    elif row["bmi"] > 27 or "diabetes" in row["condition"].lower():
        return "medium"
    else:
        return "low"

df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

# Feature 4: City Tier (adjusted to match your data's US cities)
tier_1_cities = ["New York", "Miami", "Seattle"]
tier_2_cities = ["Springfield"]
tier_3_cities = ["Anytown"]  # Default others to 3

def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3

df_feat["city_tier"] = df_feat["city"].apply(city_tier)

# Print some analytics (as in your code)
print(df_feat[['id', 'full_name', 'condition', 'bmi', 'age']])
print("\n--- Patients by Condition ---")
print(df_feat['condition'].value_counts())
print("\n--- Average BMI by Condition ---")
print(df_feat.groupby('condition')['bmi'].mean().round(2))
print("\n--- Average Age by Gender ---")
print(df_feat.groupby('gender')['age'].mean().round(1))

# Define categorical and numeric features (adjusted to existing columns)
categorical_features = ["age_group", "lifestyle_risk", "city_tier", "gender", "condition", "city", "state"]
numeric_features = ["age", "bmi", "height_cm", "weight_kg"]

# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# X and y (drop non-features; assume 'premium_category' exists from previous)
X = df_feat.drop(columns=['premium_category', 'id', 'full_name', 'given_name', 'family_name', 'birthDate', 'full_address', 'contact'])
y = df_feat['premium_category']

# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

X_test.sample(5)

import pickle

# Save the trained pipeline using pickle
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)


            id     full_name                     condition        bmi  age
0  patient-001      John Doe                  Hypertension  26.234568   40
1  patient-002    Jane Smith      Type 2 diabetes mellitus  25.711662   35
2  patient-003  Maria Garcia                        Asthma  25.390625   53
3  patient-004     David Lee                 Low back pain  26.122449   60
4  patient-005   Aisha Patel  Generalized anxiety disorder  20.761246   25

--- Patients by Condition ---
condition
Hypertension                    1
Type 2 diabetes mellitus        1
Asthma                          1
Low back pain                   1
Generalized anxiety disorder    1
Name: count, dtype: int64

--- Average BMI by Condition ---
condition
Asthma                          25.39
Generalized anxiety disorder    20.76
Hypertension                    26.23
Low back pain                   26.12
Type 2 diabetes mellitus        25.71
Name: bmi, dtype: float64

--- Average Age by Gender ---
gender
female    37.7


ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
X_test.sample(5)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
df = pd.read_csv(r'C:/Users/Amani/FASTAPI+ML/patients_data.csv')


df.sample(5)
df['contact'].unique()


df_feat = df.copy()
df_feat.head()

# Feature 1: BMI
df_feat["bmi"] = df_feat["weight_kg"] / (df_feat["height_cm"] ** 2)

# Feature 2: Age Group
def age_group(age):
    if age < 25:
        return "young"
    elif age < 45:
        return "adult"
    elif age < 60:
        return "middle_aged"
    return "senior"

df_feat["age_group"] = df_feat["age"].apply(age_group)

# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
    if row["smoker"] and row["bmi"] > 30:
        return "high"
    elif row["smoker"] or row["bmi"] > 27:
        return "medium"
    else:
        return "low"

# If you just want to use the existing condition column
print(df_feat['condition'].unique())  # See all conditions

# Example: Count patients per condition
condition_counts = df_feat['condition'].value_counts()
print(condition_counts)


import pandas as pd

# Load the flattened CSV
df = pd.read_csv('patients_data.csv')

# Now 'condition' is already a clean string column
print(df[['id', 'full_name', 'condition', 'bmi', 'age']])

# Examples of useful analytics
print("\n--- Patients by Condition ---")
print(df['condition'].value_counts())

print("\n--- Average BMI by Condition ---")
print(df.groupby('condition')['bmi'].mean().round(2))

print("\n--- Average Age by Gender ---")
print(df.groupby('gender')['age'].mean().round(1))

tier_1_cities = ["Dar-es-salaam", "Tanga", "Tabora", "Kigoma", "Bukoba", "Morogoro", "Pwani"]
tier_2_cities = [
    "Isevya", "Mbagala", "Nzega", "Urambo", "Masasi", "Tandahimba", "Kisauni", "Kizimkazi",
    "Mikocheni", "Kawe", "Kigamboni", "Chanika", "Zingiziwa", "Pugu", "Gongo la Mboto", "Igoma", "Karagwe",
    "Paje", "Tandika", "Mtongani", "KwazizAlly", "Uhamiaji", "Kigogo", "Kamata", "Kariakoo",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]   

# Feature 4: City Tier
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3


df_feat["city_tier"] = df_feat["city"].apply(city_tier)

df[['id', 'full_name', 'given_name', 'family_name', 'gender', 'age', 'birthDate', 
    'city', 'state', 'full_address', 'contact', 'condition', 
    'height_cm', 'weight_kg', 'bmi']].sample(5)


X = df_feat[[
'id', 'full_name', 'given_name', 'family_name', 'gender', 'age', 'birthDate',
'city', 'state', 'full_address', 'contact', 'condition',
'height_cm', 'weight_kg', 'bmi', 'age_group'  # ← Added here
]]
y = df_feat["premium_category"]


X

y


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline  # Import the Pipeline class

# Assuming you have defined your pipeline steps, e.g.:
# pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)

# Predict and evaluate
from sklearn.metrics import accuracy_score  # Import accuracy_score if not already

# Assuming pipeline is a fitted Pipeline instance; if not, define it here, e.g.:
# from sklearn.pipeline import Pipeline
# pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])

y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)


# Define categorical and numeric features
categorical_features = ["age_group", "condition", "full_address", "city"]
numeric_features = ["bmi", "weight_kg"]

# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)


# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])


# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)
     

In [None]:
df = pd.read_csv(r'C:/Users/Amani/FASTAPI+ML/patients_data.csv')

In [None]:
df.sample(5)

Unnamed: 0,id,full_name,given_name,family_name,gender,age,birthDate,city,state,full_address,contact,condition,height_cm,weight_kg,bmi,premium_category
4,patient-005,Aisha Patel,Aisha,Patel,female,25,2000-02-14,New York,NY,"202 Maple Ln, New York, NY 10001",aisha.patel@example.com,Generalized anxiety disorder,170,60,20.8,Low
3,patient-004,David Lee,David,Lee,male,60,1965-09-30,Seattle,WA,"101 Pine Rd, Seattle, WA 98101",(206) 456-7890,Low back pain,175,80,26.1,High
2,patient-003,Maria Garcia,Maria,Garcia,female,53,1972-11-05,Miami,FL,"789 Oak Ave, Miami, FL 33101",(305) 789-0123,Asthma,160,65,25.4,Medium
0,patient-001,John Doe,John,Doe,male,40,1985-03-15,Anytown,CA,"123 Main St, Anytown, CA 90210",(555) 123-4567,Hypertension,180,85,26.2,Medium
1,patient-002,Jane Smith,Jane,Smith,female,35,1990-07-22,Springfield,IL,"456 Elm St, Springfield, IL 62701",jane.smith@example.com,Type 2 diabetes mellitus,165,70,25.7,Medium


In [None]:
df['contact'].unique()

array(['(555) 123-4567', 'jane.smith@example.com', '(305) 789-0123',
       '(206) 456-7890', 'aisha.patel@example.com'], dtype=object)

In [None]:
df_feat = df.copy()
df_feat.head()

Unnamed: 0,id,full_name,given_name,family_name,gender,age,birthDate,city,state,full_address,contact,condition,height_cm,weight_kg,bmi,premium_category
0,patient-001,John Doe,John,Doe,male,40,1985-03-15,Anytown,CA,"123 Main St, Anytown, CA 90210",(555) 123-4567,Hypertension,180,85,26.2,Medium
1,patient-002,Jane Smith,Jane,Smith,female,35,1990-07-22,Springfield,IL,"456 Elm St, Springfield, IL 62701",jane.smith@example.com,Type 2 diabetes mellitus,165,70,25.7,Medium
2,patient-003,Maria Garcia,Maria,Garcia,female,53,1972-11-05,Miami,FL,"789 Oak Ave, Miami, FL 33101",(305) 789-0123,Asthma,160,65,25.4,Medium
3,patient-004,David Lee,David,Lee,male,60,1965-09-30,Seattle,WA,"101 Pine Rd, Seattle, WA 98101",(206) 456-7890,Low back pain,175,80,26.1,High
4,patient-005,Aisha Patel,Aisha,Patel,female,25,2000-02-14,New York,NY,"202 Maple Ln, New York, NY 10001",aisha.patel@example.com,Generalized anxiety disorder,170,60,20.8,Low


In [None]:
# Feature 1: BMI
df_feat["bmi"] = df_feat["weight_kg"] / (df_feat["height_cm"] ** 2)

In [None]:
# Feature 2: Age Group
def age_group(age):
    if age < 25:
        return "young"
    elif age < 45:
        return "adult"
    elif age < 60:
        return "middle_aged"
    return "senior"


In [None]:
df_feat["age_group"] = df_feat["age"].apply(age_group)

In [None]:
# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
    if row["smoker"] and row["bmi"] > 30:
        return "high"
    elif row["smoker"] or row["bmi"] > 27:
        return "medium"
    else:
        return "low"

In [None]:
# If you just want to use the existing condition column
print(df_feat['condition'].unique())  # See all conditions

# Example: Count patients per condition
condition_counts = df_feat['condition'].value_counts()
print(condition_counts)

['Hypertension' 'Type 2 diabetes mellitus' 'Asthma' 'Low back pain'
 'Generalized anxiety disorder']
condition
Hypertension                    1
Type 2 diabetes mellitus        1
Asthma                          1
Low back pain                   1
Generalized anxiety disorder    1
Name: count, dtype: int64


In [None]:
import pandas as pd

# Load the flattened CSV
df = pd.read_csv('patients_data.csv')

# Now 'condition' is already a clean string column
print(df[['id', 'full_name', 'condition', 'bmi', 'age']])

# Examples of useful analytics
print("\n--- Patients by Condition ---")
print(df['condition'].value_counts())

print("\n--- Average BMI by Condition ---")
print(df.groupby('condition')['bmi'].mean().round(2))

print("\n--- Average Age by Gender ---")
print(df.groupby('gender')['age'].mean().round(1))

            id     full_name                     condition   bmi  age
0  patient-001      John Doe                  Hypertension  26.2   40
1  patient-002    Jane Smith      Type 2 diabetes mellitus  25.7   35
2  patient-003  Maria Garcia                        Asthma  25.4   53
3  patient-004     David Lee                 Low back pain  26.1   60
4  patient-005   Aisha Patel  Generalized anxiety disorder  20.8   25

--- Patients by Condition ---
condition
Hypertension                    1
Type 2 diabetes mellitus        1
Asthma                          1
Low back pain                   1
Generalized anxiety disorder    1
Name: count, dtype: int64

--- Average BMI by Condition ---
condition
Asthma                          25.4
Generalized anxiety disorder    20.8
Hypertension                    26.2
Low back pain                   26.1
Type 2 diabetes mellitus        25.7
Name: bmi, dtype: float64

--- Average Age by Gender ---
gender
female    37.7
male      50.0
Name: age, dtype: fl

In [None]:
tier_1_cities = ["Dar-es-salaam", "Tanga", "Tabora", "Kigoma", "Bukoba", "Morogoro", "Pwani"]
tier_2_cities = [
    "Isevya", "Mbagala", "Nzega", "Urambo", "Masasi", "Tandahimba", "Kisauni", "Kizimkazi",
    "Mikocheni", "Kawe", "Kigamboni", "Chanika", "Zingiziwa", "Pugu", "Gongo la Mboto", "Igoma", "Karagwe",
    "Paje", "Tandika", "Mtongani", "KwazizAlly", "Uhamiaji", "Kigogo", "Kamata", "Kariakoo",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]   

In [None]:
# Feature 4: City Tier
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3

In [None]:

df_feat["city_tier"] = df_feat["city"].apply(city_tier)

In [None]:
df[['id', 'full_name', 'given_name', 'family_name', 'gender', 'age', 'birthDate', 
    'city', 'state', 'full_address', 'contact', 'condition', 
    'height_cm', 'weight_kg', 'bmi']].sample(5)

Unnamed: 0,id,full_name,given_name,family_name,gender,age,birthDate,city,state,full_address,contact,condition,height_cm,weight_kg,bmi
1,patient-002,Jane Smith,Jane,Smith,female,35,1990-07-22,Springfield,IL,"456 Elm St, Springfield, IL 62701",jane.smith@example.com,Type 2 diabetes mellitus,165,70,25.7
4,patient-005,Aisha Patel,Aisha,Patel,female,25,2000-02-14,New York,NY,"202 Maple Ln, New York, NY 10001",aisha.patel@example.com,Generalized anxiety disorder,170,60,20.8
2,patient-003,Maria Garcia,Maria,Garcia,female,53,1972-11-05,Miami,FL,"789 Oak Ave, Miami, FL 33101",(305) 789-0123,Asthma,160,65,25.4
0,patient-001,John Doe,John,Doe,male,40,1985-03-15,Anytown,CA,"123 Main St, Anytown, CA 90210",(555) 123-4567,Hypertension,180,85,26.2
3,patient-004,David Lee,David,Lee,male,60,1965-09-30,Seattle,WA,"101 Pine Rd, Seattle, WA 98101",(206) 456-7890,Low back pain,175,80,26.1


In [None]:
X = df_feat[[
'id', 'full_name', 'given_name', 'family_name', 'gender', 'age', 'birthDate',
'city', 'state', 'full_address', 'contact', 'condition',
'height_cm', 'weight_kg', 'bmi', 'age_group'  # ← Added here
]]
y = df_feat["premium_category"]

In [None]:
X

Unnamed: 0,id,full_name,given_name,family_name,gender,age,birthDate,city,state,full_address,contact,condition,height_cm,weight_kg,bmi,age_group
0,patient-001,John Doe,John,Doe,male,40,1985-03-15,Anytown,CA,"123 Main St, Anytown, CA 90210",(555) 123-4567,Hypertension,180,85,0.002623,adult
1,patient-002,Jane Smith,Jane,Smith,female,35,1990-07-22,Springfield,IL,"456 Elm St, Springfield, IL 62701",jane.smith@example.com,Type 2 diabetes mellitus,165,70,0.002571,adult
2,patient-003,Maria Garcia,Maria,Garcia,female,53,1972-11-05,Miami,FL,"789 Oak Ave, Miami, FL 33101",(305) 789-0123,Asthma,160,65,0.002539,middle_aged
3,patient-004,David Lee,David,Lee,male,60,1965-09-30,Seattle,WA,"101 Pine Rd, Seattle, WA 98101",(206) 456-7890,Low back pain,175,80,0.002612,senior
4,patient-005,Aisha Patel,Aisha,Patel,female,25,2000-02-14,New York,NY,"202 Maple Ln, New York, NY 10001",aisha.patel@example.com,Generalized anxiety disorder,170,60,0.002076,adult


In [None]:
y

0    Medium
1    Medium
2    Medium
3      High
4       Low
Name: premium_category, dtype: object

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline  # Import the Pipeline class

# Assuming you have defined your pipeline steps, e.g.:
# pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)

AttributeError: module 'sklearn.pipeline' has no attribute 'fit'

In [None]:
# Predict and evaluate
from sklearn.metrics import accuracy_score  # Import accuracy_score if not already

# Assuming pipeline is a fitted Pipeline instance; if not, define it here, e.g.:
# from sklearn.pipeline import Pipeline
# pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])

y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

AttributeError: module 'sklearn.pipeline' has no attribute 'predict'

In [None]:
# Define categorical and numeric features
categorical_features = ["age_group", "condition", "full_address", "city"]
numeric_features = ["bmi", "weight_kg"]

In [None]:
# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)


# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])


# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)
     

In [None]:
# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

In [None]:

# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

In [None]:
# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:

# Define categorical and numeric features
#categorical_features = ["age_group", "lifestyle_risk", "occupation", "city_tier"]
#numeric_features = ["bmi", "income_lpa"]
     

# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)
     

In [None]:
# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)
     

ValueError: Found unknown categories ['middle_aged'] in column 0 during transform

In [None]:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np



# Your JSON data (paste it here)
import json
from datetime import datetime
import csv

# Paste your data as a proper JSON string (lowercase true/false is fine here)
json_data = '''
[
  {
    "resourceType": "Patient",
    "id": "patient-001",
    "active": true,
    "name": [
      {
        "use": "official",
        "family": "Doe",
        "given": ["John"]
      }
    ],
    "gender": "male",
    "birthDate": "1985-03-15",
    "address": [
      {
        "use": "home",
        "line": ["123 Main St"],
        "city": "Anytown",
        "state": "CA",
        "postalCode": "90210",
        "country": "USA"
      }
    ],
    "telecom": [
      {
        "system": "phone",
        "value": "(555) 123-4567",
        "use": "mobile"
      }
    ],
    "conditions": [
      {
        "code": "I10",
        "display": "Hypertension"
      }
    ],
    "height": 180,
    "weight": 85,
    "bmi": 26.2
  },
  {
    "resourceType": "Patient",
    "id": "patient-002",
    "active": true,
    "name": [
      {
        "use": "official",
        "family": "Smith",
        "given": ["Jane"]
      }
    ],
    "gender": "female",
    "birthDate": "1990-07-22",
    "address": [
      {
        "use": "home",
        "line": ["456 Elm St"],
        "city": "Springfield",
        "state": "IL",
        "postalCode": "62701",
        "country": "USA"
      }
    ],
    "telecom": [
      {
        "system": "email",
        "value": "jane.smith@example.com",
        "use": "home"
      }
    ],
    "conditions": [
      {
        "code": "E11",
        "display": "Type 2 diabetes mellitus"
      }
    ],
    "height": 165,
    "weight": 70,
    "bmi": 25.7
  },
  {
    "resourceType": "Patient",
    "id": "patient-003",
    "active": true,
    "name": [
      {
        "use": "official",
        "family": "Garcia",
        "given": ["Maria"]
      }
    ],
    "gender": "female",
    "birthDate": "1972-11-05",
    "address": [
      {
        "use": "home",
        "line": ["789 Oak Ave"],
        "city": "Miami",
        "state": "FL",
        "postalCode": "33101",
        "country": "USA"
      }
    ],
    "telecom": [
      {
        "system": "phone",
        "value": "(305) 789-0123",
        "use": "work"
      }
    ],
    "conditions": [
      {
        "code": "J45",
        "display": "Asthma"
      }
    ],
    "height": 160,
    "weight": 65,
    "bmi": 25.4
  },
  {
    "resourceType": "Patient",
    "id": "patient-004",
    "active": true,
    "name": [
      {
        "use": "official",
        "family": "Lee",
        "given": ["David"]
      }
    ],
    "gender": "male",
    "birthDate": "1965-09-30",
    "address": [
      {
        "use": "home",
        "line": ["101 Pine Rd"],
        "city": "Seattle",
        "state": "WA",
        "postalCode": "98101",
        "country": "USA"
      }
    ],
    "telecom": [
      {
        "system": "phone",
        "value": "(206) 456-7890",
        "use": "home"
      }
    ],
    "conditions": [
      {
        "code": "M54.5",
        "display": "Low back pain"
      }
    ],
    "height": 175,
    "weight": 80,
    "bmi": 26.1
  },
  {
    "resourceType": "Patient",
    "id": "patient-005",
    "active": true,
    "name": [
      {
        "use": "official",
        "family": "Patel",
        "given": ["Aisha"]
      }
    ],
    "gender": "female",
    "birthDate": "2000-02-14",
    "address": [
      {
        "use": "home",
        "line": ["202 Maple Ln"],
        "city": "New York",
        "state": "NY",
        "postalCode": "10001",
        "country": "USA"
      }
    ],
    "telecom": [
      {
        "system": "email",
        "value": "aisha.patel@example.com",
        "use": "home"
      }
    ],
    "conditions": [
      {
        "code": "F41.1",
        "display": "Generalized anxiety disorder"
      }
    ],
    "height": 170,
    "weight": 60,
    "bmi": 20.8
  }
]
'''

# Now parse it safely
patients = json.loads(json_data)

# Flatten and process
rows = []
current_date = datetime(2025, 12, 31)

for p in patients:
    name = p['name'][0]
    given = ' '.join(name['given'])
    family = name['family']
    full_name = f"{given} {family}"
    
    address = p['address'][0]
    full_address = ', '.join(address['line']) + f", {address['city']}, {address['state']} {address['postalCode']}"
    
    telecom = p['telecom'][0] if p['telecom'] else {}
    contact = telecom.get('value', '')
    
    condition = p['conditions'][0] if p['conditions'] else {}
    condition_display = condition.get('display', '')
    
    # Calculate age
    birth_date = datetime.strptime(p['birthDate'], "%Y-%m-%d")
    age = current_date.year - birth_date.year - ((current_date.month, current_date.day) < (birth_date.month, birth_date.day))
    
    rows.append({
        'id': p['id'],
        'full_name': full_name,
        'given_name': given,
        'family_name': family,
        'gender': p['gender'],
        'age': age,
        'birthDate': p['birthDate'],
        'city': address['city'],
        'state': address['state'],
        'full_address': full_address,
        'contact': contact,
        'condition': condition_display,
        'height_cm': p.get('height'),
        'weight_kg': p.get('weight'),
        'bmi': p.get('bmi')
    })

# Save to CSV
with open('patients_data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=rows[0].keys())
    writer.writeheader()
    writer.writerows(rows)

print("CSV saved successfully as 'patients_data.csv'")

patients = json.loads(json_data)

# Flatten and process
rows = []
current_date = datetime(2025, 12, 31)

for p in patients:
    name = p['name'][0]
    given = ' '.join(name['given'])
    family = name['family']
    full_name = f"{given} {family}"
    
    address = p['address'][0]
    full_address = ', '.join(address['line']) + f", {address['city']}, {address['state']} {address['postalCode']}"
    
    telecom = p['telecom'][0] if p['telecom'] else {}
    contact = telecom.get('value', '')
    
    condition = p['conditions'][0] if p['conditions'] else {}
    condition_display = condition.get('display', '')
    
    # Calculate age
    birth_date = datetime.strptime(p['birthDate'], "%Y-%m-%d")
    age = current_date.year - birth_date.year - ((current_date.month, current_date.day) < (birth_date.month, birth_date.day))
    
    rows.append({
        'id': p['id'],
        'full_name': full_name,
        'given_name': given,
        'family_name': family,
        'gender': p['gender'],
        'age': age,
        'birthDate': p['birthDate'],
        'city': address['city'],
        'state': address['state'],
        'full_address': full_address,
        'contact': contact,
        'condition': condition_display,
        'height_cm': p.get('height'),
        'weight_kg': p.get('weight'),
        'bmi': p.get('bmi')
    })

# Save to CSV
with open('patients_data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=rows[0].keys())
    writer.writeheader()
    writer.writerows(rows)

print("CSV saved as patients_data.csv")


import json
from datetime import datetime
import csv

# Your JSON data as a string
json_data = '''
[
  {
    "resourceType": "Patient",
    "id": "patient-001",
    "active": true,
    "name": [
      {
        "use": "official",
        "family": "Doe",
        "given": ["John"]
      }
    ],
    "gender": "male",
    "birthDate": "1985-03-15",
    "address": [
      {
        "use": "home",
        "line": ["123 Main St"],
        "city": "Anytown",
        "state": "CA",
        "postalCode": "90210",
        "country": "USA"
      }
    ],
    "telecom": [
      {
        "system": "phone",
        "value": "(555) 123-4567",
        "use": "mobile"
      }
    ],
    "conditions": [
      {
        "code": "I10",
        "display": "Hypertension"
      }
    ],
    "height": 180,
    "weight": 85,
    "bmi": 26.2
  },
  {
    "resourceType": "Patient",
    "id": "patient-002",
    "active": true,
    "name": [
      {
        "use": "official",
        "family": "Smith",
        "given": ["Jane"]
      }
    ],
    "gender": "female",
    "birthDate": "1990-07-22",
    "address": [
      {
        "use": "home",
        "line": ["456 Elm St"],
        "city": "Springfield",
        "state": "IL",
        "postalCode": "62701",
        "country": "USA"
      }
    ],
    "telecom": [
      {
        "system": "email",
        "value": "jane.smith@example.com",
        "use": "home"
      }
    ],
    "conditions": [
      {
        "code": "E11",
        "display": "Type 2 diabetes mellitus"
      }
    ],
    "height": 165,
    "weight": 70,
    "bmi": 25.7
  },
  {
    "resourceType": "Patient",
    "id": "patient-003",
    "active": true,
    "name": [
      {
        "use": "official",
        "family": "Garcia",
        "given": ["Maria"]
      }
    ],
    "gender": "female",
    "birthDate": "1972-11-05",
    "address": [
      {
        "use": "home",
        "line": ["789 Oak Ave"],
        "city": "Miami",
        "state": "FL",
        "postalCode": "33101",
        "country": "USA"
      }
    ],
    "telecom": [
      {
        "system": "phone",
        "value": "(305) 789-0123",
        "use": "work"
      }
    ],
    "conditions": [
      {
        "code": "J45",
        "display": "Asthma"
      }
    ],
    "height": 160,
    "weight": 65,
    "bmi": 25.4
  },
  {
    "resourceType": "Patient",
    "id": "patient-004",
    "active": true,
    "name": [
      {
        "use": "official",
        "family": "Lee",
        "given": ["David"]
      }
    ],
    "gender": "male",
    "birthDate": "1965-09-30",
    "address": [
      {
        "use": "home",
        "line": ["101 Pine Rd"],
        "city": "Seattle",
        "state": "WA",
        "postalCode": "98101",
        "country": "USA"
      }
    ],
    "telecom": [
      {
        "system": "phone",
        "value": "(206) 456-7890",
        "use": "home"
      }
    ],
    "conditions": [
      {
        "code": "M54.5",
        "display": "Low back pain"
      }
    ],
    "height": 175,
    "weight": 80,
    "bmi": 26.1
  },
  {
    "resourceType": "Patient",
    "id": "patient-005",
    "active": true,
    "name": [
      {
        "use": "official",
        "family": "Patel",
        "given": ["Aisha"]
      }
    ],
    "gender": "female",
    "birthDate": "2000-02-14",
    "address": [
      {
        "use": "home",
        "line": ["202 Maple Ln"],
        "city": "New York",
        "state": "NY",
        "postalCode": "10001",
        "country": "USA"
      }
    ],
    "telecom": [
      {
        "system": "email",
        "value": "aisha.patel@example.com",
        "use": "home"
      }
    ],
    "conditions": [
      {
        "code": "F41.1",
        "display": "Generalized anxiety disorder"
      }
    ],
    "height": 170,
    "weight": 60,
    "bmi": 20.8
  }
]
'''

# Parse JSON
patients = json.loads(json_data)

# Flatten and process
rows = []
current_date = datetime(2025, 12, 31)

for p in patients:
    name = p['name'][0]
    given = ' '.join(name['given'])
    family = name['family']
    full_name = f"{given} {family}"
    
    address = p['address'][0]
    full_address = ', '.join(address['line']) + f", {address['city']}, {address['state']} {address['postalCode']}"
    
    telecom = p['telecom'][0] if p['telecom'] else {}
    contact = telecom.get('value', '')
    
    condition = p['conditions'][0] if p['conditions'] else {}
    condition_display = condition.get('display', '')
    
    # Calculate age
    birth_date = datetime.strptime(p['birthDate'], "%Y-%m-%d")
    age = current_date.year - birth_date.year - ((current_date.month, current_date.day) < (birth_date.month, birth_date.day))
    
    bmi = p.get('bmi', 0)
    
    # Determine premium_category: Low, Medium, or High
    if bmi < 25 and age < 40 and "Hypertension" not in condition_display and "diabetes" not in condition_display.lower():
        premium_category = "Low"
    elif bmi < 30 and age < 60:
        premium_category = "Medium"
    else:
        premium_category = "High"
    
    rows.append({
        'id': p['id'],
        'full_name': full_name,
        'given_name': given,
        'family_name': family,
        'gender': p['gender'],
        'age': age,
        'birthDate': p['birthDate'],
        'city': address['city'],
        'state': address['state'],
        'full_address': full_address,
        'contact': contact,
        'condition': condition_display,
        'height_cm': p.get('height'),
        'weight_kg': p.get('weight'),
        'bmi': bmi,
        'premium_category': premium_category  # ← NEW COLUMN
    })

# Save to CSV
with open('patients_data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=rows[0].keys())
    writer.writeheader()
    writer.writerows(rows)

print("CSV saved successfully as 'patients_data.csv' with premium_category included!")



