# DATA GENERATION

In [26]:
import pandas as pd
import numpy as np
from faker import Faker
import random

fake = Faker()

# Lookup table driving the synthetic patient generator: each chronic condition
# maps to the pools of allergies, surgeries, comorbid disease conditions and
# medications that generate_data() samples from for a patient with that condition.
conditions = {
    "Diabetes Mellitus": {
        "Allergies": ["Penicillin", "Sulfa drugs", "Certain foods (e.g., shellfish, peanuts)", "Latex"],
        "Surgeries": ["Cataract surgery", "Amputation", "Bariatric surgery", "Coronary artery bypass"],
        "Disease Conditions": ["Hypertension", "Diabetic nephropathy", "Diabetic retinopathy", "Peripheral neuropathy"],
        "Medications": ["Metformin", "Insulin", "ACE inhibitors or ARBs", "Statins", "Aspirin"]
    },
    "Hypertension": {
        "Allergies": ["Certain medications (e.g., ACE inhibitors)", "Shellfish", "NSAIDs (in some cases)"],
        "Surgeries": ["Coronary artery bypass", "Renal artery surgery"],
        "Disease Conditions": ["Heart disease", "Stroke", "Kidney disease", "Retinopathy"],
        "Medications": ["ACE inhibitors", "Beta-blockers", "Diuretics", "Calcium channel blockers", "ARBs"]
    },
    "Chronic Obstructive Pulmonary Disease (COPD)": {
        "Allergies": ["Certain inhalers (e.g., beta-agonists)", "Dust", "Pollen", "Mold"],
        "Surgeries": ["Lung volume reduction surgery", "Lung transplant"],
        "Disease Conditions": ["Emphysema", "Chronic bronchitis", "Respiratory infections", "Heart failure"],
        "Medications": ["Inhaled corticosteroids", "Long-acting beta-agonists", "Anticholinergics", "Phosphodiesterase-4 inhibitors", "Mucolytics"]
    },
    "Heart Disease": {
        "Allergies": ["Certain anticoagulants (e.g., warfarin)", "Statins", "Contrast dyes"],
        "Surgeries": ["Angioplasty and stent placement", "Coronary artery bypass grafting (CABG)", "Valve replacement or repair"],
        "Disease Conditions": ["Myocardial infarction", "Heart failure", "Arrhythmias", "Peripheral artery disease"],
        "Medications": ["Statins", "Beta-blockers", "ACE inhibitors", "Antiplatelet agents (e.g., aspirin, clopidogrel)", "Diuretics"]
    },
    "Rheumatoid Arthritis": {
        "Allergies": ["Certain NSAIDs", "Disease-modifying antirheumatic drugs (DMARDs)", "Biologic agents (e.g., TNF inhibitors)"],
        "Surgeries": ["Joint replacement surgery (e.g., knee or hip)", "Synovectomy"],
        "Disease Conditions": ["Osteoporosis", "Joint deformities", "Cardiovascular disease", "Lung disease"],
        "Medications": ["Methotrexate", "Biologics (e.g., TNF inhibitors)", "NSAIDs", "Corticosteroids", "DMARDs"]
    }
}

def generate_data(num_samples, seed=None):
    """Build a synthetic patient table for the medication model.

    For every patient a chronic condition is drawn uniformly from
    ``conditions``; allergies, surgeries, the comorbid disease condition and
    the current medication are then sampled from that condition's pools.
    Names and cities come from the module-level Faker instance ``fake``.

    Args:
        num_samples: Number of patient rows to generate.
        seed: Optional integer. When given, ``random``, NumPy's global RNG and
            ``fake`` are all seeded with it so the same table is produced on
            every run. ``None`` (default) leaves all RNG state untouched.

    Returns:
        pandas.DataFrame with one row per patient and the columns
        patient_id, name, age, location, past_medication_done, allergies,
        chronic_condition, surgeries, disease_condition, patient_condition
        and current_medication. ``allergies`` and ``surgeries`` hold lists.
    """
    if seed is not None:
        # Three independent random sources are used below; all of them must be
        # seeded for the output to be reproducible. This resets global state.
        random.seed(seed)
        np.random.seed(seed)
        fake.seed_instance(seed)

    condition_names = list(conditions.keys())
    data = []

    for index in range(num_samples):
        chronic_condition = random.choice(condition_names)
        attributes = conditions[chronic_condition]

        record = {
            "patient_id": index + 1,  # 1-based identifier
            "name": fake.name(),
            # np.random.randint excludes the upper bound: ages are 18..79
            "age": np.random.randint(18, 80),
            "location": fake.city(),
            "past_medication_done": random.choice(["Yes", "No"]),
            # randint(1, 3) -> 1 or 2 distinct allergies
            "allergies": np.random.choice(attributes["Allergies"], size=np.random.randint(1, 3), replace=False).tolist(),
            "chronic_condition": chronic_condition,
            # NOTE(review): randint(1, 2) can only return 1, so exactly one
            # surgery is ever drawn; widen the bound if 1-2 was intended.
            "surgeries": np.random.choice(attributes["Surgeries"], size=np.random.randint(1, 2), replace=False).tolist(),
            "disease_condition": random.choice(attributes["Disease Conditions"]),
            "patient_condition": random.choice(["Stable", "Moderate", "Severe", "Improved", "Recovering"]),
            "current_medication": random.choice(attributes["Medications"])
        }

        data.append(record)

    return pd.DataFrame(data)

# Generate 5000 samples and display the resulting DataFrame
df = generate_data(5000)
df

Unnamed: 0,patient_id,name,age,location,past_medication_done,allergies,chronic_condition,surgeries,disease_condition,patient_condition,current_medication
0,1,Tara Pena,68,Garymouth,Yes,[Pollen],Chronic Obstructive Pulmonary Disease (COPD),[Lung transplant],Heart failure,Recovering,Inhaled corticosteroids
1,2,Tony Wilson,49,Charlesville,Yes,"[Pollen, Dust]",Chronic Obstructive Pulmonary Disease (COPD),[Lung transplant],Emphysema,Severe,Phosphodiesterase-4 inhibitors
2,3,Frank Parker,28,Port Jessica,No,[Disease-modifying antirheumatic drugs (DMARDs)],Rheumatoid Arthritis,"[Joint replacement surgery (e.g., knee or hip)]",Cardiovascular disease,Stable,NSAIDs
3,4,Timothy Kennedy,39,South Andrea,Yes,[Statins],Heart Disease,[Coronary artery bypass grafting (CABG)],Peripheral artery disease,Recovering,Statins
4,5,Emily Craig,21,Port Carolinemouth,Yes,[Contrast dyes],Heart Disease,[Valve replacement or repair],Arrhythmias,Recovering,"Antiplatelet agents (e.g., aspirin, clopidogrel)"
...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,Stephen Alvarado,21,North Anthony,Yes,[Latex],Diabetes Mellitus,[Amputation],Diabetic retinopathy,Recovering,ACE inhibitors or ARBs
4996,4997,Thomas Vasquez,21,Boothfurt,No,[Mold],Chronic Obstructive Pulmonary Disease (COPD),[Lung transplant],Heart failure,Severe,Inhaled corticosteroids
4997,4998,David Bauer,52,West Joshua,No,"[Certain medications (e.g., ACE inhibitors), N...",Hypertension,[Renal artery surgery],Kidney disease,Improved,ACE inhibitors
4998,4999,Chad Horn,46,Tamaramouth,Yes,"[Certain medications (e.g., ACE inhibitors)]",Hypertension,[Coronary artery bypass],Retinopathy,Severe,Diuretics


In [27]:
# Persist the generated patients as JSON Lines (one record per line)
df.to_json(
    path_or_buf='patient_data_with_medication.json',
    orient='records',
    lines=True,
)
print("Data generation complete and saved as JSON.")

Data generation complete and saved as JSON.


### 

#  Medication Prediction Model Evaluation

### 

In [93]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
import joblib

# Reload the medication dataset from the JSON Lines file written earlier
# (one record per line) and display it
df = pd.read_json('patient_data_with_medication.json', orient='records', lines=True)
df

Unnamed: 0,patient_id,name,age,location,past_medication_done,allergies,chronic_condition,surgeries,disease_condition,patient_condition,current_medication
0,1,Tara Pena,68,Garymouth,Yes,[Pollen],Chronic Obstructive Pulmonary Disease (COPD),[Lung transplant],Heart failure,Recovering,Inhaled corticosteroids
1,2,Tony Wilson,49,Charlesville,Yes,"[Pollen, Dust]",Chronic Obstructive Pulmonary Disease (COPD),[Lung transplant],Emphysema,Severe,Phosphodiesterase-4 inhibitors
2,3,Frank Parker,28,Port Jessica,No,[Disease-modifying antirheumatic drugs (DMARDs)],Rheumatoid Arthritis,"[Joint replacement surgery (e.g., knee or hip)]",Cardiovascular disease,Stable,NSAIDs
3,4,Timothy Kennedy,39,South Andrea,Yes,[Statins],Heart Disease,[Coronary artery bypass grafting (CABG)],Peripheral artery disease,Recovering,Statins
4,5,Emily Craig,21,Port Carolinemouth,Yes,[Contrast dyes],Heart Disease,[Valve replacement or repair],Arrhythmias,Recovering,"Antiplatelet agents (e.g., aspirin, clopidogrel)"
...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,Stephen Alvarado,21,North Anthony,Yes,[Latex],Diabetes Mellitus,[Amputation],Diabetic retinopathy,Recovering,ACE inhibitors or ARBs
4996,4997,Thomas Vasquez,21,Boothfurt,No,[Mold],Chronic Obstructive Pulmonary Disease (COPD),[Lung transplant],Heart failure,Severe,Inhaled corticosteroids
4997,4998,David Bauer,52,West Joshua,No,"[Certain medications (e.g., ACE inhibitors), N...",Hypertension,[Renal artery surgery],Kidney disease,Improved,ACE inhibitors
4998,4999,Chad Horn,46,Tamaramouth,Yes,"[Certain medications (e.g., ACE inhibitors)]",Hypertension,[Coronary artery bypass],Retinopathy,Severe,Diuretics


In [94]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define a function to preprocess the data
def preprocess_data(df, columns_to_drop=('location', 'name')):
    """Return a modelling-ready copy of a patient DataFrame.

    The ``allergies`` and ``surgeries`` columns are flattened: list or
    ndarray cells become a single comma-joined string, any other cell is left
    as it is. Columns named in ``columns_to_drop`` are then removed when
    present; missing ones are ignored.

    The input frame is never modified. The original implementation wrote the
    joined strings back into the caller's DataFrame before dropping columns.

    Args:
        df: DataFrame containing at least ``allergies`` and ``surgeries``.
        columns_to_drop: Names of columns to remove if they exist.

    Returns:
        A new DataFrame with flattened list columns and without the dropped
        columns.

    Raises:
        KeyError: If ``allergies`` or ``surgeries`` is missing.
    """
    # Work on a copy so the caller's frame keeps its original list cells
    df = df.copy()

    def _join_items(value):
        # Only list-like cells are joined; plain strings pass through
        return ','.join(value) if isinstance(value, (list, np.ndarray)) else value

    for column in ('allergies', 'surgeries'):
        df[column] = df[column].apply(_join_items)

    # Drop unnecessary columns for modeling, but only if they exist
    present = [col for col in columns_to_drop if col in df.columns]
    return df.drop(columns=present)

# Preprocess the data
df_preprocessed = preprocess_data(df)

# Define feature columns and target column
target_column = 'current_medication'
feature_columns = [col for col in df_preprocessed.columns if col != target_column]

# Separate features and target
# NOTE(review): patient_id is still among the features here (only location
# and name are dropped); it is a row identifier, so consider excluding it.
X = df_preprocessed[feature_columns]
y = df_preprocessed[target_column]

# Encoding pipeline
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4. Try the new name first
# and fall back to the old one so the notebook runs on either side.
try:
    categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Create preprocessing pipelines: numeric columns pass through unchanged,
# categorical columns are one-hot encoded (unseen categories encode to zeros)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', categorical_encoder, categorical_features)
    ]
)

# Create a pipeline that includes preprocessing and the classifier
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])


In [95]:
# Display the feature matrix
X

Unnamed: 0,patient_id,age,past_medication_done,allergies,chronic_condition,surgeries,disease_condition,patient_condition
0,1,68,Yes,Pollen,Chronic Obstructive Pulmonary Disease (COPD),Lung transplant,Heart failure,Recovering
1,2,49,Yes,"Pollen,Dust",Chronic Obstructive Pulmonary Disease (COPD),Lung transplant,Emphysema,Severe
2,3,28,No,Disease-modifying antirheumatic drugs (DMARDs),Rheumatoid Arthritis,"Joint replacement surgery (e.g., knee or hip)",Cardiovascular disease,Stable
3,4,39,Yes,Statins,Heart Disease,Coronary artery bypass grafting (CABG),Peripheral artery disease,Recovering
4,5,21,Yes,Contrast dyes,Heart Disease,Valve replacement or repair,Arrhythmias,Recovering
...,...,...,...,...,...,...,...,...
4995,4996,21,Yes,Latex,Diabetes Mellitus,Amputation,Diabetic retinopathy,Recovering
4996,4997,21,No,Mold,Chronic Obstructive Pulmonary Disease (COPD),Lung transplant,Heart failure,Severe
4997,4998,52,No,"Certain medications (e.g., ACE inhibitors),NSA...",Hypertension,Renal artery surgery,Kidney disease,Improved
4998,4999,46,Yes,"Certain medications (e.g., ACE inhibitors)",Hypertension,Coronary artery bypass,Retinopathy,Severe


In [96]:
# (rows, columns) of the feature matrix
X.shape

(5000, 8)

In [97]:
# Display the target series (current_medication)
y

0                                Inhaled corticosteroids
1                         Phosphodiesterase-4 inhibitors
2                                                 NSAIDs
3                                                Statins
4       Antiplatelet agents (e.g., aspirin, clopidogrel)
                              ...                       
4995                              ACE inhibitors or ARBs
4996                             Inhaled corticosteroids
4997                                      ACE inhibitors
4998                                           Diuretics
4999                                      ACE inhibitors
Name: current_medication, Length: 5000, dtype: object

In [98]:
# Split data into training and testing sets: 20% of the rows are held out,
# and random_state=42 fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [99]:
# Display the held-out feature rows
X_test

Unnamed: 0,patient_id,age,past_medication_done,allergies,chronic_condition,surgeries,disease_condition,patient_condition
1501,1502,73,No,Penicillin,Diabetes Mellitus,Coronary artery bypass,Hypertension,Recovering
2586,2587,41,No,Sulfa drugs,Diabetes Mellitus,Coronary artery bypass,Diabetic retinopathy,Improved
2653,2654,36,Yes,Latex,Diabetes Mellitus,Cataract surgery,Hypertension,Recovering
1055,1056,69,No,"Contrast dyes,Certain anticoagulants (e.g., wa...",Heart Disease,Coronary artery bypass grafting (CABG),Myocardial infarction,Recovering
705,706,50,No,"Biologic agents (e.g., TNF inhibitors),Certain...",Rheumatoid Arthritis,"Joint replacement surgery (e.g., knee or hip)",Lung disease,Improved
...,...,...,...,...,...,...,...,...
4711,4712,63,No,"Certain NSAIDs,Disease-modifying antirheumatic...",Rheumatoid Arthritis,Synovectomy,Lung disease,Recovering
2313,2314,57,No,"Sulfa drugs,Penicillin",Diabetes Mellitus,Amputation,Diabetic retinopathy,Improved
3214,3215,35,Yes,Statins,Heart Disease,Angioplasty and stent placement,Arrhythmias,Improved
2732,2733,35,No,"Latex,Penicillin",Diabetes Mellitus,Coronary artery bypass,Peripheral neuropathy,Moderate


In [100]:
# Fit the pipeline on the training split, then score the held-out split.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))



                                                  precision    recall  f1-score   support

                                  ACE inhibitors       0.24      0.20      0.22        80
                          ACE inhibitors or ARBs       0.16      0.26      0.20        34
                                            ARBs       0.22      0.25      0.23        32
                                Anticholinergics       0.23      0.25      0.24        36
Antiplatelet agents (e.g., aspirin, clopidogrel)       0.28      0.30      0.29        46
                                         Aspirin       0.22      0.22      0.22        36
                                   Beta-blockers       0.30      0.30      0.30        79
                Biologics (e.g., TNF inhibitors)       0.20      0.31      0.24        32
                        Calcium channel blockers       0.18      0.17      0.17        35
                                 Corticosteroids       0.21      0.15      0.18        40
         

In [101]:
# Per-class metrics first, then the raw confusion matrix; each heading is
# followed by its body on a new line
print("Medication Prediction Model Evaluation:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Medication Prediction Model Evaluation:
                                                  precision    recall  f1-score   support

                                  ACE inhibitors       0.24      0.20      0.22        80
                          ACE inhibitors or ARBs       0.16      0.26      0.20        34
                                            ARBs       0.22      0.25      0.23        32
                                Anticholinergics       0.23      0.25      0.24        36
Antiplatelet agents (e.g., aspirin, clopidogrel)       0.28      0.30      0.29        46
                                         Aspirin       0.22      0.22      0.22        36
                                   Beta-blockers       0.30      0.30      0.30        79
                Biologics (e.g., TNF inhibitors)       0.20      0.31      0.24        32
                        Calcium channel blockers       0.18      0.17      0.17        35
                                 Corticosteroids       0.21

In [102]:
# Define custom input.
# Category values must be spelled exactly as in the training data (matching is
# case-sensitive). The encoder uses handle_unknown='ignore', so an unseen
# spelling such as 'yes' or 'pollen' raises no error; it is silently encoded
# as all zeros and the feature is effectively dropped.
custom_input = pd.DataFrame({
    'patient_id': [1],
    'age': [68],
    'past_medication_done': ['Yes'],
    'allergies': ['Pollen'],
    'chronic_condition': ['Chronic Obstructive Pulmonary Disease (COPD)'],
    'surgeries': ['Lung transplant'],
    'disease_condition': ['Heart failure'],
    'patient_condition': ['Recovering']
})

# Preprocess custom input using the same preprocessing function
custom_input_preprocessed = preprocess_data(custom_input)

# Predict using the model pipeline
custom_prediction = model_pipeline.predict(custom_input_preprocessed)
print("Custom Input Prediction:", custom_prediction)

Custom Input Prediction: ['Inhaled corticosteroids']


In [103]:
# Save the model (Optional)
# The path is held in one variable so the confirmation message always names
# the file that was actually written.
model_path = 'medication_prediction.pkl'
joblib.dump(model_pipeline, model_path)
print("Model pipeline saved as '{}'".format(model_path))

Model pipeline saved as 'medication_prediction_model_pipeline.pkl'


### 

#  Insurance Prediction Model Evaluation


### 

In [3]:
import pandas as pd
import numpy as np
from faker import Faker
import random

fake = Faker()

# Lookup table driving the synthetic insurance dataset: each chronic condition
# maps to the pools that generate_data() samples allergies, surgeries and
# disease conditions from. The "Medications" pools are not read by the
# generate_data() defined in this cell.
conditions = {
    "Diabetes Mellitus": {
        "Allergies": ["Penicillin", "Sulfa drugs", "Certain foods (e.g., shellfish, peanuts)", "Latex"],
        "Surgeries": ["Cataract surgery", "Amputation", "Bariatric surgery", "Coronary artery bypass"],
        "Disease Conditions": ["Hypertension", "Diabetic nephropathy", "Diabetic retinopathy", "Peripheral neuropathy"],
        "Medications": ["Metformin", "Insulin", "ACE inhibitors or ARBs", "Statins", "Aspirin"]
    },
    "Hypertension": {
        "Allergies": ["Certain medications (e.g., ACE inhibitors)", "Shellfish", "NSAIDs (in some cases)"],
        "Surgeries": ["Coronary artery bypass", "Renal artery surgery"],
        "Disease Conditions": ["Heart disease", "Stroke", "Kidney disease", "Retinopathy"],
        "Medications": ["ACE inhibitors", "Beta-blockers", "Diuretics", "Calcium channel blockers", "ARBs"]
    },
    "Chronic Obstructive Pulmonary Disease (COPD)": {
        "Allergies": ["Certain inhalers (e.g., beta-agonists)", "Dust", "Pollen", "Mold"],
        "Surgeries": ["Lung volume reduction surgery", "Lung transplant"],
        "Disease Conditions": ["Emphysema", "Chronic bronchitis", "Respiratory infections", "Heart failure"],
        "Medications": ["Inhaled corticosteroids", "Long-acting beta-agonists", "Anticholinergics", "Phosphodiesterase-4 inhibitors", "Mucolytics"]
    },
    "Heart Disease": {
        "Allergies": ["Certain anticoagulants (e.g., warfarin)", "Statins", "Contrast dyes"],
        "Surgeries": ["Angioplasty and stent placement", "Coronary artery bypass grafting (CABG)", "Valve replacement or repair"],
        "Disease Conditions": ["Myocardial infarction", "Heart failure", "Arrhythmias", "Peripheral artery disease"],
        "Medications": ["Statins", "Beta-blockers", "ACE inhibitors", "Antiplatelet agents (e.g., aspirin, clopidogrel)", "Diuretics"]
    },
    "Rheumatoid Arthritis": {
        "Allergies": ["Certain NSAIDs", "Disease-modifying antirheumatic drugs (DMARDs)", "Biologic agents (e.g., TNF inhibitors)"],
        "Surgeries": ["Joint replacement surgery (e.g., knee or hip)", "Synovectomy"],
        "Disease Conditions": ["Osteoporosis", "Joint deformities", "Cardiovascular disease", "Lung disease"],
        "Medications": ["Methotrexate", "Biologics (e.g., TNF inhibitors)", "NSAIDs", "Corticosteroids", "DMARDs"]
    }
}

def generate_data(num_samples, seed=None):
    """Build a synthetic patient table for the insurance-plan model.

    For every patient a chronic condition is drawn uniformly from
    ``conditions``; allergies, surgeries and the comorbid disease condition
    are sampled from that condition's pools. ``previous_insurance_claims``
    and ``insurance_plan`` are drawn uniformly and independently of every
    other field, so the target carries no signal from the features.

    Args:
        num_samples: Number of patient rows to generate.
        seed: Optional integer. When given, ``random``, NumPy's global RNG and
            the module-level Faker instance ``fake`` are all seeded with it so
            the same table is produced on every run. ``None`` (default)
            leaves all RNG state untouched.

    Returns:
        pandas.DataFrame with one row per patient and the columns
        patient_id, name, age, location, past_medication_done, allergies,
        chronic_condition, surgeries, disease_condition, patient_condition,
        previous_insurance_claims and insurance_plan. ``allergies`` and
        ``surgeries`` hold lists.
    """
    if seed is not None:
        # Three independent random sources are used below; all of them must be
        # seeded for the output to be reproducible. This resets global state.
        random.seed(seed)
        np.random.seed(seed)
        fake.seed_instance(seed)

    condition_names = list(conditions.keys())
    data = []

    for index in range(num_samples):
        chronic_condition = random.choice(condition_names)
        attributes = conditions[chronic_condition]

        record = {
            "patient_id": index + 1,  # 1-based identifier
            "name": fake.name(),
            # np.random.randint excludes the upper bound: ages are 18..79
            "age": np.random.randint(18, 80),
            "location": fake.city(),
            "past_medication_done": random.choice(["Yes", "No"]),
            # randint(1, 3) -> 1 or 2 distinct allergies
            "allergies": np.random.choice(attributes["Allergies"], size=np.random.randint(1, 3), replace=False).tolist(),
            "chronic_condition": chronic_condition,
            # NOTE(review): randint(1, 2) can only return 1, so exactly one
            # surgery is ever drawn; widen the bound if 1-2 was intended.
            "surgeries": np.random.choice(attributes["Surgeries"], size=np.random.randint(1, 2), replace=False).tolist(),
            "disease_condition": random.choice(attributes["Disease Conditions"]),
            "patient_condition": random.choice(["Stable", "Moderate", "Severe", "Improved", "Recovering"]),
            "previous_insurance_claims": random.choice([0, 1]),  # Single value for each record
            "insurance_plan": random.choice(["Basic", "Standard", "Premium"])  # Single value for each record
        }

        data.append(record)

    return pd.DataFrame(data)

# Generate 7000 samples
df = generate_data(7000)

# Save as JSON Lines (one record per line)
df.to_json('synthetic_insurance.json', orient='records', lines=True)

In [4]:
# Display the generated insurance dataset
df

Unnamed: 0,patient_id,name,age,location,past_medication_done,allergies,chronic_condition,surgeries,disease_condition,patient_condition,previous_insurance_claims,insurance_plan
0,1,Laura Kerr,19,West Danielle,Yes,"[NSAIDs (in some cases), Shellfish]",Hypertension,[Coronary artery bypass],Heart disease,Recovering,1,Basic
1,2,Jennifer Shaffer,62,South Laurachester,Yes,"[Certain foods (e.g., shellfish, peanuts), Pen...",Diabetes Mellitus,[Amputation],Hypertension,Recovering,0,Standard
2,3,Julie Bailey,44,Sarahbury,Yes,"[Certain foods (e.g., shellfish, peanuts), Latex]",Diabetes Mellitus,[Amputation],Diabetic nephropathy,Moderate,0,Premium
3,4,Arthur Jenkins,74,South Terriborough,No,[Pollen],Chronic Obstructive Pulmonary Disease (COPD),[Lung volume reduction surgery],Chronic bronchitis,Moderate,0,Premium
4,5,Wendy Brown,32,South Carl,No,"[NSAIDs (in some cases), Shellfish]",Hypertension,[Coronary artery bypass],Heart disease,Moderate,0,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...
6995,6996,William Mathews,44,Simmonsborough,Yes,"[Certain inhalers (e.g., beta-agonists), Pollen]",Chronic Obstructive Pulmonary Disease (COPD),[Lung volume reduction surgery],Respiratory infections,Moderate,1,Standard
6996,6997,Ryan Long,36,Pageview,Yes,"[NSAIDs (in some cases), Certain medications (...",Hypertension,[Coronary artery bypass],Retinopathy,Improved,1,Basic
6997,6998,Anthony Miller,22,Reyeshaven,No,"[Certain anticoagulants (e.g., warfarin)]",Heart Disease,[Angioplasty and stent placement],Heart failure,Recovering,1,Premium
6998,6999,Thomas Ferrell,20,Christopherberg,No,[Sulfa drugs],Diabetes Mellitus,[Coronary artery bypass],Peripheral neuropathy,Moderate,1,Basic


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Reload the insurance dataset from the JSON Lines file written earlier
# (one record per line) and display it
df = pd.read_json('synthetic_insurance.json', orient='records', lines=True)
df

Unnamed: 0,patient_id,name,age,location,past_medication_done,allergies,chronic_condition,surgeries,disease_condition,patient_condition,previous_insurance_claims,insurance_plan
0,1,Laura Kerr,19,West Danielle,Yes,"[NSAIDs (in some cases), Shellfish]",Hypertension,[Coronary artery bypass],Heart disease,Recovering,1,Basic
1,2,Jennifer Shaffer,62,South Laurachester,Yes,"[Certain foods (e.g., shellfish, peanuts), Pen...",Diabetes Mellitus,[Amputation],Hypertension,Recovering,0,Standard
2,3,Julie Bailey,44,Sarahbury,Yes,"[Certain foods (e.g., shellfish, peanuts), Latex]",Diabetes Mellitus,[Amputation],Diabetic nephropathy,Moderate,0,Premium
3,4,Arthur Jenkins,74,South Terriborough,No,[Pollen],Chronic Obstructive Pulmonary Disease (COPD),[Lung volume reduction surgery],Chronic bronchitis,Moderate,0,Premium
4,5,Wendy Brown,32,South Carl,No,"[NSAIDs (in some cases), Shellfish]",Hypertension,[Coronary artery bypass],Heart disease,Moderate,0,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...
6995,6996,William Mathews,44,Simmonsborough,Yes,"[Certain inhalers (e.g., beta-agonists), Pollen]",Chronic Obstructive Pulmonary Disease (COPD),[Lung volume reduction surgery],Respiratory infections,Moderate,1,Standard
6996,6997,Ryan Long,36,Pageview,Yes,"[NSAIDs (in some cases), Certain medications (...",Hypertension,[Coronary artery bypass],Retinopathy,Improved,1,Basic
6997,6998,Anthony Miller,22,Reyeshaven,No,"[Certain anticoagulants (e.g., warfarin)]",Heart Disease,[Angioplasty and stent placement],Heart failure,Recovering,1,Premium
6998,6999,Thomas Ferrell,20,Christopherberg,No,[Sulfa drugs],Diabetes Mellitus,[Coronary artery bypass],Peripheral neuropathy,Moderate,1,Basic


In [7]:
# Preprocess the data for insurance prediction
def preprocess_insurance_data(df):
    """Return a modelling-ready copy of an insurance patient DataFrame.

    The ``allergies`` and ``surgeries`` columns are flattened: list or
    ndarray cells become a single comma-joined string, any other cell is left
    as it is. The ``location`` and ``name`` columns are removed when present.

    The input frame is never modified. The original implementation wrote the
    joined strings back into the caller's DataFrame before dropping columns.

    Args:
        df: DataFrame containing at least ``allergies`` and ``surgeries``.

    Returns:
        A new DataFrame with flattened list columns and without ``location``
        and ``name``.

    Raises:
        KeyError: If ``allergies`` or ``surgeries`` is missing.
    """
    # Work on a copy so the caller's frame keeps its original list cells
    df = df.copy()

    # Convert lists to strings
    for column in ('allergies', 'surgeries'):
        df[column] = df[column].apply(
            lambda x: ','.join(x) if isinstance(x, (list, np.ndarray)) else x
        )

    # Drop unnecessary columns for modeling; tolerate frames that lack them
    # (for example hand-built prediction inputs)
    return df.drop(columns=['location', 'name'], errors='ignore')

# Preprocess the data
df_preprocessed = preprocess_insurance_data(df)

# Define feature columns and target column
feature_columns = df_preprocessed.columns[df_preprocessed.columns != 'insurance_plan']
target_column = 'insurance_plan'

# Separate features and target
# NOTE(review): patient_id is still among the features here (only location
# and name are dropped); it is a row identifier, so consider excluding it.
X = df_preprocessed[feature_columns]
y = df_preprocessed[target_column]

# Encoding pipeline
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4. Try the new name first
# and fall back to the old one so the notebook runs on either side.
try:
    categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Numeric columns pass through unchanged; categorical columns are one-hot
# encoded (unseen categories encode to zeros)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', categorical_encoder, categorical_features)
    ]
)

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Split data into training and testing sets (30% held out, fixed shuffle)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_test

Unnamed: 0,patient_id,age,past_medication_done,allergies,chronic_condition,surgeries,disease_condition,patient_condition,previous_insurance_claims
6500,6501,70,No,"Dust,Certain inhalers (e.g., beta-agonists)",Chronic Obstructive Pulmonary Disease (COPD),Lung volume reduction surgery,Chronic bronchitis,Recovering,0
2944,2945,77,Yes,"Certain inhalers (e.g., beta-agonists)",Chronic Obstructive Pulmonary Disease (COPD),Lung volume reduction surgery,Emphysema,Severe,0
2024,2025,22,Yes,"Contrast dyes,Certain anticoagulants (e.g., wa...",Heart Disease,Angioplasty and stent placement,Myocardial infarction,Recovering,1
263,264,43,Yes,"Sulfa drugs,Penicillin",Diabetes Mellitus,Bariatric surgery,Diabetic nephropathy,Severe,0
4350,4351,39,No,"Certain medications (e.g., ACE inhibitors),NSA...",Hypertension,Coronary artery bypass,Stroke,Moderate,0
...,...,...,...,...,...,...,...,...,...
5945,5946,29,No,"Certain anticoagulants (e.g., warfarin)",Heart Disease,Valve replacement or repair,Peripheral artery disease,Severe,0
1630,1631,23,No,"Dust,Certain inhalers (e.g., beta-agonists)",Chronic Obstructive Pulmonary Disease (COPD),Lung volume reduction surgery,Heart failure,Moderate,0
4043,4044,79,No,Mold,Chronic Obstructive Pulmonary Disease (COPD),Lung volume reduction surgery,Heart failure,Stable,1
1881,1882,64,No,"Certain inhalers (e.g., beta-agonists)",Chronic Obstructive Pulmonary Disease (COPD),Lung transplant,Respiratory infections,Stable,0


In [8]:
# Fit on the training split and predict the held-out split in one chain
# (Pipeline.fit returns the fitted pipeline)
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

# Per-class metrics first, then the raw confusion matrix
print("Insurance Prediction Evaluation:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")



Insurance Prediction Evaluation:
              precision    recall  f1-score   support

       Basic       0.32      0.34      0.33       685
     Premium       0.32      0.29      0.30       683
    Standard       0.33      0.33      0.33       732

    accuracy                           0.32      2100
   macro avg       0.32      0.32      0.32      2100
weighted avg       0.32      0.32      0.32      2100

Confusion Matrix:
[[234 202 249]
 [237 201 245]
 [256 234 242]]


In [9]:
import pandas as pd
import joblib


# Define custom input data.
# Category values must be spelled exactly as in the training data (matching is
# case-sensitive): the encoder ignores unknown categories, so 'yes' instead of
# 'Yes' would be silently encoded as all zeros.
# Allergies use the raw schema (a list of items per patient); preprocessing
# joins them with ',' in the given order, which must match a training value.
custom_input = pd.DataFrame({
    'patient_id': [3],
    'age': [44],
    'past_medication_done': ['Yes'],
    'allergies': [["Certain foods (e.g., shellfish, peanuts)", "Latex"]],
    'chronic_condition': ['Diabetes Mellitus'],
    'surgeries': ['Amputation'],
    'disease_condition': ['Diabetic nephropathy'],
    'patient_condition': ['Moderate'],
    'previous_insurance_claims': [0]  # Example value
})



# Preprocess the custom input data
def preprocess_custom_data(df):
    """Return a copy of ``df`` with list-valued cells flattened to strings.

    In the ``allergies`` and ``surgeries`` columns, list or ndarray cells
    become a single comma-joined string; any other cell is left as it is.
    No columns are dropped.

    The input frame is never modified. The original implementation wrote the
    joined strings back into the caller's DataFrame and returned that same
    object.

    Args:
        df: DataFrame containing at least ``allergies`` and ``surgeries``.

    Returns:
        A new DataFrame with the two columns flattened.

    Raises:
        KeyError: If ``allergies`` or ``surgeries`` is missing.
    """
    # Work on a copy so the caller's frame keeps its original list cells
    df = df.copy()

    for column in ('allergies', 'surgeries'):
        df[column] = df[column].apply(
            lambda x: ','.join(x) if isinstance(x, (list, np.ndarray)) else x
        )

    return df

# Flatten the list-valued columns of the custom input
custom_input_preprocessed = preprocess_custom_data(custom_input)

# Predict using the model pipeline (the fitted pipeline from the training cell)
custom_prediction = model_pipeline.predict(custom_input_preprocessed)

# Output the prediction for the single input row
print("Predicted Insurance Plan:", custom_prediction[0])


Predicted Insurance Plan: Premium


In [10]:

# Persist the fitted insurance pipeline (preprocessor + classifier) to disk
joblib.dump(model_pipeline, 'insurance_prediction.pkl')
print("Insurance prediction model pipeline saved as 'insurance_prediction.pkl'")

Insurance prediction model pipeline saved as 'insurance_prediction.pkl'


### 

# Resource checking

# 

In [50]:
import pandas as pd
import numpy as np
from faker import Faker

fake = Faker()

def generate_data(num_samples, seed=None):
    """Build a synthetic hospital-admission table, one row per patient.

    All columns are generated independently of each other. In particular the
    target ``admission_recommendation`` is drawn uniformly at random and does
    not depend on any resource or patient column.

    Args:
        num_samples: Number of patient rows to generate.
        seed: Optional integer. When given, NumPy's global RNG and the
            module-level Faker instance ``fake`` are seeded with it so the
            same table is produced on every run. ``None`` (default) leaves
            all RNG state untouched.

    Returns:
        pandas.DataFrame with the columns patient_id, name, age,
        doctor_availability, bed_availability, nurse_availability,
        patient_condition, previous_insurance_claims and
        admission_recommendation.
    """
    if seed is not None:
        # Both random sources used below must be seeded for reproducibility.
        # This resets global RNG state.
        np.random.seed(seed)
        fake.seed_instance(seed)

    # np.random.randint excludes the upper bound, so the integer ranges are
    # age 18..79, doctors 0..9, beds 0..99, nurses 0..19 and claims 0..4.
    data = {
        "patient_id": range(1, num_samples + 1),
        "name": [fake.name() for _ in range(num_samples)],
        "age": np.random.randint(18, 80, size=num_samples),
        "doctor_availability": np.random.randint(0, 10, size=num_samples),
        "bed_availability": np.random.randint(0, 100, size=num_samples),
        "nurse_availability": np.random.randint(0, 20, size=num_samples),
        "patient_condition": np.random.choice(["Critical", "Moderate", "Stable"], size=num_samples),
        "previous_insurance_claims": np.random.randint(0, 5, size=num_samples),
        "admission_recommendation": np.random.choice(["Admit to Hospital", "Refer to Other Hospital"], size=num_samples)  # Categorical target
    }
    return pd.DataFrame(data)

# Generate 7000 samples
df = generate_data(7000)

df

Unnamed: 0,patient_id,name,age,doctor_availability,bed_availability,nurse_availability,patient_condition,previous_insurance_claims,admission_recommendation
0,1,Kimberly King,39,1,14,10,Critical,0,Refer to Other Hospital
1,2,Lori Martin,71,2,37,6,Moderate,2,Admit to Hospital
2,3,Tracy Morgan,70,1,68,17,Moderate,4,Admit to Hospital
3,4,Hunter Fuller,75,7,4,16,Moderate,3,Refer to Other Hospital
4,5,Danielle Smith,50,9,81,13,Critical,0,Admit to Hospital
...,...,...,...,...,...,...,...,...,...
6995,6996,Scott Lewis,72,8,57,5,Stable,0,Admit to Hospital
6996,6997,Richard Jones,46,1,60,8,Stable,2,Admit to Hospital
6997,6998,Douglas Escobar,30,4,15,3,Critical,2,Refer to Other Hospital
6998,6999,Ashley Macdonald MD,70,8,16,11,Stable,4,Refer to Other Hospital


In [51]:
# Save as JSON Lines (one record per line)
df.to_json('hospital_admission_data.json', orient='records', lines=True)

In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
import joblib

# Reload the admission dataset from the JSON Lines file written earlier
df = pd.read_json('hospital_admission_data.json', orient='records', lines=True)

In [57]:


# Preprocess the data for admission prediction
def preprocess_admission_data(df):
    """Drop unused columns and integer-encode the two categorical columns.

    ``name``, ``previous_insurance_claims`` and ``patient_id`` are removed.
    ``patient_condition`` and ``admission_recommendation`` are each replaced
    by the integer codes of a LabelEncoder fitted on that column.

    Returns:
        A tuple ``(frame, condition_encoder, recommendation_encoder)``: the
        encoded DataFrame followed by the fitted encoders for
        ``patient_condition`` and ``admission_recommendation``, in that order.
    """
    unused_columns = ['name', 'previous_insurance_claims', 'patient_id']
    encoded = df.drop(columns=unused_columns)

    # One encoder per categorical column, fitted on the data it transforms
    condition_encoder = LabelEncoder()
    encoded['patient_condition'] = condition_encoder.fit_transform(encoded['patient_condition'])

    recommendation_encoder = LabelEncoder()
    encoded['admission_recommendation'] = recommendation_encoder.fit_transform(
        encoded['admission_recommendation']
    )

    return encoded, condition_encoder, recommendation_encoder

from sklearn.metrics import classification_report, confusion_matrix

# Encode the raw frame and keep both fitted encoders for later use
df_preprocessed, le_patient_condition, le_admission_recommendation = preprocess_admission_data(df)

# Target column name; every other column is a feature
target_column = 'admission_recommendation'
feature_columns = df_preprocessed.columns[df_preprocessed.columns != target_column]

# Label vector and feature matrix
y = df_preprocessed[target_column]
X = df_preprocessed[feature_columns]

# Hold out 20% of the rows for evaluation (fixed shuffle)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standard scaling followed by a 250-tree random forest, fitted on the
# training split
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=250, random_state=42)),
])
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline together with both label encoders
joblib.dump(pipeline, 'hospital_admission_model.pkl')
joblib.dump(le_patient_condition, 'label_encoder_patient_condition.pkl')
joblib.dump(le_admission_recommendation, 'label_encoder_admission_recommendation.pkl')

# Score the held-out split: per-class metrics, then the confusion matrix
y_pred = pipeline.predict(X_test)
print("Hospital Admission Prediction Evaluation:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Hospital Admission Prediction Evaluation:
              precision    recall  f1-score   support

           0       0.52      0.54      0.53       712
           1       0.50      0.49      0.50       688

    accuracy                           0.51      1400
   macro avg       0.51      0.51      0.51      1400
weighted avg       0.51      0.51      0.51      1400

Confusion Matrix:
[[381 331]
 [351 337]]


In [58]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import joblib

# Define sample custom data
custom_data = pd.DataFrame({
    'age': [30, 65, 50],  # Example ages
    'doctor_availability': [4, 1, 6],  # Example availability of doctors
    'bed_availability': [12, 30, 25],  # Example availability of beds
    'nurse_availability': [6, 2, 7],  # Example availability of nurses
    'patient_condition': ['Stable', 'Critical', 'Moderate'],  # Example patient conditions
})

# Load the trained model pipeline and LabelEncoders.
# The encoders are read back from the files written by the training cell so
# this cell does not depend on variables left over in the kernel.
pipeline = joblib.load('hospital_admission_model.pkl')
le_patient_condition = joblib.load('label_encoder_patient_condition.pkl')
le_admission_recommendation = joblib.load('label_encoder_admission_recommendation.pkl')


# Preprocess the sample data
def preprocess_custom_data(df):
    """Return an encoded copy of ``df`` ready for the admission pipeline.

    ``patient_id`` is removed when present, and ``patient_condition`` is
    converted to integer codes with the notebook-level ``le_patient_condition``
    encoder. The input frame is left unmodified.
    """
    # Drop the identifier if it is there, then detach from the caller's frame
    prepared = df.drop(columns=['patient_id'], errors='ignore').copy()

    # Encode categorical variables using the loaded label encoder
    condition_codes = le_patient_condition.transform(prepared['patient_condition'])
    prepared['patient_condition'] = condition_codes

    return prepared

custom_data_preprocessed = preprocess_custom_data(custom_data)

# Pass the DataFrame itself rather than `.values`. The pipeline was fitted on
# a DataFrame, and a bare array would discard the column names and rely only
# on column position (recent scikit-learn versions warn about this).
X_custom_data = custom_data_preprocessed

# Make predictions
predictions = pipeline.predict(X_custom_data)

# Convert the integer class codes back to the original label strings
admission_recommendations = le_admission_recommendation.inverse_transform(predictions)

# Attach the readable predictions to the sample inputs for display
custom_data['predicted_admission'] = admission_recommendations

print(custom_data)


   age  doctor_availability  bed_availability  nurse_availability  \
0   30                    4                12                   6   
1   65                    1                30                   2   
2   50                    6                25                   7   

  patient_condition      predicted_admission  
0            Stable        Admit to Hospital  
1          Critical        Admit to Hospital  
2          Moderate  Refer to Other Hospital  


