In [32]:
import pandas as pd
import numpy as np
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('Medical_data.csv')

In [33]:
# Disable row truncation for displayed frames/series (None = no limit).
# NOTE(review): this is a global setting — any later cell that displays a whole
# frame will render every row.
pd.set_option('display.max_rows', None)


In [34]:
# Preview the first rows of the raw frame.
# NOTE(review): the rendered preview includes the Name column — confirm the
# records are synthetic/anonymised before sharing the notebook with outputs.
data.head()

Unnamed: 0,Id,Name,Gender,Age,ICD-10 Code,Procedure Description,Previous Surgeries,Length_of_Stay,Number_of_Procedures,Lab_Results,Imaging_Results,Insurance_claim_percentage,Bill_Amount
0,788,Aaryan Nair,Male,55,S52.512A,Fracture of Shaft of Radius,Yes,5,2,Abnormal,Normal,80,408000.0
1,330,Matteo Patel,Male,48,K56.60,Irritable Bowel Syndrome Management,Yes,7,3,Normal,Normal,100,300000.0
2,165,Lucas Kumar,Male,27,J45.909,Severe Asthma Treatment,No,3,1,Normal,Normal,100,160000.0
3,17,Krishna Kumar,Male,48,S72.309A,Fracture of Sacrum,Yes,5,2,Abnormal,Abnormal,60,420000.0
4,303,Matteo Patel,Male,48,G44.209,Tension Headache Management,Yes,6,3,Abnormal,Normal,70,296000.0


In [35]:
# (rows, columns) of the raw frame.
data.shape

(3204, 13)

In [36]:
# Number of distinct ICD-10 codes and procedure descriptions
# (dropna=False so a missing value would still be counted, as with unique()).
data['ICD-10 Code'].nunique(dropna=False), data['Procedure Description'].nunique(dropna=False)

(50, 50)

In [37]:
# Array of the distinct ICD-10 codes.
# NOTE(review): `s` is not referenced again in the visible notebook — candidate for removal.
s = data['ICD-10 Code'].unique()

In [38]:
# Columns removed from the frame before modelling (they are dropped in the next cell).
# NOTE(review): 'Procedure Description' has the same number of distinct values as
# 'ICD-10 Code' (50 each, per the count above) — presumably a 1:1 mapping, which
# would make it redundant; confirm against the data.
to_drop = [
    'Id',
    'Name',
    'Gender',
    'Procedure Description',
    'Insurance_claim_percentage',
]


In [39]:
# Remove the columns listed in to_drop. The result is rebound to `data` rather
# than mutating the frame in place, so the step reads as an explicit
# transformation and follows the pandas recommendation to avoid inplace=True.
data = data.drop(columns=to_drop)

In [40]:
# Preview the frame after the columns in to_drop have been removed.
data.head()

Unnamed: 0,Age,ICD-10 Code,Previous Surgeries,Length_of_Stay,Number_of_Procedures,Lab_Results,Imaging_Results,Bill_Amount
0,55,S52.512A,Yes,5,2,Abnormal,Normal,408000.0
1,48,K56.60,Yes,7,3,Normal,Normal,300000.0
2,27,J45.909,No,3,1,Normal,Normal,160000.0
3,48,S72.309A,Yes,5,2,Abnormal,Abnormal,420000.0
4,48,G44.209,Yes,6,3,Abnormal,Normal,296000.0


In [41]:
# Count missing values per column.
data.isnull().sum()

Age                     0
ICD-10 Code             0
Previous Surgeries      0
Length_of_Stay          0
Number_of_Procedures    0
Lab_Results             0
Imaging_Results         0
Bill_Amount             1
dtype: int64

In [42]:
# Drop every row that has a missing value; per the null counts above only
# Bill_Amount has a missing entry (one row).
data = data.dropna()

In [43]:
# Re-check the per-column missing-value counts after dropna().
data.isnull().sum()

Age                     0
ICD-10 Code             0
Previous Surgeries      0
Length_of_Stay          0
Number_of_Procedures    0
Lab_Results             0
Imaging_Results         0
Bill_Amount             0
dtype: int64

In [44]:
import plotly.graph_objects as go

# Frequency of each ICD-10 code, computed once. value_counts() returns the
# counts in descending order with the codes as the index.
icd_code_counts = data['ICD-10 Code'].value_counts()

# Bar chart: one bar per ICD-10 code, bar height = number of rows with that code.
fig = go.Figure()
fig.add_trace(go.Bar(x=icd_code_counts.index, y=icd_code_counts, marker_color='rosybrown'))
# NOTE(review): the y-axis range is hard-coded to [0, 140]; a code with more
# rows than that would be clipped.
fig.update_layout(title='Graphical representation of ICD-10_Code ', xaxis_title='ICD-10_Code', yaxis_title='Number of counts', yaxis_range=[0, 140])

# Show the chart
fig.show()

In [45]:
def _categories_by_mean_bill(column):
    """Return the distinct values of `column`, highest mean Bill_Amount first."""
    mean_bill = data.groupby(column).agg({'Bill_Amount': 'mean'})
    return mean_bill.sort_values(by='Bill_Amount', ascending=False).index.tolist()


# Category orderings consumed by the ordinal encoders below.
# NOTE(review): these are derived from the target on the full `data` frame,
# i.e. before the train/test split performed later in the notebook, so the
# ordering also reflects rows that end up in the test set.
lab_result_order = _categories_by_mean_bill('Lab_Results')
imaging_order = _categories_by_mean_bill('Imaging_Results')
ICD_10_Code_order = _categories_by_mean_bill('ICD-10 Code')
Previous_Surgeries_order = _categories_by_mean_bill('Previous Surgeries')


In [46]:
from sklearn.preprocessing import  OrdinalEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split

In [47]:

# One ordinal encoder per categorical feature, each pinned to the category
# order computed above (position in the list becomes the encoded value).
lab_encoder, imaging_encoder, ICD_10_Code_encoder, Previous_Surgeries_encoder = (
    OrdinalEncoder(categories=[category_order])
    for category_order in (
        lab_result_order,
        imaging_order,
        ICD_10_Code_order,
        Previous_Surgeries_order,
    )
)

In [48]:
# Split the frame into the feature matrix and the target vector (NumPy array).
target_column = 'Bill_Amount'
X = data.drop(columns=target_column)
y = data[target_column].to_numpy()

In [49]:
# Hold out 25% of the rows as the test set; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

In [50]:
# Fit each encoder on its training column and overwrite the column with the codes.
for column, encoder in (
    ('Lab_Results', lab_encoder),
    ('Imaging_Results', imaging_encoder),
    ('ICD-10 Code', ICD_10_Code_encoder),
    ('Previous Surgeries', Previous_Surgeries_encoder),
):
    X_train[column] = encoder.fit_transform(X_train[[column]].values)

In [51]:
# Apply the already-fitted encoders to the test columns (transform only, no refit).
for column, encoder in (
    ('Lab_Results', lab_encoder),
    ('Imaging_Results', imaging_encoder),
    ('ICD-10 Code', ICD_10_Code_encoder),
    ('Previous Surgeries', Previous_Surgeries_encoder),
):
    X_test[column] = encoder.transform(X_test[[column]].values)

In [52]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [53]:
def summarize_model(model):
    """Fit `model` on the training split and report R2 / MAE on both splits.

    Reads the notebook-level X_train, y_train, X_test and y_test.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X).

    Returns
    -------
    (model, metrics_dict) : the same estimator after fitting, and a dict with
        the keys 'train r2', 'test r2', 'train mae', 'test mae', each mapped
        to a one-element list so the dict converts directly to a one-row frame.
    """
    model.fit(X_train, y_train)

    # Predict once per split and reuse the predictions for both metrics.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    metrics_dict = {
        'train r2': [r2_score(y_true=y_train, y_pred=train_pred)],
        'test r2': [r2_score(y_true=y_test, y_pred=test_pred)],
        'train mae': [mean_absolute_error(y_true=y_train, y_pred=train_pred)],
        'test mae': [mean_absolute_error(y_true=y_test, y_pred=test_pred)],
    }

    # Render the metrics as a one-row table in the cell output.
    display(pd.DataFrame(metrics_dict))
    return model, metrics_dict

In [54]:
# XGBoost regressor (squared-error objective).
# Note: despite its name, xgb_dict holds the fitted model returned by summarize_model.
xgb_dict, xgb_metrics = summarize_model(XGBRegressor(objective='reg:squarederror'))
xgb_train_score, xgb_test_score = (xgb_metrics[key][0] for key in ('train r2', 'test r2'))

Unnamed: 0,train r2,test r2,train mae,test mae
0,0.982105,0.915616,4644.279484,11587.510887


In [55]:
# Ordinary least-squares linear regression.
# Note: despite its name, lr_dict holds the fitted model returned by summarize_model.
lr_dict, lr_metrics = summarize_model(LinearRegression())
lr_train_score, lr_test_score = (lr_metrics[key][0] for key in ('train r2', 'test r2'))

Unnamed: 0,train r2,test r2,train mae,test mae
0,0.648476,0.659771,59785.304435,58545.013342


In [56]:
# Gradient boosting regressor.
# random_state is pinned (same seed as the train/test split) so the reported
# metrics are reproducible; without it the scores can change between runs.
# Note: despite its name, gb_dict holds the fitted model returned by summarize_model.
gb_dict, gb_metrics = summarize_model(GradientBoostingRegressor(random_state=42))
gb_train_score = gb_metrics['train r2'][0]
gb_test_score = gb_metrics['test r2'][0]

Unnamed: 0,train r2,test r2,train mae,test mae
0,0.896015,0.874726,25921.961447,28475.378111


In [57]:
# AdaBoost regressor.
# random_state is pinned (same seed as the train/test split) so the reported
# metrics are reproducible; without it the scores can change between runs.
# Note: despite its name, adaboost_dict holds the fitted model returned by summarize_model.
adaboost_dict, adaboost_metrics = summarize_model(AdaBoostRegressor(random_state=42))
adaboost_train_score = adaboost_metrics['train r2'][0]
adaboost_test_score = adaboost_metrics['test r2'][0]

Unnamed: 0,train r2,test r2,train mae,test mae
0,0.666794,0.674738,63148.213151,63577.494748


In [58]:
# Random forest regressor.
# random_state is pinned (same seed as the train/test split) so the bootstrap
# samples, and therefore the reported metrics, are reproducible.
# Note: despite its name, rf_dict holds the fitted model returned by summarize_model.
rf_dict, rf_metrics = summarize_model(RandomForestRegressor(random_state=42))
rf_train_score = rf_metrics['train r2'][0]
rf_test_score = rf_metrics['test r2'][0]

Unnamed: 0,train r2,test r2,train mae,test mae
0,0.978861,0.925671,5474.870597,11109.613332


In [59]:
# Decision tree regressor.
# random_state is pinned (same seed as the train/test split) so ties between
# equally good splits are broken the same way on every run.
# Note: despite its name, dt_dict holds the fitted model returned by summarize_model.
dt_dict, dt_metrics = summarize_model(DecisionTreeRegressor(random_state=42))
dt_train_score = dt_metrics['train r2'][0]
dt_test_score = dt_metrics['test r2'][0]

Unnamed: 0,train r2,test r2,train mae,test mae
0,0.985006,0.905015,2960.928026,11331.795989


In [60]:
import plotly.graph_objects as go

# R2 of every model on both splits; the two score lists are in the same order as model_names.
model_names = ['AdaBoostRegressor', 'LinearRegression', 'GradientBoostingRegressor', 'RandomForestRegressor', 'DecisionTreeRegressor','XGBRegressor']
model_train_accuracies = [adaboost_train_score, lr_train_score, gb_train_score, rf_train_score, dt_train_score, xgb_train_score]
model_test_accuracies = [adaboost_test_score, lr_test_score, gb_test_score, rf_test_score, dt_test_score, xgb_test_score]

# Bar chart with one train bar and one test bar per model.
fig = go.Figure()
fig.add_trace(go.Bar(x=model_names, y=model_train_accuracies, marker_color='skyblue', name='Train R2-Scores'))
# Colour name written without the stray leading space so it matches the line chart's 'lightgreen'.
fig.add_trace(go.Bar(x=model_names, y=model_test_accuracies, marker_color='lightgreen', name='Test R2-Scores'))

fig.update_xaxes(tickangle=45)
# The plotted quantity is R2, not accuracy, so the axis is labelled accordingly.
fig.update_layout(title='Comparison of R2-Scores', xaxis_title='Models', yaxis_title='R2-Score', yaxis_range=[0, 1.0])
# Show the chart
fig.show()


In [61]:
import plotly.graph_objects as go

# R2 of every model on both splits; the two score lists are in the same order as model_names.
model_names = ['AdaBoostRegressor', 'LinearRegression', 'GradientBoostingRegressor', 'RandomForestRegressor', 'DecisionTreeRegressor', 'XGBRegressor']
model_train_accuracies = [adaboost_train_score, lr_train_score, gb_train_score, rf_train_score, dt_train_score, xgb_train_score]
model_test_accuracies = [adaboost_test_score, lr_test_score, gb_test_score, rf_test_score, dt_test_score, xgb_test_score]

# One line (with markers) per split: train first, then test.
fig = go.Figure()
for trace_name, scores, colour in (
    ('Train R2-Scores', model_train_accuracies, 'skyblue'),
    ('Test R2-Scores', model_test_accuracies, 'lightgreen'),
):
    fig.add_trace(go.Scatter(x=model_names, y=scores, mode='lines+markers', name=trace_name, line=dict(color=colour)))

# The y-axis is restricted to [0.6, 1.0] to magnify the differences between models.
fig.update_layout(title='Comparison of R2-Scores', xaxis_title='Models', yaxis_title='R2-Score', yaxis_range=[0.6, 1.0])

# Show the chart
fig.show()


In [62]:
# Collect one patient's details interactively. The free-text answers are
# stripped so stray leading/trailing whitespace does not make the encoders
# reject an otherwise valid category. A value the encoders were not fitted on
# still raises ValueError from transform().
age = int(input('Enter your age -> '))
stay = int(input('Enter the estimated stay doctor gave you ->'))
procedures = int(input('How many procedures are you undertaking -> '))
code = input('which ICD_code_10 doctor suggested ->').strip()
previous_surgeries = input('previous_surgeries ->').strip()
lab = input('how were your lab results -> ').strip()
imaging = input('how were your imaging results -> ').strip()

# Single-row frame with the same columns, in the same order, as the training features.
sample = {
    'Age': age,
    'ICD-10 Code': code,
    'Previous Surgeries': previous_surgeries,
    'Length_of_Stay': stay,
    'Number_of_Procedures': procedures,
    'Lab_Results': lab,
    'Imaging_Results': imaging
}
sample_df = pd.DataFrame(sample, index=[0])

# Encode the categorical answers with the encoders fitted on the training split.
for column, encoder in (
    ('Lab_Results', lab_encoder),
    ('Imaging_Results', imaging_encoder),
    ('Previous Surgeries', Previous_Surgeries_encoder),
    ('ICD-10 Code', ICD_10_Code_encoder),
):
    sample_df[column] = encoder.transform(sample_df[[column]].values)

# xgb_dict is the fitted XGBoost model returned by summarize_model.
model = xgb_dict
pred = model.predict(sample_df)
# predict() returns one value per row; the sample has exactly one row.
print("Expected Amount Rs.",int(pred[0]),sep="")

Enter your age -> 23
Enter the estimated stay doctor gave you ->4
How many procedures are you undertaking -> 2
which ICD_code_10 doctor suggested ->M17.9
previous_surgeries ->Yes
how were your lab results -> Normal
how were your imaging results -> Normal
Expected Amount Rs.313047
