In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from imblearn.over_sampling import SMOTE
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Load dataset
hdn = pd.read_csv('https://github.com/2256haradityam/dataset/raw/main/hdn.csv')


# Remove outliers based on the Interquartile Range (IQR)
Q1 = hdn.quantile(0.25)
Q3 = hdn.quantile(0.75)
IQR = Q3 - Q1
hdn = hdn[~((hdn < (Q1 - 1.5 * IQR)) | (hdn > (Q3 + 1.5 * IQR))).any(axis=1)]

# Check for correlation and drop less significant columns
# Separate features and target variable
x = hdn.drop('charges', axis=1)
y = hdn['charges']

# Identify numerical and categorical columns
numerical_cols = x.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = x.select_dtypes(include=['object']).columns

# Preprocessing pipelines for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Apply preprocessing
x_preprocessed = preprocessor.fit_transform(x)

# Split dataset into training, validation, and test sets
x_train, x_test, y_train, y_test = train_test_split(x_preprocessed, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# Define the number of folds for cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Define a scorer for cross-validation
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'K-Neighbors': KNeighborsRegressor(),
    'XGBoost': XGBRegressor(objective='reg:squarederror')
}

# Train models using k-fold cross-validation and evaluate them
evaluation_results = {}

for name, model in models.items():
    # Cross-validation
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scorer)
    model.fit(x_train, y_train)

    # Predict on validation and test set
    y_val_pred = model.predict(x_val)
    y_test_pred = model.predict(x_test)

    # Calculate metrics
    evaluation_results[name] = {
        'VMAE': mean_absolute_error(y_val, y_val_pred),
        'MAE': mean_absolute_error(y_test, y_test_pred),
        'MSE': mean_squared_error(y_test, y_test_pred),
        'R²': r2_score(y_test, y_test_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_test_pred)),
        'Cross-Validated MAE': -cv_results.mean()
    }

# Display evaluation results
for name, metrics in evaluation_results.items():
    print(f"Model: {name}")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

# Determine the best model based on cross-validated MAE
best_model_name = min(evaluation_results, key=lambda x: evaluation_results[x]['Cross-Validated MAE'])
best_model = models[best_model_name]

print(f"The best model is: {best_model_name}")


Model: Linear Regression
  VMAE: 3444.3239
  MAE: 3494.8609
  MSE: 24868266.3992
  R²: 0.3557
  RMSE: 4986.8092
  Cross-Validated MAE: 3792.5752
Model: Ridge Regression
  VMAE: 3444.7731
  MAE: 3494.4672
  MSE: 24861371.6619
  R²: 0.3559
  RMSE: 4986.1179
  Cross-Validated MAE: 3792.0919
Model: Lasso Regression
  VMAE: 3444.4140
  MAE: 3494.7869
  MSE: 24866928.3599
  R²: 0.3557
  RMSE: 4986.6751
  Cross-Validated MAE: 3792.5088
Model: Decision Tree
  VMAE: 3146.3254
  MAE: 3292.5112
  MSE: 23764586.8569
  R²: 0.3843
  RMSE: 4874.8935
  Cross-Validated MAE: 3468.7892
Model: Random Forest
  VMAE: 3144.4433
  MAE: 3275.0167
  MSE: 23614206.4013
  R²: 0.3882
  RMSE: 4859.4451
  Cross-Validated MAE: 3488.6537
Model: Gradient Boosting
  VMAE: 3148.2606
  MAE: 3334.2157
  MSE: 23723440.9080
  R²: 0.3854
  RMSE: 4870.6715
  Cross-Validated MAE: 3481.0552
Model: K-Neighbors
  VMAE: 2862.3188
  MAE: 2932.5581
  MSE: 25745116.2791
  R²: 0.3330
  RMSE: 5073.9646
  Cross-Validated MAE: 3473.3178
M

In [None]:
# Fit the XGBoost model to the training data
model = XGBRegressor(objective='reg:squarederror')
model.fit(x_train, y_train)  # Fit the model here

# Now you can extract feature importances
feature_importances = model.feature_importances_

# Get feature names from the preprocessor
# (assuming 'preprocessor' is the ColumnTransformer object you defined earlier)
feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame to view feature importances
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df)

              Feature  Importance
0  num__Hospital tier    0.857985
2            num__age    0.142015
1       num__State ID    0.000000
3         num__smoker    0.000000


In [None]:
print(x_train)

[[-0.33110189  0.          0.92005983  0.        ]
 [-0.33110189  0.          1.65473767  0.        ]
 [-0.33110189  0.          0.55272091  0.        ]
 ...
 [-0.33110189  0.         -0.91663476  0.        ]
 [-0.33110189  0.          1.65473767  0.        ]
 [ 1.444432    0.          1.65473767  0.        ]]


In [None]:
# Provided data to be cleaned and used for prediction
data = {
    'Name': 'Christopher, Ms. Jayna',
    'Date of Birth': '12/28/1988',
    'Height': 170,
    'Weight': 85,
    'City Tier': 'tier-1',
    'State ID': 'R1011',
    'Diabetic': False,
    'HbA1c': 5.8,
    'Smoker': True,
    'Health Status': 'Healthy',
    'Major Surgeries': False,
    'Father Cause of Death': 'Lung Cancer'
}

# Function to calculate age based on date of birth
def calculate_age(dob, current_year=2024):
    birth_year = int(dob.split('/')[-1])
    return current_year - birth_year

# Cleaned data with only necessary features for prediction
cleaned_data = {
    'Hospital tier': 1 if data['City Tier'] == 'tier-1' else 0,
    'age': calculate_age(data['Date of Birth']),
    'State ID': 0,
    'smoker': 0
}

# Convert cleaned data to DataFrame
input_data = pd.DataFrame([cleaned_data])

# Ensure the input data matches the model's expected format
x_input = preprocessor.transform(input_data)

# Make predictions using the best model
predicted_cost = best_model.predict(x_input)

print(f"Estimated Hospitalization Cost for {data['Name']}: ${predicted_cost[0]:.2f}")


Estimated Hospitalization Cost for Christopher, Ms. Jayna: $28000.00


In [None]:
hdn.info()

<class 'pandas.core.frame.DataFrame'>
Index: 858 entries, 294 to 2321
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   charges        858 non-null    int64
 1   Hospital tier  858 non-null    int64
 2   State ID       858 non-null    int64
 3   age            858 non-null    int64
 4   smoker         858 non-null    int64
dtypes: int64(5)
memory usage: 40.2 KB
