# Regression Modeling of Amount

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Enable the Intel(R) Extension for Scikit-learn.
# Keep this cell ahead of the cell that imports from `sklearn`.
# NOTE(review): per the sklearnex documentation the patch has to be applied
# before the estimators are imported for them to be accelerated — confirm.
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
from sklearn import set_config
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, PowerTransformer, StandardScaler

# Global setting: transformers return pandas DataFrames instead of NumPy arrays.
set_config(transform_output='pandas')

# Load Data

In [4]:
# Read the cleaned medical-malpractice table and preview it.
parquet_path = '../data/medicalmalpractice_cleaned.parquet'
df = pd.read_parquet(parquet_path)
df

Unnamed: 0,Amount,Severity,Age,Private Attorney,Marital Status,Specialty,Insurance,Gender
0,57041,7,62,Private,Married,Family Practice,Private,Male
1,324976,6,38,Private,Married,OBGYN,No Insurance,Female
2,135383,4,34,Private,Married,Cardiology,Unknown,Male
3,829742,7,42,Private,Single,Pediatrics,No Insurance,Female
4,197675,3,60,Not Private,Married,OBGYN,Medicare/Medicaid,Female
...,...,...,...,...,...,...,...,...
79205,25305,4,64,Private,Married,General Surgery,Unknown,Male
79206,43098,3,87,Not Private,Unknown,Orthopedic Surgery,Unknown,Male
79207,35398,3,81,Not Private,Unknown,Anesthesiology,Unknown,Male
79208,154228,9,19,Not Private,Unknown,Dermatology,Unknown,Female


In [5]:
# Separate the response column from the predictors.
target = 'Amount'

# Double brackets keep y as a one-column DataFrame rather than a Series.
y = df[[target]]
X = df.drop(columns=[target])

In [6]:
# NOTE(review): the hold-out split is disabled, so every search below is tuned
# and scored by cross-validation on the full X / y; no untouched test set is
# left for a final estimate.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Ridge regression on polynomial features with a power-transformed target,
# tuned by a 5-fold grid search over the Ridge penalty and the polynomial degree.

# Route columns to a preprocessing branch by dtype.
# NOTE(review): only `category`-dtype columns reach the encoder; this presumes
# the parquet file stores the text columns as pandas categoricals — confirm.
cat_cols = X.select_dtypes(include='category').columns
num_cols = X.select_dtypes(include=np.number).columns

# Categorical branch: fill gaps with a literal 'missing' level, then one-hot
# encode with the number of output categories per column capped at 10.
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                     ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False, max_categories=10))])

# Numeric branch: mean-impute, then standardise.
num_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean')),
                     ('scaler', StandardScaler())])

preprocessor = ColumnTransformer([('cat', cat_pipe, cat_cols),
                                  ('num', num_pipe, num_cols)])

# Polynomial expansion of the preprocessed features; the degree set here is
# only a starting value because the grid search overrides it.
poly = PolynomialFeatures(degree=2, include_bias=False)

# The target is power-transformed for fitting and mapped back for predictions
# by TransformedTargetRegressor.
power_transformer = PowerTransformer()
ridge = Ridge()

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', poly),
    ('regressor', TransformedTargetRegressor(regressor=ridge, transformer=power_transformer))
])

param_grid = {
    'regressor__regressor__alpha': [0.1, 1.0, 10.0],
    'poly__degree': [1, 2, 3]
}

# Score every candidate on both metrics in a single pass:
#   * neg_mean_squared_error picks (and refits) the winner,
#   * r2 is the metric reported as "CV scores" below.
# Previously R^2 came from a separate cross_val_score call with the implicit
# default scorer, which refitted the winning pipeline five more times on the
# very same folds. Note that cv_results_ keys are now suffixed with the metric
# name (e.g. 'mean_test_r2') instead of 'mean_test_score'.
n_folds = 5
grid_search = GridSearchCV(model_pipeline, param_grid, cv=n_folds,
                           scoring=['neg_mean_squared_error', 'r2'],
                           refit='neg_mean_squared_error')

grid_search.fit(X, y)

# Winning pipeline, refitted on all of X / y.
best_model = grid_search.best_estimator_

# Per-fold R^2 of the winning candidate, read back from the search results.
best_row = grid_search.best_index_
cv_scores = np.array([grid_search.cv_results_[f'split{fold}_test_r2'][best_row]
                      for fold in range(n_folds)])

print(f'Best parameters: {grid_search.best_params_}')
print(f'CV scores: {np.mean(cv_scores).round(2)}')



Best parameters: {'poly__degree': 3, 'regressor__regressor__alpha': 0.1}
CV scores: 0.12




In [8]:
# Report the Ridge grid search fitted in the previous cell.
#
# The search itself (preprocessing, pipeline, grid, data and folds) is defined
# and fitted once in the previous cell. Repeating it here reproduces exactly
# the same numbers at twice the run time, so this cell only reuses the
# `grid_search`, `best_model` and `cv_scores` objects that cell leaves behind.

# Guard against out-of-order execution: make sure the objects in scope really
# come from the Ridge search and not from a later model's search.
tuned_regressor = best_model.named_steps['regressor'].regressor
assert isinstance(tuned_regressor, Ridge), (
    'best_model does not wrap a Ridge regressor; re-run the Ridge grid-search cell first'
)

print(f'Best parameters: {grid_search.best_params_}')
print(f'CV scores: {np.mean(cv_scores).round(2)}')



Best parameters: {'poly__degree': 3, 'regressor__regressor__alpha': 0.1}
CV scores: 0.12




In [9]:
# Histogram gradient boosting on polynomial features with a power-transformed
# target, tuned by a 5-fold grid search over boosting iterations, learning
# rate and polynomial degree.

# Categorical branch: fill gaps with a literal 'missing' level, then one-hot
# encode with the number of output categories per column capped at 10.
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant',
                                               fill_value='missing')),
                     ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist',
                                               sparse_output=False,
                                               max_categories=10))])

# Numeric branch: mean-impute, then standardise.
num_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean')),
                     ('scaler', StandardScaler())])

# cat_cols / num_cols are the dtype-based column lists built in an earlier cell.
preprocessor = ColumnTransformer([('cat', cat_pipe, cat_cols),
                                  ('num', num_pipe, num_cols)])

# Polynomial expansion of the preprocessed features; the degree set here is
# only a starting value because the grid search overrides it.
poly = PolynomialFeatures(degree=2, include_bias=False)

# The target is power-transformed for fitting and mapped back for predictions
# by TransformedTargetRegressor.
power_transformer = PowerTransformer()

# Fix the seed: with this many rows (~79k) early stopping switches on
# automatically and holds out a random validation split, so an unseeded model
# gives different scores — and possibly different "best" parameters — per run.
# 42 matches the seed used in the (disabled) train/test split above.
model_type = HistGradientBoostingRegressor(random_state=42)

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', poly),
    ('regressor', TransformedTargetRegressor(regressor=model_type,
                                             transformer=power_transformer))
])

param_grid = {
    'regressor__regressor__max_iter': [100, 200],
    'regressor__regressor__learning_rate': [0.01, 0.1],
    'poly__degree': [1, 2, 3]
}

# Score every candidate on both metrics in a single pass:
#   * neg_mean_squared_error picks (and refits) the winner,
#   * r2 is the metric reported as "CV scores" below.
# Previously R^2 came from a separate cross_val_score call with the implicit
# default scorer, which refitted the winning pipeline five more times on the
# very same folds. Note that cv_results_ keys are now suffixed with the metric
# name (e.g. 'mean_test_r2') instead of 'mean_test_score'.
n_folds = 5
grid_search = GridSearchCV(model_pipeline, param_grid, cv=n_folds,
                           scoring=['neg_mean_squared_error', 'r2'],
                           refit='neg_mean_squared_error')

grid_search.fit(X, y)

# Winning pipeline, refitted on all of X / y.
best_model = grid_search.best_estimator_

# Per-fold R^2 of the winning candidate, read back from the search results.
best_row = grid_search.best_index_
cv_scores = np.array([grid_search.cv_results_[f'split{fold}_test_r2'][best_row]
                      for fold in range(n_folds)])

print(f'Best parameters: {grid_search.best_params_}')
print(f'CV scores: {np.mean(cv_scores).round(2)}')



Best parameters: {'poly__degree': 2, 'regressor__regressor__learning_rate': 0.1, 'regressor__regressor__max_iter': 200}
CV scores: 0.2


