# Regression Modeling

In [16]:
import numpy as np
import pandas as pd

In [17]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [18]:
from sklearn import set_config
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer

set_config(transform_output='pandas')

In [19]:
data = './data/law_group_case_data_with_noise_and_outliers.xlsx'

df = pd.read_excel(data)
df

Unnamed: 0,case_type,case_status,marketing_source,injury_type,medical_expense_amount,potential_settlement_amount,attorney_name,injury_severity_score,client_age,client_gender,case_priority,insurance_claim_made,insured,next_action
0,Personal Injury,Open,TV,Soft Tissue Injury,23517,86381,Sarah K. Connor,1,26,Non-binary,Medium,True,0,Await Documents
1,Medical Malpractice,Open,Online,Laceration,25015,58349,William J. Thompson,4,21,Male,Low,True,1,Schedule Meeting
2,Personal Injury,In Progress,Radio,Soft Tissue Injury,104282,83816,Elizabeth M. Harris,2,87,Male,Medium,False,0,Await Documents
3,Medical Malpractice,Settled,Referral,Laceration,10403,99481,Jonathan P. Reed,9,45,Non-binary,High,True,0,Follow up
4,Personal Injury,Closed,Referral,Fracture,15198,85524,Elizabeth M. Harris,1,66,Non-binary,Medium,False,0,Schedule Meeting
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Personal Injury,Closed,Online,Burn,11765,85042,William J. Thompson,4,58,Female,Medium,True,0,Await Documents
49996,Personal Injury,Open,Billboard,Soft Tissue Injury,7062,91998,Michael A. Stevens,3,76,Male,Low,True,0,Await Documents
49997,Medical Malpractice,Open,TV,Fracture,14490,71351,William J. Thompson,8,30,Non-binary,Low,True,0,Schedule Meeting
49998,Medical Malpractice,Settled,Billboard,Laceration,16181,71712,Michael A. Stevens,5,75,Male,Low,True,0,Follow up


In [20]:
df.isna().sum()

case_type                      0
case_status                    0
marketing_source               0
injury_type                    0
medical_expense_amount         0
potential_settlement_amount    0
attorney_name                  0
injury_severity_score          0
client_age                     0
client_gender                  0
case_priority                  0
insurance_claim_made           0
insured                        0
next_action                    0
dtype: int64

In [21]:
drop_cols = ['case_status']
df = df.drop(columns = drop_cols)
df

Unnamed: 0,case_type,marketing_source,injury_type,medical_expense_amount,potential_settlement_amount,attorney_name,injury_severity_score,client_age,client_gender,case_priority,insurance_claim_made,insured,next_action
0,Personal Injury,TV,Soft Tissue Injury,23517,86381,Sarah K. Connor,1,26,Non-binary,Medium,True,0,Await Documents
1,Medical Malpractice,Online,Laceration,25015,58349,William J. Thompson,4,21,Male,Low,True,1,Schedule Meeting
2,Personal Injury,Radio,Soft Tissue Injury,104282,83816,Elizabeth M. Harris,2,87,Male,Medium,False,0,Await Documents
3,Medical Malpractice,Referral,Laceration,10403,99481,Jonathan P. Reed,9,45,Non-binary,High,True,0,Follow up
4,Personal Injury,Referral,Fracture,15198,85524,Elizabeth M. Harris,1,66,Non-binary,Medium,False,0,Schedule Meeting
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Personal Injury,Online,Burn,11765,85042,William J. Thompson,4,58,Female,Medium,True,0,Await Documents
49996,Personal Injury,Billboard,Soft Tissue Injury,7062,91998,Michael A. Stevens,3,76,Male,Low,True,0,Await Documents
49997,Medical Malpractice,TV,Fracture,14490,71351,William J. Thompson,8,30,Non-binary,Low,True,0,Schedule Meeting
49998,Medical Malpractice,Billboard,Laceration,16181,71712,Michael A. Stevens,5,75,Male,Low,True,0,Follow up


In [22]:
target = 'potential_settlement_amount'

X = df.drop(columns = target)
y = df[target]

In [25]:
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['number']).columns

In [28]:
## Create separate pipelines for numerical and categorical columns

cat_pipeline = Pipeline([('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist',
                                                   sparse_output=False,
                                                   min_frequency=10,))])
                                                #    max_categories = 25))])

num_pipeline = Pipeline([('scaler', StandardScaler())]) 
# num_pipeline = Pipeline([('scaler', PowerTransformer(method='yeo-johnson'))])  

## Combine transformers into a single ColumnTransformer
preprocessor = ColumnTransformer(transformers=[('num', num_pipeline, num_cols),
                                               ('cat', cat_pipeline, cat_cols)])

model = RandomForestRegressor(random_state=42)

model_pipeline = Pipeline([["preprocessor", preprocessor],
                            ["model", model]])

model_pipeline.fit(X, y)

# Evaluate the best model using cross_val_score
cv_scores = cross_val_score(model_pipeline, X, y, n_jobs=-1)

print(f'\nCross-Validation Scores: {cv_scores}\n'
      f'Mean CV Score: {cv_scores.mean():,.3f}\n'
      f'Standard Deviation of CV Scores: {cv_scores.std():,.3f}\n')



Cross-Validation Scores: [-0.05781104 -0.0646084  -0.0389154  -0.03596427 -0.04639688]
Mean CV Score: -0.049
Standard Deviation of CV Scores: 0.011

