# Introduction


In [24]:
# Data Manipulation and Exploration
import pandas as pd
import numpy as np
import datasist as ds
import openpyxl

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical Modeling
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Data Imputation
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

# Machine Learning: Regression
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Model Evaluation (Regression)
from sklearn.metrics import mean_squared_error, r2_score

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Model Persistence
from joblib import dump

# Machine Learning: Classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Model Evaluation (Classification)
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Model Selection and Hyperparameter Tuning
from sklearn.model_selection import train_test_split, GridSearchCV
# Warnings Management
import warnings
warnings.filterwarnings('ignore')

Load the dataset

In [35]:
data = pd.read_excel(r"C:\Users\BLESSING\Documents\esri\final_project\utilization_v2.xlsx")
ds.structdata.describe(data)

First five data points


Unnamed: 0,Enrolee,Gender,Marital Status,Age,Policy Start Date,Policy End Date,Base Premium,Smokes,Work Industry,Pre-existing Condition,...,Stomach Ulcer,No. of Hospital Visits,Consultation Charges (GP),Consultation Charges (SP),Cost of Drugs,Cost of Lab Services,No. of Lab Visits,No. of GP Visit,No. of SP Visit,Utilization
0,1,Male,Married,43,2023-01-01 00:00:00,2023-12-31 00:00:00,4907.9,No,Education,No,...,0,6.0,356.4,1442,2233.31,1137.25,2.0,3.0,5.0,6702.84
1,2,Male,Married,35,2023-01-01 00:00:00,2023-12-31 00:00:00,4907.9,No,Education,No,...,0,4.0,275.4,251,219.08,921.0,1.0,2.0,1.0,2754.48
2,3,Male,Single,16,2023-01-01 00:00:00,2023-12-31 00:00:00,4907.9,No,Education,No,...,0,3.0,,417,,,1.0,1.0,2.0,1112.0
3,4,Male,Married,54,2023-01-01 00:00:00,2023-12-31 00:00:00,4907.9,No,Education,Yes,...,0,25.0,1132.7,840,3241.21,905.0,5.0,11.0,5.0,6877.91
4,5,Male,Single,39,2023-01-01 00:00:00,2023-12-31 00:00:00,4907.9,No,Education,No,...,0,5.0,180.0,262,,1330.62,1.0,1.0,2.0,3686.62




Random five data points


Unnamed: 0,Enrolee,Gender,Marital Status,Age,Policy Start Date,Policy End Date,Base Premium,Smokes,Work Industry,Pre-existing Condition,...,Stomach Ulcer,No. of Hospital Visits,Consultation Charges (GP),Consultation Charges (SP),Cost of Drugs,Cost of Lab Services,No. of Lab Visits,No. of GP Visit,No. of SP Visit,Utilization
2482,2483,Female,Married,41,2023-02-01 00:00:00,2024-01-31 00:00:00,4344.78,No,Oil & Gas,Yes,...,0,,,,,,,,,
1319,1320,Male,Married,41,16/02/2023,15/05/2024,5028.66,No,NGO,No,...,0,,,,,,,,,
846,847,Male,Single,14,2023-06-01 00:00:00,2024-05-30 00:00:00,2856.0,No,Manufacturing,No,...,1,1.0,120.0,,1046.58,405.0,1.0,1.0,1.0,1571.58
1081,1082,Female,Single,30,2023-02-01 00:00:00,2024-01-31 00:00:00,2373.53,No,Healthcare,No,...,0,13.0,150.0,1060.0,3088.9,845.0,6.0,1.0,7.0,9078.9
638,639,Female,Married,49,2023-01-01 00:00:00,2023-12-31 00:00:00,3546.75,No,Manufacturing,No,...,0,23.0,1400.0,100.0,2580.44,923.0,6.0,20.0,1.0,5888.44




Last five data points


Unnamed: 0,Enrolee,Gender,Marital Status,Age,Policy Start Date,Policy End Date,Base Premium,Smokes,Work Industry,Pre-existing Condition,...,Stomach Ulcer,No. of Hospital Visits,Consultation Charges (GP),Consultation Charges (SP),Cost of Drugs,Cost of Lab Services,No. of Lab Visits,No. of GP Visit,No. of SP Visit,Utilization
2495,2496,Female,Single,27,2023-10-01 00:00:00,2024-09-29 00:00:00,1155.0,No,Education,No,...,0,2.0,,,261.5,,1.0,1.0,1.0,581.5
2496,2497,Female,Single,27,2023-10-01 00:00:00,2024-09-29 00:00:00,1155.0,No,Education,No,...,1,4.0,,240.0,186.2,358.0,2.0,1.0,2.0,1024.2
2497,2498,Female,Single,43,2023-10-01 00:00:00,2024-09-29 00:00:00,1155.0,No,Education,No,...,0,,,,,,,,,
2498,2499,Female,Single,22,2023-10-01 00:00:00,2024-09-29 00:00:00,1155.0,No,Education,No,...,0,2.0,40.0,,165.0,,1.0,1.0,1.0,525.0
2499,2500,Female,Single,29,2023-10-01 00:00:00,2024-09-29 00:00:00,1155.0,No,Education,No,...,0,,,,,,,,,




Shape of  data set: (2500, 26)


Size of  data set: 65000


Data Types
Note: All Non-numerical features are identified as objects in pandas


Unnamed: 0,Data Type
Enrolee,int64
Gender,object
Marital Status,object
Age,int64
Policy Start Date,object
Policy End Date,object
Base Premium,float64
Smokes,object
Work Industry,object
Pre-existing Condition,object




Numerical Features in Data set
['Enrolee', 'Age', ' Base Premium ', ' Premium after Risk Loading ', 'Hypertention', 'Diabetes', 'Dyslipidaemia/ Hyperlipidaemia', 'Refractive Error', 'Spondylosis', 'Stomach Ulcer', 'No. of Hospital Visits', ' Cost of Drugs ', 'Cost of Lab Services ', 'No. of Lab Visits', 'No. of GP Visit', 'No. of SP Visit', ' Utilization ']


Categorical Features in Data set


['Gender',
 'Marital Status',
 'Policy Start Date',
 'Policy End Date',
 'Smokes',
 'Work Industry',
 'Pre-existing Condition',
 ' Consultation Charges (GP) ',
 ' Consultation Charges (SP) ']



Statistical Description of Columns


Unnamed: 0,Enrolee,Age,Base Premium,Premium after Risk Loading,Hypertention,Diabetes,Dyslipidaemia/ Hyperlipidaemia,Refractive Error,Spondylosis,Stomach Ulcer,No. of Hospital Visits,Cost of Drugs,Cost of Lab Services,No. of Lab Visits,No. of GP Visit,No. of SP Visit,Utilization
count,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,1629.0,1505.0,1300.0,1610.0,1610.0,1610.0,1629.0
mean,1250.5,35.5524,4246.607996,4501.59824,0.372,0.1392,0.2356,0.4792,0.1156,0.1912,7.721301,1812.782133,764.795638,2.614286,3.52236,2.001863,3869.887649
std,721.83216,13.835484,955.073738,1096.152246,0.483435,0.346224,0.424458,0.499667,0.319809,0.393325,6.408756,2537.000656,737.842924,2.194897,3.344308,2.086238,4300.860751
min,1.0,0.0,1150.0,1150.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,20.0,1.0,1.0,1.0,90.0
25%,625.75,29.75,3546.75,3546.75,0.0,0.0,0.0,0.0,0.0,0.0,3.0,400.0,270.0,1.0,1.0,1.0,1245.0
50%,1250.5,37.0,4344.78,5028.66,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1004.5,534.425,2.0,2.0,1.0,2547.17
75%,1875.25,45.0,5028.66,5344.78,1.0,0.0,0.0,1.0,0.0,0.0,11.0,2219.0,1011.25,3.0,5.0,2.0,5270.3
max,2500.0,70.0,15070.69,15070.69,1.0,1.0,1.0,1.0,1.0,1.0,47.0,48770.23,7747.0,19.0,30.0,16.0,70383.44




Description of Categorical Features


AttributeError: module 'numpy' has no attribute 'object'.
`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe. 
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations