In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
import joblib
import statsmodels.api as sm

In [2]:
data = pd.read_csv('insurance.csv')
data = data.dropna()
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


In [3]:
data["sex"] = data["sex"].map({"male": 0, "female": 1})
data['smoker'] = data["smoker"].map({"no": 0, "yes": 1})
scaler = StandardScaler()
data["charges"] = np.log(data["charges"])
scaler = StandardScaler()
data[['age', 'bmi']] = scaler.fit_transform(data[['age', 'bmi']])
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,-1.438764,1,-0.45332,0,1,9.734176
1,-1.509965,0,0.509621,1,0,7.453302
2,-0.797954,0,0.383307,3,0,8.400538
3,-0.441948,0,-1.305531,0,0,9.998092
4,-0.513149,0,-0.292556,0,0,8.260197


In [6]:
X = data.drop('charges', axis=1)
y = data['charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [7]:
kmeans = KMeans(n_clusters=4, random_state=42)
X_train['cluster'] = kmeans.fit_predict(X_train)

X_test['cluster'] = kmeans.predict(X_test)

In [8]:
RFG = RandomForestRegressor(n_estimators=100, random_state=42)
RFG.fit(X_train, y_train)

LR = LinearRegression()
LR.fit(X_train, y_train)

DTR = DecisionTreeRegressor()
DTR.fit(X_train, y_train)

GBR = GradientBoostingRegressor()
GBR.fit(X_train, y_train)

GPR = GaussianProcessRegressor()
GPR.fit(X_train, y_train)

In [13]:
y_pred_RFG = RFG.predict(X_test)
y_pred_LR = LR.predict(X_test)
y_pred_DTR = DTR.predict(X_test)
y_pred_GBR = GBR.predict(X_test)
y_pred_GPR = GPR.predict(X_test)

print('RandomForestRegressor:', mean_squared_error(y_test, y_pred_RFG))
print('LinearRegression:', mean_squared_error(y_test, y_pred_LR))
print('DecisionTreeRegressor:', mean_squared_error(y_test, y_pred_DTR))
print('GradientBoostingRegressor:', mean_squared_error(y_test, y_pred_GBR))
print('GaussianProcessRegressor:', mean_squared_error(y_test, y_pred_GPR))

print()

print('RandomForestRegressor R²:', r2_score(y_test, y_pred_RFG))
print('LinearRegression R²:', r2_score(y_test, y_pred_LR))
print('DecisionTreeRegressor R²:', r2_score(y_test, y_pred_DTR))
print('GradientBoostingRegressor R²:', r2_score(y_test, y_pred_GBR))
print('GaussianProcessRegressor R²:', r2_score(y_test, y_pred_GPR))


RandomForestRegressor: 0.17262432340634873
LinearRegression: 0.1831912524862502
DecisionTreeRegressor: 0.2940067298050001
GradientBoostingRegressor: 0.12267089141388508
GaussianProcessRegressor: 2484.8408662820702

RandomForestRegressor R²: 0.8006945066251284
LinearRegression R²: 0.7884943312838498
DecisionTreeRegressor R²: 0.6605509861933916
GradientBoostingRegressor R²: 0.8583688436627321
GaussianProcessRegressor R²: -2867.9029740415913


array([0.39299227, 0.00498757, 0.05505171, 0.04091951, 0.50412004,
       0.0019289 ])

In [14]:
pred_results = pd.DataFrame()
pred_results['Actual'] = y_test
pred_results['RandomForestRegressor'] = y_pred_RFG
pred_results['LinearRegression'] = y_pred_LR
pred_results['DecisionTreeRegressor'] = y_pred_DTR
pred_results['GradientBoostingRegressor'] = y_pred_GBR
pred_results['GaussianProcessRegressor'] = y_pred_GPR

pred_results.head()

Unnamed: 0,Actual,RandomForestRegressor,LinearRegression,DecisionTreeRegressor,GradientBoostingRegressor,GaussianProcessRegressor
764,9.115488,9.150711,9.056752,9.060039,9.189983,12.261429
887,8.570198,8.492836,8.614607,8.606852,8.632827,33.06033
890,10.2864,10.25376,11.122873,10.273342,10.231925,9.971445
1293,9.137973,9.156843,9.130211,9.159107,9.132984,9.96836
259,10.426744,10.445094,9.553678,10.426223,10.36941,9.923532


In [9]:
new_data=pd.DataFrame({
    'age':[19],
    'sex':[1],
    'bmi':[27.9],
    'children':[0],
    'smoker':[0],
})

new_data[['age', 'bmi']] = scaler.transform(new_data[['age', 'bmi']])
kmeans.predict(new_data)
new_data['cluster'] = kmeans.predict(new_data)
print(GBR.predict(np.exp(new_data)))

[10.61978402]


In [36]:
testData =pd.DataFrame(
    {
        'age': [32],
        'sex': [1],
        'bmi': [33],
        'children': [2],
        'smoker': [1],
    }
)

testData[['age', 'bmi']] = scaler.transform(testData[['age', 'bmi']])
kmeans.predict(testData)
testData['cluster'] = kmeans.predict(testData)
results = GBR.predict(testData)
print(np.exp(results))

[38658.95861272]


In [10]:
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(kmeans, 'kmeans.pkl')
joblib.dump(GBR, 'gbr_model.pkl')

['gbr_model.pkl']

In [3]:
def calCatAge(catAge):
    if catAge == 0:
        return 0
    elif catAge == 1:
        return 15
    elif catAge == 2:
        return 24
    elif catAge >= 3:
        age = 24
        for x in range (3,catAge+1):
            age += 4
        return age

calCatAge(3)
            

28