In [None]:
import numpy as np
import pandas as pd
# Load the insurance dataset from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer="insurance.csv")
data.head(n=5)

EDA数据探索

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram of the raw target.
plt.hist(data.charges)
plt.show()
# Linear regression presupposes that y is normally distributed,
# so also inspect the log-transformed target.
plt.hist(np.log(data.charges))

In [None]:
# Further exploration: compare the distribution of charges between sexes.
# NOTE(review): `shade=` is deprecated in favour of `fill=` in newer seaborn
# releases; it is kept because this notebook does not pin a seaborn version.
for sex in ('male', 'female'):
    sns.kdeplot(data.loc[data.sex == sex, 'charges'], shade=True, label=sex)
# Newer seaborn versions do not draw a legend for `label=` automatically, so
# request it explicitly; otherwise the two curves cannot be told apart.
plt.legend()
plt.show()

In [None]:
# Compare the distribution of charges across the four regions.
# NOTE(review): `shade=` is deprecated in favour of `fill=` in newer seaborn
# releases; it is kept because this notebook does not pin a seaborn version.
for region in ('northwest', 'southwest', 'northeast', 'southeast'):
    sns.kdeplot(data.loc[data.region == region, 'charges'], shade=True, label=region)
# Newer seaborn versions do not draw a legend for `label=` automatically, so
# request it explicitly; otherwise the curves cannot be told apart.
plt.legend()
plt.show()

In [None]:
# Compare the distribution of charges between smokers and non-smokers.
# NOTE(review): `shade=` is deprecated in favour of `fill=` in newer seaborn
# releases; it is kept because this notebook does not pin a seaborn version.
for smoker in ('yes', 'no'):
    sns.kdeplot(data.loc[data.smoker == smoker, 'charges'], shade=True, label=f'smoker {smoker}')
# Newer seaborn versions do not draw a legend for `label=` automatically, so
# request it explicitly; otherwise the two curves cannot be told apart.
plt.legend()
plt.show()


In [None]:
# Compare the distribution of charges by number of children (0 through 5).
# NOTE(review): `shade=` is deprecated in favour of `fill=` in newer seaborn
# releases; it is kept because this notebook does not pin a seaborn version.
for n_children in range(6):
    sns.kdeplot(
        data.loc[data.children == n_children, 'charges'],
        shade=True,
        label=f'children {n_children}',
    )
# Newer seaborn versions do not draw a legend for `label=` automatically, so
# request it explicitly; otherwise the six curves cannot be told apart.
plt.legend()
plt.show()


特征工程

In [None]:
# Drop the dimensions that have little influence on y.
data = data.drop(columns=['region', 'sex'])
data.head()

In [None]:
# For the BMI and children columns, turn the continuous values into discrete labels.
def greater(df, bmi, num_child):
    """Return a copy of one record with 'bmi' and 'children' replaced by labels.

    Parameters
    ----------
    df : mapping-like record (e.g. a DataFrame row or a dict) that has 'bmi'
        and 'children' entries and supports ``.copy()``.
    bmi : threshold; 'bmi' becomes 'over' when the value is >= ``bmi``,
        otherwise 'under'.
    num_child : reference count; 'children' becomes 'no' when the value
        equals ``num_child``, otherwise 'yes'.

    Returns
    -------
    A new record of the same type with the two entries relabelled. The record
    passed in is left untouched, so the function is safe to use with
    ``DataFrame.apply`` and can be called repeatedly on the same input.
    """
    # Work on a copy: mutating the object handed in by the caller is a hidden
    # side effect and makes the function non-idempotent.
    row = df.copy()
    row['bmi'] = 'over' if row['bmi'] >= bmi else 'under'
    row['children'] = 'no' if row['children'] == num_child else 'yes'
    return row

# Relabel every row: BMI threshold 30, reference child count 0.
data = data.apply(lambda row: greater(row, 30, 0), axis=1)
data


In [None]:
# One-hot encoding.
data = pd.get_dummies(data=data)
data

In [None]:
# Separate the features from the target and replace missing values with 0.
# fillna is used without inplace=True: `data['charges']` can share memory with
# the column inside `data`, so an in-place fill on `y` could silently modify
# `data` as well. Building new objects keeps `data` untouched.
x = data.drop(columns='charges').fillna(0)
y = data['charges'].fillna(0)
x.head()

模型训练

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Hold out 30% of the rows for testing. The seed is fixed so that a
# Restart & Run All reproduces the same split and the same metrics.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Expand the features with all degree-2 polynomial and interaction terms.
Poly_features = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = Poly_features.fit_transform(x_train)
# The transformer is fitted on the training data only; the test set is merely
# transformed so that nothing is learned from held-out rows.
x_test_poly = Poly_features.transform(x_test)


In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor

# Three candidate regressors, all with default hyper-parameters.
reg = LinearRegression()
ridge = Ridge()
# Seed the boosted model so repeated runs of the notebook give the same fit.
booster = GradientBoostingRegressor(random_state=42)

In [None]:
# All three models are trained on the log1p-transformed target.
log_y_train = np.log1p(y_train)
reg.fit(x_train_poly, log_y_train)
ridge.fit(x_train_poly, log_y_train)
booster.fit(x_train_poly, log_y_train)

In [None]:
# Test-set predictions of each model (still on the log1p scale of the target).
y_predict, y_predict_ridge, y_predict_booster = (
    model.predict(x_test_poly) for model in (reg, ridge, booster)
)


模型评估

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate the plain linear regression.
# The model was trained on log1p(charges), so its predictions live on the
# log1p scale. The exact inverse of log1p is expm1 (exp(x) - 1); using plain
# exp would shift every prediction by +1 on the original scale.
train_pred_log = reg.predict(x_train_poly)
log_rmse_train = np.sqrt(mean_squared_error(y_true=np.log1p(y_train), y_pred=train_pred_log))
log_rmse_test = np.sqrt(mean_squared_error(y_true=np.log1p(y_test), y_pred=y_predict))
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=np.expm1(train_pred_log)))
rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=np.expm1(y_predict)))
log_rmse_train, log_rmse_test, rmse_train, rmse_test

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate the ridge regression.
# The model was trained on log1p(charges), so its predictions live on the
# log1p scale. The exact inverse of log1p is expm1 (exp(x) - 1); using plain
# exp would shift every prediction by +1 on the original scale.
train_pred_log = ridge.predict(x_train_poly)
log_rmse_train = np.sqrt(mean_squared_error(y_true=np.log1p(y_train), y_pred=train_pred_log))
log_rmse_test = np.sqrt(mean_squared_error(y_true=np.log1p(y_test), y_pred=y_predict_ridge))
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=np.expm1(train_pred_log)))
rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=np.expm1(y_predict_ridge)))
log_rmse_train, log_rmse_test, rmse_train, rmse_test

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate the gradient boosting regressor.
# The model was trained on log1p(charges), so its predictions live on the
# log1p scale. The exact inverse of log1p is expm1 (exp(x) - 1); using plain
# exp would shift every prediction by +1 on the original scale.
train_pred_log = booster.predict(x_train_poly)
log_rmse_train = np.sqrt(mean_squared_error(y_true=np.log1p(y_train), y_pred=train_pred_log))
log_rmse_test = np.sqrt(mean_squared_error(y_true=np.log1p(y_test), y_pred=y_predict_booster))
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=np.expm1(train_pred_log)))
rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=np.expm1(y_predict_booster)))
log_rmse_train, log_rmse_test, rmse_train, rmse_test