# 岭回归-根据多因素预测医疗费用


In [45]:
# 导入相关库
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
import numpy as np
import pandas as pd 
import seaborn as sns

In [9]:
# 导入数据
data=pd.read_csv("./data/insurance.csv")
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [10]:
# 数据预处理
data.info()
#观察可知没有缺失数据

#采用独热码构造哑变量
data=pd.get_dummies(data,drop_first=True)
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


In [11]:
# 分离自变量和因变量
X=data.drop(["charges"],axis=1)
Y=data.loc[:,"charges"]
Y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [51]:
# 分离训练集和测试集
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3)


In [52]:
# 特征缩放
# scX=StandardScaler()
# scY=StandardScaler()
# x_train=scX.fit_transform(x_train)
# x_test=scX.transform(x_test)
# y_train=np.ravel(scY.fit_transform(y_train.values.reshape(-1,1)))

## sklearn.linear_model.Ridge 方法

``` python
sklearn.linear_model.Ridge(alpha=1.0, fit_intercept=True,
                  normalize=False,copy_X=True, 
                  max_iter=None, tol=0.001, 
                  solver="auto",random_state=None)
```

[api详解](https://blog.csdn.net/TeFuirnever/article/details/100578650)

参数详解
> alpha：正则化系数，float类型，默认为1.0。正则化改善了问题的条件并减少了估计的方差。较大的值指定较强的正则化。
>
> fit_intercept：是否需要截距，bool类型，默认为True。也就是是否求解b。
>
> normalize：是否先进行归一化，bool类型，默认为False。如果为真，则回归X将在回归之前被归一化。 **已经被废弃的参数** 当fit_intercept设置为False时，将忽略此参数。 当回归量归一化时，注意到这使得超参数学习更加鲁棒，并且几乎不依赖于样本的数量。 相同的属性对标准化数据无效。然而，如果你想标准化，请在调用normalize = False训练估计器之前，使用preprocessing.StandardScaler处理数据。
>
> copy_X：是否复制X数组，bool类型，默认为True，如果为True，将复制X数组; 否则，它覆盖原数组X。
>
> max_iter：最大的迭代次数，int类型，默认为None，最大的迭代次数，对于sparse_cg和lsqr而言，默认次数取决于scipy.sparse.linalg，对于sag而言，则默认为1000次。
>
> tol：精度，float类型，默认为0.001。就是解的精度。
>
> solver：求解方法，str类型，默认为auto。可选参数为：auto、svd、cholesky、lsqr、sparse_cg、sag。
> - auto根据数据类型自动选择求解器。
> - svd使用X的奇异值分解来计算Ridge系数。对于奇异矩阵比cholesky更稳定。
> - cholesky使用标准的scipy.linalg.solve函数来获得闭合形式的解。
> - sparse_cg使用在scipy.sparse.linalg.cg中找到的共轭梯度求解器。作为迭代算法，这个求解器比大规模数据（设置tol和max_iter的可能性）的cholesky更合适。
> - lsqr使用专用的正则化最小二乘常数scipy.sparse.linalg.lsqr。它是最快的，但可能在旧的scipy版本不可用。它是使用迭代过程。
> - sag使用随机平均梯度下降。它也使用迭代过程，并且当n_samples和n_feature都很大时，通常比其他求解器更快。注意，sag快速收敛仅在具有近似相同尺度的特征上被保证。您可以使用sklearn.preprocessing的缩放器预处理数据。
>
> random_state：sag的伪随机种子。



In [53]:
? Ridge

## 改变超参数alpha 对比其对模型的影响

In [60]:
#alpha=10
print("alpha=10")
reg=Ridge(alpha=10, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)

alpha=10
r2评分 0.13194347043980448
MSE： 123755193.15624689
Y=22.64*age + 41.59*bmi + 42.73*children + 108.21*sex_male + 2133.46*smoker_yes + -56.15*region_northwest + 164.83*region_southeast + -153.64*region_southwest + 10725.366617506985


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


In [61]:
print("alpha=1")
reg=Ridge(alpha=1, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)

alpha=1
r2评分 0.569738946664595
MSE： 61340520.97979278
Y=124.02*age + 205.76*bmi + 245.51*children + 254.95*sex_male + 11742.01*smoker_yes + -66.00*region_northwest + 252.79*region_southeast + -609.79*region_southwest + -447.54938094380304


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


In [62]:
print("alpha=0.1")
reg=Ridge(alpha=0.1, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)


alpha=0.1
r2评分 0.7586897940033164
MSE： 34402588.00751572
Y=224.01*age + 350.51*bmi + 465.97*children + -149.29*sex_male + 21512.59*smoker_yes + -19.21*region_northwest + -630.16*region_southeast + -956.94*region_southwest + -10523.329173537537


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


In [63]:
print("alpha=0.01")
reg=Ridge(alpha=0.01, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)

alpha=0.01
r2评分 0.7664955713359761
MSE： 33289750.941446558
Y=243.38*age + 379.13*bmi + 512.89*children + -297.21*sex_male + 23481.70*smoker_yes + -78.89*region_northwest + -966.33*region_southeast + -1085.42*region_southwest + -12401.250130087168


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


In [64]:
print("alpha=0.001")
reg=Ridge(alpha=0.001, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)

alpha=0.001
r2评分 0.7667364171367959
MSE： 33255414.561745085
Y=245.49*age + 382.32*bmi + 518.15*children + -314.91*sex_male + 23698.95*smoker_yes + -89.08*region_northwest + -1008.65*region_southeast + -1103.00*region_southwest + -12604.573352457644


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


## 对比alpha=10、1、0.1、0.01、0.001的结果，发现不同的超参数alpha会对模型预测的准确度产生不同的影响