# 岭回归-根据多因素预测医疗费用


In [45]:
# 导入相关库
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
import numpy as np
import pandas as pd 
import seaborn as sns

In [9]:
# 导入数据
data=pd.read_csv("./data/insurance.csv")
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [10]:
# 数据预处理
data.info()
#观察可知没有缺失数据

#采用独热码构造哑变量
data=pd.get_dummies(data,drop_first=True)
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


In [11]:
# 分离自变量和因变量
X=data.drop(["charges"],axis=1)
Y=data.loc[:,"charges"]
Y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [76]:
# 分离训练集和测试集
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3)


In [77]:
# 特征缩放
# scX=StandardScaler()
# scY=StandardScaler()
# x_train=scX.fit_transform(x_train)
# x_test=scX.transform(x_test)
# y_train=np.ravel(scY.fit_transform(y_train.values.reshape(-1,1)))

## sklearn.linear_model.Ridge 方法

``` python
sklearn.linear_model.Ridge(alpha=1.0, fit_intercept=True,
                  normalize=False,copy_X=True, 
                  max_iter=None, tol=0.001, 
                  solver="auto",random_state=None)
```

[api详解](https://blog.csdn.net/TeFuirnever/article/details/100578650)

参数详解
> alpha：正则化系数，float类型，默认为1.0。正则化改善了问题的条件并减少了估计的方差。较大的值指定较强的正则化。
>
> fit_intercept：是否需要截距，bool类型，默认为True。也就是是否求解b。
>
> normalize：是否先进行归一化，bool类型，默认为False。如果为真，则回归X将在回归之前被归一化。 **已经被废弃的参数** 当fit_intercept设置为False时，将忽略此参数。 当回归量归一化时，注意到这使得超参数学习更加鲁棒，并且几乎不依赖于样本的数量。 相同的属性对标准化数据无效。然而，如果你想标准化，请在调用normalize = False训练估计器之前，使用preprocessing.StandardScaler处理数据。
>
> copy_X：是否复制X数组，bool类型，默认为True，如果为True，将复制X数组; 否则，它覆盖原数组X。
>
> max_iter：最大的迭代次数，int类型，默认为None，最大的迭代次数，对于sparse_cg和lsqr而言，默认次数取决于scipy.sparse.linalg，对于sag而言，则默认为1000次。
>
> tol：精度，float类型，默认为0.001。就是解的精度。
>
> solver：求解方法，str类型，默认为auto。可选参数为：auto、svd、cholesky、lsqr、sparse_cg、sag。
> - auto根据数据类型自动选择求解器。
> - svd使用X的奇异值分解来计算Ridge系数。对于奇异矩阵比cholesky更稳定。
> - cholesky使用标准的scipy.linalg.solve函数来获得闭合形式的解。
> - sparse_cg使用在scipy.sparse.linalg.cg中找到的共轭梯度求解器。作为迭代算法，这个求解器比大规模数据（设置tol和max_iter的可能性）的cholesky更合适。
> - lsqr使用专用的正则化最小二乘常数scipy.sparse.linalg.lsqr。它是最快的，但可能在旧的scipy版本不可用。它是使用迭代过程。
> - sag使用随机平均梯度下降。它也使用迭代过程，并且当n_samples和n_feature都很大时，通常比其他求解器更快。注意，sag快速收敛仅在具有近似相同尺度的特征上被保证。您可以使用sklearn.preprocessing的缩放器预处理数据。
>
> random_state：sag的伪随机种子。



In [53]:
? Ridge

## 改变超参数alpha 对比其对模型的影响

In [78]:
#alpha=10
print("alpha=10")
reg=Ridge(alpha=10, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)

alpha=10
r2评分 0.13531841086523477
MSE： 129517393.94808142
Y=26.00*age + 35.49*bmi + 48.82*children + 41.62*sex_male + 2153.66*smoker_yes + -78.10*region_northwest + 154.08*region_southeast + -78.58*region_southwest + 10508.510489227036


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


In [79]:
print("alpha=1")
reg=Ridge(alpha=1, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)

alpha=1
r2评分 0.5805838487927986
MSE： 62822763.392531164
Y=137.74*age + 174.14*bmi + 250.13*children + 39.73*sex_male + 11826.83*smoker_yes + -173.58*region_northwest + 221.34*region_southeast + -347.34*region_southwest + -134.66484351204235


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


In [80]:
print("alpha=0.1")
reg=Ridge(alpha=0.1, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)


alpha=0.1
r2评分 0.7662106615700421
MSE： 35018423.2285937
Y=240.89*age + 300.29*bmi + 424.36*children + -293.28*sex_male + 21578.48*smoker_yes + -292.81*region_northwest + -783.59*region_southeast + -841.36*region_southwest + -9520.548136698832


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


In [81]:
print("alpha=0.01")
reg=Ridge(alpha=0.01, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)

alpha=0.01
r2评分 0.772637700645907
MSE： 34055741.28605177
Y=260.11*age + 327.00*bmi + 458.40*children + -403.42*sex_male + 23532.82*smoker_yes + -438.93*region_northwest + -1208.32*region_southeast + -1082.84*region_southwest + -11237.729184035736


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


In [82]:
print("alpha=0.001")
reg=Ridge(alpha=0.001, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)

alpha=0.001
r2评分 0.772704767623783
MSE： 34045695.57638983
Y=262.20*age + 330.03*bmi + 462.24*children + -416.53*sex_male + 23748.31*smoker_yes + -461.38*region_northwest + -1263.54*region_southeast + -1116.25*region_southwest + -11422.531196408327


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


## 对比alpha=10、1、0.1、0.01、0.001的结果，发现不同的超参数alpha会对模型预测的准确度产生不同的影响