# Lasso回归-根据多因素预测医疗费用

In [2]:
# 导入相关库
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
import numpy as np
import pandas as pd 
import seaborn as sns

In [3]:
# 导入数据
data=pd.read_csv("./data/insurance.csv")
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# 数据预处理
data.info()
#观察可知没有缺失数据

#采用独热码构造哑变量
data=pd.get_dummies(data,drop_first=True)
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


In [5]:
# 分离自变量和因变量
X=data.drop(["charges"],axis=1)
Y=data.loc[:,"charges"]
Y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [6]:
# 分离训练集和测试集
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3)

In [7]:
# 特征缩放
# scX=StandardScaler()
# scY=StandardScaler()
# x_train=scX.fit_transform(x_train)
# x_test=scX.transform(x_test)
# y_train=np.ravel(scY.fit_transform(y_train.values.reshape(-1,1)))

## sklearn.linear_model.Lasso 方法

``` python
sklearn.linear_model.Lasso(alpha=1.0, fit_intercept=True, normalize=False, 
                           precompute=False, copy_X=True, max_iter=1000, 
                           tol=1e-4, warm_start=False, positive=False, random_state=None, selection=’cyclic’)

```

[api详解](https://blog.csdn.net/TeFuirnever/article/details/100578650)

参数详解
> alpha：正则化系数，float类型，默认为1.0。正则化改善了问题的条件并减少了估计的方差。较大的值指定较强的正则化。
>
> fit_intercept：是否需要截距，bool类型，默认为True。也就是是否求解b。
>
> normalize：是否先进行归一化，bool类型，默认为False。如果为真，则回归X将在回归之前被归一化。 **已经被废弃的参数** 当fit_intercept设置为False时，将忽略此参数。 当回归量归一化时，注意到这使得超参数学习更加鲁棒，并且几乎不依赖于样本的数量。 相同的属性对标准化数据无效。然而，如果你想标准化，请在调用normalize = False训练估计器之前，使用preprocessing.StandardScaler处理数据。
>


## 改变超参数alpha 对比其对模型的影响

In [9]:
#alpha=10
print("alpha=10")
reg=Lasso(alpha=10, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)

alpha=10
r2评分 0.7517555090530597
MSE： 33932892.26082331
Y=228.01*age + 271.94*bmi + 267.25*children + 0.00*sex_male + 23030.83*smoker_yes + -0.00*region_northwest + -0.00*region_southeast + -0.00*region_southwest + -8868.814734428033


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


In [11]:
print("alpha=1")
reg=Lasso(alpha=1, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)

alpha=1
r2评分 0.7561723532416841
MSE： 33329147.551670946
Y=244.01*age + 320.12*bmi + 481.65*children + 28.07*sex_male + 23707.83*smoker_yes + -652.89*region_northwest + -846.98*region_southeast + -956.95*region_southwest + -10742.44922209232


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


In [12]:
print("alpha=0.1")
reg=Lasso(alpha=0.1, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)


alpha=0.1
r2评分 0.7559320349249938
MSE： 33361997.003908318
Y=245.46*age + 326.05*bmi + 502.06*children + 76.61*sex_male + 23771.39*smoker_yes + -849.03*region_northwest + -1065.35*region_southeast + -1158.29*region_southwest + -10884.35080210587


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


In [13]:
print("alpha=0.01")
reg=Lasso(alpha=0.01, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)

alpha=0.01
r2评分 0.7558999426591778
MSE： 33366383.741330717
Y=245.60*age + 326.64*bmi + 504.11*children + 81.47*sex_male + 23777.74*smoker_yes + -868.70*region_northwest + -1087.23*region_southeast + -1178.45*region_southwest + -10898.526302665643


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


In [14]:
print("alpha=0.001")
reg=Lasso(alpha=0.001, normalize = True, fit_intercept = True)
reg.fit(x_train,y_train)
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
print("r2评分",r2_score(y_test,y_pre))
print("MSE：",mean_squared_error(y_test, y_pre))
# 获取模型表达式
print("Y=",end="")
for i in range(len(X.columns)):
    print("{0:.2f}*{1} + ".format(reg.coef_[i],X.columns[i]),end="")
print(reg.intercept_)

alpha=0.001
r2评分 0.7558966501986559
MSE： 33366833.792438556
Y=245.62*age + 326.70*bmi + 504.31*children + 81.95*sex_male + 23778.38*smoker_yes + -870.66*region_northwest + -1089.42*region_southeast + -1180.47*region_southwest + -10899.943452831747


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


## 对比alpha=10、1、0.1、0.01、0.001的结果，同样发现不同的超参数alpha会对模型预测的准确度产生不同的影响