In [66]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression,SGDRegressor,Ridge,LogisticRegression
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error,classification_report,roc_auc_score
import joblib

## 1.	决策树练习并调参，从而提升准确率，随机森林练习，结合网格搜索调参，找到最佳参数模型

In [2]:
# 获取数据
data = pd.read_csv("../data/titanic.txt")

In [5]:
# 查看数据信息，大致掌握数据格式
print(data.shape)
print(data.head())

(1313, 11)
   row.names pclass  survived  \
0          1    1st         1   
1          2    1st         0   
2          3    1st         0   
3          4    1st         0   
4          5    1st         1   

                                              name      age     embarked  \
0                     Allen, Miss Elisabeth Walton  29.0000  Southampton   
1                      Allison, Miss Helen Loraine   2.0000  Southampton   
2              Allison, Mr Hudson Joshua Creighton  30.0000  Southampton   
3  Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)  25.0000  Southampton   
4                    Allison, Master Hudson Trevor   0.9167  Southampton   

                         home.dest room      ticket   boat     sex  
0                     St Louis, MO  B-5  24160 L221      2  female  
1  Montreal, PQ / Chesterville, ON  C26         NaN    NaN  female  
2  Montreal, PQ / Chesterville, ON  C26         NaN  (135)    male  
3  Montreal, PQ / Chesterville, ON  C26         NaN    Na

In [6]:
# 数据处理，找出有用的特征,和目标
x = data[['pclass','age','sex']] # 特征包括阶级，年龄和性别
y = data['survived']  # 目标是是否存活

In [12]:
print(x.info()) # 可以看出age的数量比其他两个少，说明存在空值

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pclass  1313 non-null   object 
 1   age     633 non-null    float64
 2   sex     1313 non-null   object 
dtypes: float64(1), object(2)
memory usage: 30.9+ KB
None


In [13]:
# 对空值用均值进行替换
x.age.fillna(x.age.mean(),inplace=True)
print(x.info())  # 已经没有空值了

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pclass  1313 non-null   object 
 1   age     1313 non-null   float64
 2   sex     1313 non-null   object 
dtypes: float64(1), object(2)
memory usage: 30.9+ KB
None


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.age.fillna(x.age.mean(),inplace=True)


In [15]:
print(y)

0       1
1       0
2       0
3       0
4       1
       ..
1308    0
1309    0
1310    0
1311    0
1312    0
Name: survived, Length: 1313, dtype: int64


In [17]:
# 进行训练集和测试集的划分
x_train,x_test,y_train,y_test = train_test_split(x,y,
    test_size=0.25,random_state=2)

In [21]:
# 查看数据集情况
print(x_train.shape)
print(x_train.head())
print('-'* 30)
print(y_train.shape)
print(y_train.head())
print('-' * 30)
print(x_test.shape)
print(x_test.head())
print(y_test.shape)

(984, 3)
     pclass        age     sex
884     3rd  31.194181  female
944     3rd  31.194181    male
967     3rd  31.194181  female
1092    3rd  31.194181    male
1199    3rd  31.194181    male
------------------------------
(984,)
884     1
944     1
967     0
1092    1
1199    1
Name: survived, dtype: int64
------------------------------
(329, 3)
     pclass        age   sex
811     3rd  31.194181  male
681     3rd  40.000000  male
757     3rd  17.000000  male
1223    3rd  31.194181  male
846     3rd  31.194181  male
(329,)


In [22]:
# 机器学习只能处理数值型数据
# 训练和测试集样本中存在着类别
# 将类别用one-hot编码代替
# orient = record表示转换转换成字典形式，字典用列表表示[{}]
x_train = x_train.to_dict(orient='record')
x_test = x_test.to_dict(orient='record')
print(type(x_train))
print(type(x_test))

<class 'list'>
<class 'list'>


  x_train = x_train.to_dict(orient='record')
  x_test = x_test.to_dict(orient='record')


In [23]:
# 对字典特征进行提取
transer = DictVectorizer(sparse=False)
x_train = transer.fit_transform(x_train)
x_test = transer.transform(x_test)

In [25]:
print(x_train.shape)
print(x_test.shape)

(984, 6)
(329, 6)


In [73]:
# 利用决策树进行模型的训练与评估
# decision_tree = DecisionTreeClassifier() # 准确率为: 0.7963525835866262
# decision_tree = DecisionTreeClassifier(max_depth=10) # 准确率为: 0.8024316109422492
# decision_tree = DecisionTreeClassifier(max_depth=9)  # 准确率为: 0.8115501519756839
# decision_tree = DecisionTreeClassifier(max_depth=7)  # 准确率为: 0.817629179331307
# decision_tree = DecisionTreeClassifier(max_depth=6)  # 准确率为: 0.8267477203647416
decision_tree = DecisionTreeClassifier(max_depth=6,min_samples_leaf=5)  # 准确率为: 0.8328267477203647
decision_tree.fit(x_train,y_train)
score = decision_tree.score(x_test,y_test)
print(f'准确率为: {score}')

准确率为: 0.8328267477203647


In [71]:
# 决策树可视化
export_graphviz(decision_tree,out_file='tree1.dot',
                feature_names=transer.get_feature_names_out())

#### 随机森林

In [91]:
# 设置n_jobs可以利用多核
random_forest = RandomForestClassifier(n_jobs=-1)
# 设置超参数列表
# params = {"n_estimators": [1500, 2000,2500,3000,3500],
#           "max_depth": [3,5,6,9,15]} # 准确率为：0.8115501519756839
params = {"n_estimators": [200, 5000,1000,2000,5000],
          "max_depth": [2,3,5,6]}  # 准确率为：0.8115501519756839
params = {"n_estimators": [1800, 2000,2200,2300],
          "max_depth": [2,3,5,6,9],
          "min_samples_leaf":[2,3,4,5,6]} # 准确率为：0.8115501519756839

In [92]:
# 网格搜索与交叉验证
grid = GridSearchCV(random_forest,param_grid=params,cv=3)

In [93]:
# 开始训练
grid.fit(x_train,y_train)

In [94]:
# 模型评估
score = grid.score(x_test,y_test)
print(f'准确率为：{score}')

准确率为：0.8115501519756839


In [95]:
print(grid.best_score_)
print(grid.best_estimator_)

0.8292682926829268
RandomForestClassifier(max_depth=3, min_samples_leaf=2, n_estimators=1800,
                       n_jobs=-1)


In [96]:
print(grid.cv_results_)

{'mean_fit_time': array([1.05579766, 1.06435768, 1.2496082 , 1.230769  , 0.91090512,
       0.98815346, 1.27143343, 1.18744628, 0.909597  , 1.00792011,
       1.14660859, 1.29155493, 0.93133839, 1.06226325, 1.19724671,
       1.38422251, 1.01091218, 1.09328572, 1.16757313, 1.28598817,
       1.12477771, 1.0944488 , 1.23365943, 1.96006902, 2.25561579,
       1.69876933, 1.69054278, 1.79673934, 1.22642851, 1.37750729,
       1.30312459, 1.37959202, 1.07421231, 1.15625612, 1.32986641,
       1.27658168, 1.00412901, 1.03713266, 1.13420335, 1.29708139,
       1.01421054, 1.09885955, 1.21740675, 1.61272963, 1.06817166,
       1.1749324 , 1.27821541, 1.36108573, 0.97132333, 1.04906638,
       1.1572717 , 1.20429333, 1.0599854 , 1.0507253 , 1.18074878,
       1.20596051, 0.99224973, 1.20815078, 1.18525751, 1.22631367,
       0.98866463, 1.18441502, 1.19293133, 1.24517862, 0.98396126,
       1.1717333 , 1.36825911, 1.36068082, 1.08120235, 1.17082397,
       1.22596161, 1.39264838, 1.0048426 , 1

## 2.	默写决策树的ID3,C4.5和基尼系数公式，并说明特点，默写预剪枝的手法，默写线性回归正规方程的推导过程

* ID3:使用信息增益作为评价标准，它会倾向于取值较多的特征，而且只能对离散行特征进行描述
* C4.5:是基于ID3做出的改进，使用信息增益率最为评价的标准来选择特征，它可以处理连续的数值型特征，并且能够对缺失值进行处理，缺点是算法的效率低
* 预剪枝：指定树的高度，指定每一个结点所包含的最小样本数，指定结点熵的最小值
* 线性回归：$y = W_0+W_1*X_1+W_2*X_1+...W_n*X_1$

## 3.	完成线性回归的实战，梯度下降线性回归实战

In [95]:
# 利用线性回归预测房价
house_price = load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [24]:
print(house_price.data.shape)
print(house_price.target.shape)

(506, 13)
(506,)


In [96]:
# 数据集划分
x_train,x_test,y_train,y_test = train_test_split(
    house_price.data,house_price.target,
    test_size=0.25,random_state=22)

In [49]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(379, 13)
(379,)
(127, 13)
(127,)


In [97]:
# 进行标准化
stand = StandardScaler()
x_train = stand.fit_transform(x_train)
x_test = stand.transform(x_test)

In [101]:
# 因为数据集进行标准化之后是均值为0，方差1的数
# 但是目标值都是整数，所以要对目标值也进行标准化，使它们处于同一个量纲
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)
y_std = StandardScaler()
y_train = y_std.fit_transform(y_train)
y_test = y_std.transform(y_test)

In [61]:
print(y_train)
print('-' * 30)
print(y_test)

[[ 1.28168690e-01]
 [ 2.36912665e-01]
 [-1.65440042e-01]
 [-5.13420761e-01]
 [ 2.81414486e+00]
 [ 2.27042499e+00]
 [-2.32407967e-03]
 [ 6.28390973e-01]
 [-1.76314439e-01]
 [ 2.79239607e+00]
 [-1.03539184e+00]
 [ 5.30521396e-01]
 [-9.37522262e-01]
 [ 2.97726083e+00]
 [-1.98063234e-01]
 [ 2.30304818e+00]
 [-4.48174376e-01]
 [-1.16588461e+00]
 [-1.21942452e-01]
 [ 1.47659398e+00]
 [ 9.76371692e-01]
 [-4.80797568e-01]
 [-1.02451744e+00]
 [ 1.93415075e-01]
 [-8.39652684e-01]
 [-6.75704645e-02]
 [ 9.21999705e-01]
 [-1.11068054e-01]
 [-9.04899069e-01]
 [ 4.32651819e-01]
 [ 2.41179216e+00]
 [-8.72275877e-01]
 [-1.00193657e-01]
 [-6.43913530e-01]
 [ 9.55454975e-02]
 [-5.89541543e-01]
 [ 1.16123645e+00]
 [ 2.61840571e+00]
 [ 1.15036205e+00]
 [-2.95932811e-01]
 [-4.37299978e-01]
 [ 2.36912665e-01]
 [-2.95932811e-01]
 [-1.07888943e+00]
 [ 5.20479077e-02]
 [-2.85058414e-01]
 [ 2.97726083e+00]
 [ 2.58661459e-01]
 [ 4.10903024e-01]
 [ 2.97726083e+00]
 [-4.48174376e-01]
 [-1.67698129e+00]
 [ 7.3796702

In [62]:
# 利用线性回归算法进行训练
linear = LinearRegression()
linear.fit(x_train,y_train)

In [63]:
# 计算预测值
y_predict = linear.predict(x_test)

In [64]:
# 模型评估,计算损失函数
error = mean_squared_error(y_test,y_predict)
print(error) # 把目标值进行标准化之后，mse从22下降到0.24

0.24392554883406567


In [21]:
# 查看回归系数和偏置
# 回归系数：正值：随着x的增大，y也增加   负值：随着x的增大，y值减小
print(linear.coef_)
print(linear.intercept_)

[-0.39752555  0.92469847  0.13115792  0.37677572 -1.94801169  2.96568984
  0.02786307 -2.80712543  2.75718715 -2.25871899 -2.25102836  1.18245304
 -3.43204224]
22.980211081794216


In [65]:
# 查看具体的房价,因为目标值进行了标准化
# 所以得出的房价是标准化之后的房价
# 要转换成标准化之前房价才是真实的房价
predict_house_price = y_std.inverse_transform(y_predict)
print(predict_house_price)

[[28.22944896]
 [31.5122308 ]
 [21.11612841]
 [32.6663189 ]
 [20.0023467 ]
 [19.07315705]
 [21.09772798]
 [19.61400153]
 [19.61907059]
 [32.87611987]
 [20.97911561]
 [27.52898011]
 [15.54701758]
 [19.78630176]
 [36.88641203]
 [18.81202132]
 [ 9.35912225]
 [18.49452615]
 [30.66499315]
 [24.30184448]
 [19.08220837]
 [34.11391208]
 [29.81386585]
 [17.51775647]
 [34.91026707]
 [26.54967053]
 [34.71035391]
 [27.4268996 ]
 [19.09095832]
 [14.92742976]
 [30.86877936]
 [15.88271775]
 [37.17548808]
 [ 7.72101675]
 [16.24074861]
 [17.19211608]
 [ 7.42140081]
 [20.0098852 ]
 [40.58481466]
 [28.93190595]
 [25.25404307]
 [17.74970308]
 [38.76446932]
 [ 6.87996052]
 [21.80450956]
 [25.29110265]
 [20.427491  ]
 [20.4698034 ]
 [17.25330064]
 [26.12442519]
 [ 8.48268143]
 [27.50871869]
 [30.58284841]
 [16.56039764]
 [ 9.38919181]
 [35.54434377]
 [32.29801978]
 [21.81298945]
 [17.60263689]
 [22.0804256 ]
 [23.49262401]
 [24.10617033]
 [20.1346492 ]
 [38.5268066 ]
 [24.58319594]
 [19.78072415]
 [13.93429

In [73]:
# 保存模型
joblib.dump(linear,"./test.pkl")

In [75]:
# 加载模型
model = joblib.load("./test.pkl")
y_predict = model.predict(x_test)
real_predict = y_std.inverse_transform(y_predict)
print(real_predict)

[[28.22944896]
 [31.5122308 ]
 [21.11612841]
 [32.6663189 ]
 [20.0023467 ]
 [19.07315705]
 [21.09772798]
 [19.61400153]
 [19.61907059]
 [32.87611987]
 [20.97911561]
 [27.52898011]
 [15.54701758]
 [19.78630176]
 [36.88641203]
 [18.81202132]
 [ 9.35912225]
 [18.49452615]
 [30.66499315]
 [24.30184448]
 [19.08220837]
 [34.11391208]
 [29.81386585]
 [17.51775647]
 [34.91026707]
 [26.54967053]
 [34.71035391]
 [27.4268996 ]
 [19.09095832]
 [14.92742976]
 [30.86877936]
 [15.88271775]
 [37.17548808]
 [ 7.72101675]
 [16.24074861]
 [17.19211608]
 [ 7.42140081]
 [20.0098852 ]
 [40.58481466]
 [28.93190595]
 [25.25404307]
 [17.74970308]
 [38.76446932]
 [ 6.87996052]
 [21.80450956]
 [25.29110265]
 [20.427491  ]
 [20.4698034 ]
 [17.25330064]
 [26.12442519]
 [ 8.48268143]
 [27.50871869]
 [30.58284841]
 [16.56039764]
 [ 9.38919181]
 [35.54434377]
 [32.29801978]
 [21.81298945]
 [17.60263689]
 [22.0804256 ]
 [23.49262401]
 [24.10617033]
 [20.1346492 ]
 [38.5268066 ]
 [24.58319594]
 [19.78072415]
 [13.93429

利用梯度下降进行线性回归

In [149]:
# 利用随机梯度下降进行训练
# sgd = SGDRegressor(eta0=0.009,penalty='l2',alpha=0.003) # 0.2616131486429884
# sgd = SGDRegressor(eta0=0.009,penalty='l1',alpha=0.002) # 0.25727545072652397
# sgd = SGDRegressor(eta0=0.009,penalty='l1',alpha=0.0005) # 0.25517272482694303
# sgd = SGDRegressor(eta0=0.011,penalty='l1',alpha=0.0005) # 0.25308785644460946
sgd = SGDRegressor(eta0=0.011,penalty='l1',alpha=0.0005)

In [150]:
sgd.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


In [100]:
# 目标值未进行标准化 # 21.30716625912824
y_predict = sgd.predict(x_test)
error = mean_squared_error(y_test,y_predict)
print(error)

21.30716625912824


In [151]:
# 目标值进行标准化之后，mse下降了很多
y_predict = sgd.predict(x_test)
error = mean_squared_error(y_test,y_predict)
print(error)

0.25838861014262077


##  4.	L1和L2正则化区别，正则化力度和权重系数关系是？ 均方误差怎么算？描述一下

L1正则化: 自动却掉没有用的特征，使模型偏向于稀疏
L2正则化：是为了防止过拟合，使模型变得更加的平滑
正则化力度：alpha越大，参数越趋近于0，alpha越小，参数的变化也就越小
均方误差mse: $MSE = 1/n\sum_{i=0}^n(y^i-y^-)$