题目：

kaggle上的一道练习题，将所给的数据先进行预处理，再通过机器学习模型完成对数据集的学习和拟合，是分类精确度最高

大致分为：数据预处理和模型拟合两部分。

## 1、导入库

In [1]:
import xgboost as xgb
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import csv

## 2、数据预处理

In [2]:
# 定义载入数据函数（包括预处理在内）
def load_data(file_name, is_train):
    data = pd.read_csv(file_name)  # 数据文件路径
    print ('data.describe() = \n',data.describe()) # 打印各个数据的描述（均值、平均数、方差、中位数等）

    # 性别
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int) # 男性映射为1，女性为0

    # 补齐船票价格缺失值
    if len(data.Fare[data.Fare == 0]) > 0:  # 将船票价格为0的拿出来
        fare = np.zeros(3)
        for f in range(0, 3):               # 分别算出1、2、3等仓的中位数
            fare[f] = data[data.Pclass == f + 1]['Fare'].dropna().median()
        for f in range(0, 3):               # 将空缺船票的位置用该仓的中位数代替
            data.loc[(data.Fare.isnull()) & (data.Pclass == f + 1), 'Fare'] = fare[f]

    # 年龄：使用均值代替缺失值（比较垃圾）
    # mean_age = data['Age'].dropna().mean()
    # data.loc[(data.Age.isnull()), 'Age'] = mean_age
    
    # 年龄：使用随机森林预测年龄缺失值
    if is_train:
        
        # 年龄预测的预处理工作
        print ('随机森林预测缺失年龄：--start--')
        data_for_age = data[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        age_exist = data_for_age.loc[(data.Age.notnull())]   # 年龄不缺失的数据（训练集）
        age_null = data_for_age.loc[(data.Age.isnull())]     # 年龄为空的部分（测试集）
        # print (age_exist)
        x = age_exist.values[:, 1:]     # 除了年龄那一列的作为x
        y = age_exist.values[:, 0]      # 年龄作为y（需要预测）
        
        # 用随机森林做预测
        rfr = RandomForestRegressor(n_estimators=100)  
        rfr.fit(x, y)
        age_hat = rfr.predict(age_null.values[:, 1:])
        # print (age_hat)
        data.loc[(data.Age.isnull()), 'Age'] = age_hat  # 只要age为空就进行补充
        print ('随机森林预测缺失年龄：--over--')
    else:
        # 对于测试数据中没有'Survived'这一列的处理方法（和之前的方法是一样的）
        print ('随机森林预测缺失年龄2：--start--')
        data_for_age = data[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        age_exist = data_for_age.loc[(data.Age.notnull())]  # 年龄不缺失的数据
        age_null = data_for_age.loc[(data.Age.isnull())]
        # print (age_exist)
        x = age_exist.values[:, 1:]
        y = age_exist.values[:, 0]
        rfr = RandomForestRegressor(n_estimators=1000)
        rfr.fit(x, y)
        age_hat = rfr.predict(age_null.values[:, 1:])
        # print (age_hat)
        data.loc[(data.Age.isnull()), 'Age'] = age_hat   # 只要age为空就进行补充
        print ('随机森林预测缺失年龄2：--over--')

    # 起始城市（保存为4列，类似 one-hot 编码）
    data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S'  # 保留缺失出发城市
    # data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2, 'U': 0}).astype(int)
    # print (data['Embarked'])
    embarked_data = pd.get_dummies(data.Embarked)
    # print (embarked_data)
    # embarked_data = embarked_data.rename(columns={'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown', 'U': 'UnknownCity'})
    embarked_data = embarked_data.rename(columns=lambda x: 'Embarked_' + str(x))
    data = pd.concat([data, embarked_data], axis=1)
    print (data.describe())
    
    # 保存为一个新的文件 'New_Data.csv' 这是一个经过处理之后的文件
    data.to_csv('New_Data.csv')
    x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
    # x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    y = None
    if 'Survived' in data:
        y = data['Survived']

    # 将数据转换为numpy的格式，方便操作
    x = np.array(x)
    y = np.array(y)

    # 将 x 、 y拷贝5份作为训练数据，增加精确率
    x = np.tile(x, (5, 1))
    y = np.tile(y, (5, ))
    if is_train:
        return x, y
    return x, data['PassengerId']

## 3、使用模型学习

定义得分函数

In [3]:
def show_accuracy(a, b, tip):
    acc = a.ravel() == b.ravel()
    acc_rate = 100 * float(acc.sum()) / a.size
    # print ('%s正确率：%.3f%%' % (tip, acc_rate))
    return acc_rate

定义记录结果函数（这里没有用）

In [4]:
def write_result(c, c_type):
    file_name = '12.Titanic.test.csv'
    x, passenger_id = load_data(file_name, False)

    if type == 3:
        x = xgb.DMatrix(x)
    y = c.predict(x)
    y[y > 0.5] = 1
    y[~(y > 0.5)] = 0

    predictions_file = open("Prediction_%d.csv" % c_type, "wb")
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(passenger_id, y))
    predictions_file.close()

进行学习

In [5]:
# 读入数据
x, y = load_data('12.Titanic.train.csv', True)
# 分割数据集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=1)

data.describe() = 
        PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  
随机森林预测缺失年龄：--start--
随机森林预测缺失年龄：--over--
       PassengerId  

使用Logistic回归

In [6]:
# 创建模型，使用 l2 正则项
lr = LogisticRegression(penalty='l2') 
# 拟合模型
lr.fit(x_train, y_train)
# 预测数据
y_hat = lr.predict(x_test)
# 模型得分
lr_rate = show_accuracy(y_hat, y_test, 'Logistic回归 ')
# write_result(lr, 1)
print ('Logistic回归：%.3f%%' % lr_rate)

Logistic回归：79.129%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


使用随机森林

In [7]:
# 构造随机森林（20颗树）
rfc = RandomForestClassifier(n_estimators=20)
# 拟合模型
rfc.fit(x_train, y_train)
# 预测数据
y_hat = rfc.predict(x_test)
# 模型得分
rfc_rate = show_accuracy(y_hat, y_test, '随机森林 ')
# write_result(rfc, 2)
print ('随机森林：%.3f%%' % rfc_rate)

随机森林：97.397%


使用XGBoost

In [8]:
# XGBoost
data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
watch_list = [(data_test, 'eval'), (data_train, 'train')]
param = {'max_depth': 3, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic'}  # 将需要设置的参数设置一下
        # 'subsample': 1, 'alpha': 0, 'lambda': 0, 'min_child_weight': 1}

# 学习数据（打印出每次迭代之后的训练集合测试集上的误差）
bst = xgb.train(param, data_train, num_boost_round=500, evals=watch_list)  # 这里做500次迭代 
# 预测数据
y_hat = bst.predict(data_test)
# write_result(bst, 3)
y_hat[y_hat > 0.5] = 1
y_hat[~(y_hat > 0.5)] = 0
# 模型得分
xgb_rate = show_accuracy(y_hat, y_test, 'XGBoost ')
print ('XGBoost：%.3f%%' % xgb_rate)

[0]	eval-error:0.194345	train-error:0.15806
[1]	eval-error:0.194345	train-error:0.15806
[2]	eval-error:0.194345	train-error:0.15806
[3]	eval-error:0.194345	train-error:0.15806
[4]	eval-error:0.194345	train-error:0.15806
[5]	eval-error:0.192549	train-error:0.157611
[6]	eval-error:0.192549	train-error:0.157611
[7]	eval-error:0.185817	train-error:0.153121
[8]	eval-error:0.185817	train-error:0.153121
[9]	eval-error:0.185817	train-error:0.153121
[10]	eval-error:0.185817	train-error:0.153121
[11]	eval-error:0.185817	train-error:0.153121
[12]	eval-error:0.185817	train-error:0.153121
[13]	eval-error:0.185817	train-error:0.153121
[14]	eval-error:0.185817	train-error:0.153121
[15]	eval-error:0.185817	train-error:0.153121
[16]	eval-error:0.175494	train-error:0.152223
[17]	eval-error:0.175494	train-error:0.152223
[18]	eval-error:0.175494	train-error:0.152223
[19]	eval-error:0.175494	train-error:0.152223
[20]	eval-error:0.175494	train-error:0.152223
[21]	eval-error:0.175045	train-error:0.150427
[22

[178]	eval-error:0.112208	train-error:0.078581
[179]	eval-error:0.114004	train-error:0.07903
[180]	eval-error:0.114004	train-error:0.07903
[181]	eval-error:0.112208	train-error:0.078581
[182]	eval-error:0.113106	train-error:0.077683
[183]	eval-error:0.114004	train-error:0.07903
[184]	eval-error:0.114004	train-error:0.07903
[185]	eval-error:0.110862	train-error:0.077683
[186]	eval-error:0.109066	train-error:0.077234
[187]	eval-error:0.106373	train-error:0.075438
[188]	eval-error:0.106373	train-error:0.075438
[189]	eval-error:0.106373	train-error:0.075438
[190]	eval-error:0.108169	train-error:0.075887
[191]	eval-error:0.107271	train-error:0.07454
[192]	eval-error:0.107271	train-error:0.07454
[193]	eval-error:0.107271	train-error:0.07454
[194]	eval-error:0.107271	train-error:0.07454
[195]	eval-error:0.107271	train-error:0.07454
[196]	eval-error:0.109066	train-error:0.074989
[197]	eval-error:0.107271	train-error:0.07454
[198]	eval-error:0.107271	train-error:0.07454
[199]	eval-error:0.10727

[354]	eval-error:0.076302	train-error:0.047149
[355]	eval-error:0.076302	train-error:0.047149
[356]	eval-error:0.076302	train-error:0.047149
[357]	eval-error:0.074955	train-error:0.046251
[358]	eval-error:0.074955	train-error:0.046251
[359]	eval-error:0.074955	train-error:0.046251
[360]	eval-error:0.074955	train-error:0.046251
[361]	eval-error:0.078097	train-error:0.047598
[362]	eval-error:0.07675	train-error:0.0467
[363]	eval-error:0.07675	train-error:0.0467
[364]	eval-error:0.07675	train-error:0.0467
[365]	eval-error:0.07675	train-error:0.0467
[366]	eval-error:0.078097	train-error:0.047598
[367]	eval-error:0.078097	train-error:0.047598
[368]	eval-error:0.076302	train-error:0.047149
[369]	eval-error:0.076302	train-error:0.047149
[370]	eval-error:0.076302	train-error:0.047149
[371]	eval-error:0.076302	train-error:0.047149
[372]	eval-error:0.076302	train-error:0.047149
[373]	eval-error:0.076302	train-error:0.047149
[374]	eval-error:0.076302	train-error:0.047149
[375]	eval-error:0.076302

比较3种方法的结果

In [9]:
print ('Logistic回归：%.3f%%' % lr_rate)
print ('随机森林：%.3f%%' % rfc_rate)
print ('XGBoost：%.3f%%' % xgb_rate)

Logistic回归：79.129%
随机森林：97.397%
XGBoost：94.345%


总结：可见，随机森林（20颗树）和XGBoost（500次迭代）的分类准确率还是要优于Logistic回归的