<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#随机森林+网格搜索，线下CV得分3.6857" data-toc-modified-id="随机森林+网格搜索，线下CV得分3.6857-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>随机森林+网格搜索，线下CV得分3.6857</a></span><ul class="toc-item"><li><span><a href="#特征筛选-相关系数筛选" data-toc-modified-id="特征筛选-相关系数筛选-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>特征筛选-相关系数筛选</a></span></li><li><span><a href="#参数调优-网格搜索" data-toc-modified-id="参数调优-网格搜索-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>参数调优-网格搜索</a></span></li></ul></li></ul></div>

# 随机森林+网格搜索，线下CV得分3.6857

In [1]:
"""
特征组合：Dict+GroupBy
特征选择方式：Pearson
参数寻优办法：GridSearch
模型：randomforest
"""

'\n特征组合：Dict+GroupBy\n特征选择方式：Pearson\n参数寻优办法：GridSearch\n模型：randomforest\n'

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


def read_data(debug=True, data_dir="preprocess"):
    """
    Load the "dict" and "groupby" feature tables and join them on ``card_id``.

    For each of the train and test splits, ``<split>_dict.csv`` and
    ``<split>_groupby.csv`` are read from ``data_dir``. Columns of the groupby
    table that also exist in the dict table (other than ``card_id``) are
    dropped, the two tables are left-joined on ``card_id`` with the dict table
    on the left, and every missing value in the result is replaced by 0.

    :param debug: when True only the first 10000 rows of each CSV are read,
        which greatly shortens debugging runs; when False all rows are read
    :param data_dir: directory that contains the four CSV files
    :return: tuple ``(train, test)`` of merged DataFrames
    """
    print("read_data...")
    nrows = 10000 if debug else None  # nrows=None makes read_csv load every row

    def _load_split(split):
        # Read both feature tables of one split and merge them on card_id.
        dict_part = pd.read_csv(f"{data_dir}/{split}_dict.csv", nrows=nrows)
        groupby_part = pd.read_csv(f"{data_dir}/{split}_groupby.csv", nrows=nrows)

        # Drop columns already provided by the dict table so the merge does not
        # produce suffixed duplicates; card_id is kept because it is the join key.
        already_present = set(dict_part.columns) - {'card_id'}
        duplicated = [col for col in groupby_part.columns if col in already_present]
        groupby_part = groupby_part.drop(columns=duplicated)

        return pd.merge(dict_part, groupby_part, how='left', on='card_id').fillna(0)

    train = _load_split("train")
    test = _load_split("test")
    print("done")
    return train, test


## 特征筛选-相关系数筛选

In [3]:
def feature_select_pearson(train, test, top_n=300):
    """
    Select the features whose Pearson correlation with ``target`` is strongest.

    Every column of ``train`` except ``card_id`` and ``target`` is a candidate.
    Candidates whose share of missing values in ``train`` is 0.99 or more are
    discarded; the rest are ranked by the absolute Pearson correlation with
    ``target`` (missing values are treated as 0 for this computation) and the
    first ``top_n`` are kept.

    :param train: training DataFrame containing ``card_id``, ``target`` and features
    :param test: test DataFrame containing ``card_id`` and the same features
    :param top_n: number of top-ranked features to keep; ``None`` keeps all of them
    :return: tuple ``(train_selected, test_selected)``; both start with
        ``card_id`` followed by the selected features in ranking order, and the
        training frame additionally ends with ``target``. Both are independent
        copies, so callers can add columns to them without affecting the inputs.
    """
    print('feature_select...')
    candidates = [col for col in train.columns.tolist() if col not in ("card_id", "target")]

    # Discard features that are missing in at least 99% of the training rows.
    kept = [col for col in candidates if not train[col].isnull().mean() >= 0.99]

    # Absolute Pearson correlation of every remaining feature with the target.
    # .corr() returns a 2x2 matrix; element [0][1] is the feature/target entry.
    corr = [abs(train[[col, 'target']].fillna(0).corr().values[0][1]) for col in kept]

    # Rank the features from the strongest to the weakest correlation.
    ranking = pd.Series(corr, index=kept, dtype=float).sort_values(ascending=False)
    feature_select = ['card_id'] + ranking[:top_n].index.tolist()
    print('done')

    # Return explicit copies: a plain column selection is flagged by pandas as a
    # possible view, and assigning a new column to it later (as the prediction
    # step does with test['target']) raises SettingWithCopyWarning.
    return train[feature_select + ['target']].copy(), test[feature_select].copy()


## 参数调优-网格搜索

<font color=red>网格搜索同样是通过交叉验证来确定最优参数</font>（这里使用5折交叉验证，评分指标为负的均方误差 `neg_mean_squared_error`，因此输出的得分为负数，越接近0越好）

In [6]:
def param_grid_search(train):
    """
    Tune a random-forest regressor with an exhaustive grid search.

    All columns of ``train`` except ``card_id`` and ``target`` are used as
    features. Every parameter combination is evaluated with 5-fold cross
    validation and scored by negative mean squared error; the mean score and
    twice its standard deviation are printed for each combination.

    :param train: training DataFrame containing ``card_id``, ``target`` and features
    :return: the best estimator found by the search (``best_estimator_``)
    """
    print('param_grid_search')
    feature_cols = train.columns.tolist()
    for excluded in ("card_id", "target"):
        feature_cols.remove(excluded)

    # Only max_features is actually varied; the other entries are fixed values.
    search_space = dict(
        n_estimators=[80],
        min_samples_leaf=[30],
        min_samples_split=[2],
        max_depth=[9],
        max_features=['sqrt', 'log2', 100],
    )

    print("Tuning hyper-parameters for mse")
    forest = RandomForestRegressor(
        criterion="squared_error",
        min_weight_fraction_leaf=0.,  # like min_samples_leaf, but as a fraction of total sample weight
        max_leaf_nodes=None,
        min_impurity_decrease=0.,  # a node is split only if it reduces impurity by at least this much
        bootstrap=True,
        oob_score=False,
        n_jobs=8,
        random_state=2020,
        verbose=0,
        warm_start=False,
    )
    searcher = GridSearchCV(
        estimator=forest,
        param_grid=search_space,
        scoring="neg_mean_squared_error",
        cv=5,
    )
    searcher.fit(train[feature_cols].values, train['target'].values)

    print("best_params_:")
    print(searcher.best_params_)
    print('====================================================================')

    # Report the cross-validated mean score and spread of every combination.
    results = searcher.cv_results_
    report_rows = zip(results["mean_test_score"], results["std_test_score"], results["params"])
    for position, (mean, std, params) in enumerate(report_rows, start=1):
        print(f'第{position}组参数:')
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    return searcher.best_estimator_


In [7]:
def train_predict(train, test, best_clf, n_splits=5):
    """
    Fit the model with K-fold cross validation and write the predictions to disk.

    For every fold the estimator is refit on the training part, its RMSE on the
    held-out part is printed, and its prediction for the whole test set is
    accumulated. Three files are written:

    * ``preprocess/train_randomforest.csv`` - out-of-fold predictions for the
      training rows, in the row order of ``train``
    * ``preprocess/test_randomforest.csv`` - test predictions averaged over folds
    * ``result/submission_randomforest.csv`` - ``card_id`` plus averaged prediction

    :param train: training DataFrame containing ``card_id``, ``target`` and features
    :param test: test DataFrame containing ``card_id`` and the same features;
        it is not modified
    :param best_clf: estimator exposing ``fit(X, y)`` and ``predict(X)``, e.g.
        the regressor returned by the grid search
    :param n_splits: number of cross-validation folds
    :return: None
    """
    import os

    print('train_predict...')
    features = [col for col in train.columns.tolist() if col not in ("card_id", "target")]

    # Create the output directories up front so that a missing folder cannot
    # make the run fail only after all folds have been trained.
    os.makedirs("preprocess", exist_ok=True)
    os.makedirs("result", exist_ok=True)

    # Work on plain arrays: KFold yields positional indices, so positional array
    # indexing stays correct whatever index the DataFrame carries.
    x_train = train[features].values
    y_train = train['target'].values
    x_test = test[features].values

    prediction_test = np.zeros(len(test))
    prediction_train = np.zeros(len(train))  # out-of-fold predictions, by row position
    cv_score = []

    kf = KFold(n_splits=n_splits, random_state=2020, shuffle=True)
    for i, (train_part_index, eval_index) in enumerate(kf.split(x_train, y_train)):
        best_clf.fit(x_train[train_part_index], y_train[train_part_index])
        prediction_test += best_clf.predict(x_test)  # test-set prediction, independent of the held-out fold
        eval_pre = best_clf.predict(x_train[eval_index])

        # Evaluation metric: RMSE on the held-out fold.
        score = np.sqrt(mean_squared_error(y_train[eval_index], eval_pre))
        cv_score.append(score)
        print(score)

        # Each training row is held out exactly once, so after the last fold
        # this array holds a prediction for every training row.
        prediction_train[eval_index] = eval_pre
        print(f'第{i}次结束')
    print(cv_score, sum(cv_score) / n_splits)

    prediction_test = prediction_test / n_splits  # average over the fold models

    # Save the out-of-fold and averaged test predictions.
    pd.Series(prediction_train).to_csv("preprocess/train_randomforest.csv", index=False)
    pd.Series(prediction_test).to_csv("preprocess/test_randomforest.csv", index=False)

    # Build the submission in a separate frame instead of adding a column to the caller's test frame.
    submission = test[['card_id']].copy()
    submission['target'] = prediction_test
    submission.to_csv("result/submission_randomforest.csv", index=False)
    return


if __name__ == "__main__":

    # Load the merged training and test sets (debug=False reads all rows)
    train, test = read_data(debug=False)

    # Keep only the features chosen by the Pearson-correlation selection
    train, test = feature_select_pearson(train, test)

    # Grid-search the random-forest hyper-parameters and get the best estimator
    best_clf = param_grid_search(train)

    # Cross-validate, predict and write the prediction / submission files
    train_predict(train, test, best_clf)
# Presumably the result of an earlier run (per-fold scores, then their mean):
# [3.6952175995861753, 3.653405245049519, 3.711542672510601, 3.78859477721067, 3.586786511640954] 3.687109361199584


read_data...
done
feature_select...
done
param_grid_search
Tuning hyper-parameters for mse
best_params_:
{'max_depth': 9, 'max_features': 100, 'min_samples_leaf': 30, 'min_samples_split': 2, 'n_estimators': 80}
第1组参数:
-13.720 (+/-0.110) for {'max_depth': 9, 'max_features': 'sqrt', 'min_samples_leaf': 30, 'min_samples_split': 2, 'n_estimators': 80}
第2组参数:
-13.862 (+/-0.115) for {'max_depth': 9, 'max_features': 'log2', 'min_samples_leaf': 30, 'min_samples_split': 2, 'n_estimators': 80}
第3组参数:
-13.631 (+/-0.087) for {'max_depth': 9, 'max_features': 100, 'min_samples_leaf': 30, 'min_samples_split': 2, 'n_estimators': 80}
train_predict...
3.6900584289655423
第0次结束


  prediction_train = pd.concat([prediction_train,pd.Series(best_clf.predict(train[features].loc[eval_index]),


3.650674457367465
第1次结束




3.713631059161725
第2次结束




3.7894171753660055
第3次结束




3.5846883710434545
第4次结束
[3.6900584289655423, 3.650674457367465, 3.713631059161725, 3.7894171753660055, 3.5846883710434545] 3.685693898380839


