# 银行产品推荐

* Content：桑坦德银行产品推荐,使用xgboost/catboost预测2016年6月用户会购买的新产品,评估指标为MAP@7
* Author:  HuiHui
* Date:    2020-04-23
* Reference:https://www.kaggle.com/c/santander-product-recommendation/data
* 数据集：桑坦德银行2015.1.28-2016.5.28的客户行为数据
    * ncodpers: 客户代码

In [1]:
from google.colab import drive
drive.mount('/content/gdrive') #挂载网盘

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import os
os.chdir("/content/gdrive/My Drive/RS/Santander Product Recommendation") #改变当前工作目录到指定的路径

## 安装依赖包

In [0]:
!pip install -q tqdm
!pip install -q xgboost

## 导入需要的库

In [0]:
import gc
from tqdm import tqdm
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb

## 获取数据

In [10]:
# 加载全量样本
train = pd.read_csv('./santander-product-recommendation/train_ver2.csv',dtype = {'age': 'str', 'antiguedad': 'str','renta': 'str'})
test = pd.read_csv('./santander-product-recommendation/test_ver2.csv',dtype = {'age': 'str', 'antiguedad': 'str','renta': 'str'})

# # 加载小样本
# train = pd.read_csv('./santander-product-recommendation-small/train.csv',dtype = {'age': 'str', 'antiguedad': 'str','renta': 'str'})
# test = pd.read_csv('./santander-product-recommendation-small/test.csv',dtype = {'age': 'str', 'antiguedad': 'str','renta': 'str')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


##数据清洗

In [11]:
# test、train数据拼接
train['origin'] = 'train'
test['origin'] = 'test'
matrix = pd.concat([train,test], ignore_index=True, sort=False) 

del train, test
gc.collect() #清理内存

# 数据初探
# matrix.head()
# matrix.info()
# pd.set_option('display.max_columns', None) # 强制输出所有属性值
# print(matrix.describe(include='all'))

# （1）特征初筛
# 初步筛选出相关性高的特征列
mapping_dict = {
'sexo'          : {'nan':0,'H':0, 'V':1},
'ind_actividad_cliente' : {'nan':0, '0.0':0, '0':0,'1.0':1, '1':1},
'segmento'      : {'nan':0, '01 - TOP':0, '03 - UNIVERSITARIO':1, '02 - PARTICULARES':2},
'ind_nuevo'     : {'nan':0, '1.0':0, '1':0,  '0.0':1, '0':1 },
'tiprel_1mes'   : {'nan':0, 'P':0, 'R':0, 'N':0, 'I':1, 'A':2},
'indext'        : {'nan':0, 'S':0, 'N':1}
} # 分类特征值转换

target_raw_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',  'ind_cder_fin_ult1',
                   'ind_cno_fin_ult1',  'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
                   'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
                   'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',  'ind_plan_fin_ult1',
                   'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
                   'ind_viv_fin_ult1',  'ind_nomina_ult1',   'ind_nom_pens_ult1', 'ind_recibo_ult1']

target_cols = target_raw_cols[2:] # 储蓄账户及担保不算target？？？
con_cols = ['ncodpers', 'fecha_dato', 'age', 'antiguedad','renta']
cat_cols = list(mapping_dict.keys())
user_cols = con_cols + cat_cols + target_raw_cols +["origin"] # 通用特征列+分类特征列+目标列
print(user_cols)
matrix=matrix[user_cols]

# （2）处理缺失值
# 随机抽取数据集中的10条数据观察
# np.random.seed(0) 
# train.sample(10) 
# 统计缺失值的数量
missing_values_count = matrix.isnull().sum() 
# print(missing_values_count)
# 缺失值的数目占总数的百分比
total_cells = np.product(matrix.shape)
total_missing = missing_values_count.sum()
print((total_missing/total_cells) * 100)
# 分析出现缺失值的原因:没有被记录or根本不存在？--> 剔除缺失值/补全缺失值
# 中位数补充renta
# matrix["renta"].fillna(matrix["renta"].median(),inplace=True)
# 其余缺失值在格式化时补全

# （3）数据格式化
matrix['fecha_dato'] = pd.to_datetime(matrix['fecha_dato'],format = "%Y/%m/%d",errors='coerce') # r日期转换错误的会设置为NaN,可后期手动转换
# print(matrix['fecha_dato'].loc[matrix['fecha_dato'].isnull()]) #查看是否有没有成功转换的日期

def getAge(str_age):
  age = str_age.strip()
  if age == 'NA' or age == 'nan':
    age1 = 2
  elif float(age) < 20:
    age1 = 0
  elif float(age) < 30:
    age1 = 1
  elif float(age) < 40:
    age1 = 2
  elif float(age) < 50:
    age1 = 3
  elif float(age) < 60:
    age1 = 4
  else:
    age1 =  5
  return age1
matrix['age'] = matrix['age'].apply(lambda x: getAge(x))

def getCustSeniority(str_seniority):
  cust_seniority = str_seniority.strip()
  if cust_seniority == 'NA' or cust_seniority == 'nan':
    seniority = 4
  elif float(cust_seniority) < 50:
    seniority = 0
  elif float(cust_seniority) < 75:
    seniority = 1
  elif float(cust_seniority) < 100:
    seniority = 2
  elif float(cust_seniority) < 125:
    seniority = 3
  elif float(cust_seniority) < 150:
    seniority = 4
  elif float(cust_seniority) < 175:
    seniority = 5
  elif float(cust_seniority) < 200:
    seniority = 6
  elif float(cust_seniority) < 225:
    seniority = 7
  else:
    seniority = 8
  return seniority
matrix['antiguedad'] = matrix['antiguedad'].apply(lambda x: getCustSeniority(x))

def getRent(rent):
  rent=str(rent)
  rent = rent.strip() # 把字符串(str)的头和尾的空格，以及位于头尾的\n \t之类给删掉
  if rent == 'NA' or rent == 'nan':
    rent1 = 4
  elif float(rent) < 45542.97:
    rent1 = 1
  elif float(rent) < 57629.67:
    rent1 = 2
  elif float(rent) < 68211.78:
    rent1 = 3
  elif float(rent) < 78852.39:
    rent1 = 4
  elif float(rent) < 90461.97:
    rent1 = 5
  elif float(rent) < 103855.23:
    rent1 = 6
  elif float(rent) < 120063.00:
    rent1 = 7
  elif float(rent) < 141347.49:
    rent1 = 8
  elif float(rent) < 173418.12:
    rent1 = 9
  elif float(rent) < 234687.12:
    rent1 = 10
  else:
    rent1 = 11
  return rent1
matrix['renta'] = matrix['renta'].apply(lambda x: getRent(x))

for col in cat_cols:
  matrix[col] = matrix[col].apply(lambda x: mapping_dict[col][str(x)])

for col in  target_raw_cols:
  matrix[col].fillna(0, inplace=True)
matrix.info()

['ncodpers', 'fecha_dato', 'age', 'antiguedad', 'renta', 'sexo', 'ind_actividad_cliente', 'segmento', 'ind_nuevo', 'tiprel_1mes', 'indext', 'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1', 'origin']
4.876370198388753
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14576924 entries, 0 to 14576923
Data columns (total 36 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   ncodpers               int64         
 1   fecha_dato             datetime64[ns]
 2   age                    int64         
 3   a

## 数据探索（可视化）

In [0]:
# 样本数据是否均衡

## 特征工程  
* 特征构建、特征生成、特征选择

In [12]:
# 特征构建：用2015年1-5月产品购买记录和6月公共特征作为训练数据X_train，2015年6月作为y_train_sets(24个目标列);用2016年1-5月产品购买记录和6月公共特征作为测试数据X_test,预测2016年6月购买产品情况
'''
后续可考虑构建这些特征
ind_(xyz)_ult1_last：产品的最后一个月指数（滞后1）
ind_(xyz)_ult1_00：索引从0到0到上个月的过渡次数
ind_(xyz)_ult1_01：索引从0到1到上个月的过渡次数
ind_(xyz)_ult1_10：索引从1到0到上个月的过渡次数
ind_(xyz)_ult1_11：索引从1到1到上个月的过渡次数
ind_(xyz)_ult1_0len：直到上个月的连续0索引的长度
products_last：产品上月指数的串联
n_products_last：上个月购买的产品数量
'''

# 处理训练数据及测试数据
# 返回X_TRAIN,y_train_sets,X_test
def process_data(matrix): 
  # 全量样本时
  train_date_list=['2015-06-28', '2015-05-28', '2015-04-28','2015-03-28', '2015-02-28', '2015-01-28']
  test_date_list=['2016-06-28','2016-05-28', '2016-04-28', '2016-03-28', '2016-02-28', '2016-01-28']

  # # 小样本时（小样本中训练数据到2016年4月，测试数据为2016年5月，因此稍作修改，最终预测的结果为2016年5月购买新产品情况）
  # train_date_list=['2015-05-28', '2015-04-28','2015-03-28', '2015-02-28', '2015-01-28']
  # test_date_list=['2016-05-28', '2016-04-28', '2016-03-28', '2016-02-28', '2016-01-28']

  train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
  test_data = matrix[matrix['origin'] == 'test'].drop(target_raw_cols+['origin'], axis=1)

  # 处理训练数据
  this_month = train_data[train_data['fecha_dato'].isin([train_date_list[0]])] #选取2015-06月（当月）数据
  hist_data = train_data.loc[:,['ncodpers','fecha_dato'] + target_raw_cols] #产品历史购买记录

  # 加入产品历史购买记录数据
  for i in range(1, len(train_date_list)):
    pre_month = hist_data[hist_data['fecha_dato'].isin([train_date_list[i]])].loc[:,['ncodpers'] + target_raw_cols] # 上个月产品购买记录
    pre_month = pre_month.add_prefix(str(i) + "_") # 对上个月产品购买记录加前缀
    pre_month.rename(columns={str(i) + '_ncodpers': 'ncodpers'}, inplace=True) # 去掉'ncodpers'刚加的前缀，以便之后合并数据
    this_month = pd.merge(this_month, pre_month, on=['ncodpers'], how='left') #当月数据+上个月产品购买记录
  this_month.fillna(0, inplace=True)

  X_TRAIN=this_month.drop(target_raw_cols+["fecha_dato"], axis=1)
  y_train_sets=this_month[target_raw_cols] # 24个目标列，分别用XGBoost预测购买概率

  # 处理测试数据
  this_month_t = test_data[test_data['fecha_dato'].isin([test_date_list[0]])] #选取2016-06月数据
  # this_month_t = test_data
  for i in range(1, len(test_date_list)):
    pre_month_t = hist_data[hist_data['fecha_dato'].isin([test_date_list[i]])].loc[:,['ncodpers'] + target_raw_cols] # 上个月产品购买记录
    pre_month_t = pre_month_t.add_prefix(str(i) + "_") # 对上个月产品购买记录加前缀
    pre_month_t.rename(columns={str(i) + '_ncodpers': 'ncodpers'}, inplace=True) # 去掉'ncodpers'刚加的前缀，以便之后合并数据
    this_month_t = pd.merge(this_month_t, pre_month_t, on=['ncodpers'], how='left') #当月数据+上个月产品购买记录
  this_month_t.fillna(0, inplace=True)

  X_test=this_month_t.drop(["fecha_dato"], axis=1)

  # 清理内存
  del matrix,train_data,test_data,this_month,hist_data,pre_month,this_month_t,pre_month_t
  gc.collect()

  return X_TRAIN,y_train_sets,X_test

X_TRAIN,y_train_sets,X_test = process_data(matrix)

# 保存特征文件，调试不同算法模型
X_TRAIN.to_csv('./X_TRAIN.csv', index=False)
y_train_sets.to_csv('./y_train_sets.csv', index=False)
X_test.to_csv('./X_test.csv', index=False)

print("ok")

ok


## 使用XGBoost模型

In [13]:
target_raw_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',  'ind_cder_fin_ult1',
                   'ind_cno_fin_ult1',  'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
                   'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
                   'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',  'ind_plan_fin_ult1',
                   'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
                   'ind_viv_fin_ult1',  'ind_nomina_ult1',   'ind_nom_pens_ult1', 'ind_recibo_ult1']
# 读取保存的特征文件
X_TRAIN=pd.read_csv('./X_TRAIN.csv')
y_train_sets=pd.read_csv('./y_train_sets.csv')
X_test=pd.read_csv('./X_test.csv')

# 分别用XGBoost预测24个产品的购买概率
def runXGB(X_train, X_valid, y_train, y_valid,X_test,seed_val=42):
  model = xgb.XGBClassifier(
      max_depth=8,
      n_estimators=1000,
      min_child_weight=300, 
      colsample_bytree=0.8, 
      subsample=0.8, 
      eta=0.35, #学习率调小，训练误差减小，泛化误差增大   
      seed=seed_val #每次产生的随机数一样；这里设置随机数种子是因为训练过程中采用了随机采样、随机属性选择
      )
  model.fit(
      X_train, y_train,
      eval_metric='logloss', eval_set=[(X_train, y_train), (X_valid, y_valid)],
      verbose=True,
      #早停法，如果eval_metric在10epoch没有进步就stop
      early_stopping_rounds=10
      )
  prob = pd.Series(model.predict_proba(X_test)[:,1]) #购买该产品的概率预测
  return prob

result=X_test[["ncodpers"]]
for y in tqdm(target_raw_cols): # 进度条
  # 将训练集进行切分，20%用于验证
  print(y)
  X_train, X_valid, y_train, y_valid = train_test_split(X_TRAIN, y_train_sets[y], test_size=0.2)
  result[y]=runXGB(X_train, X_valid, y_train, y_valid, X_test)

  0%|          | 0/24 [00:00<?, ?it/s]

ind_ahor_fin_ult1
[0]	validation_0-logloss:0.598188	validation_1-logloss:0.598189
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.520422	validation_1-logloss:0.520423
[2]	validation_0-logloss:0.455571	validation_1-logloss:0.455573
[3]	validation_0-logloss:0.400756	validation_1-logloss:0.400759
[4]	validation_0-logloss:0.353934	validation_1-logloss:0.353937
[5]	validation_0-logloss:0.313599	validation_1-logloss:0.313603
[6]	validation_0-logloss:0.278618	validation_1-logloss:0.278622
[7]	validation_0-logloss:0.248109	validation_1-logloss:0.248114
[8]	validation_0-logloss:0.221376	validation_1-logloss:0.221382
[9]	validation_0-logloss:0.197856	validation_1-logloss:0.197862
[10]	validation_0-logloss:0.177096	validation_1-logloss:0.177103
[11]	validation_0-logloss:0.158719	validation_1-logloss:0.158726
[12]	validation_0-logloss:0.14241	validation_1-

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  4%|▍         | 1/24 [01:50<42:20, 110.45s/it]

ind_aval_fin_ult1
[0]	validation_0-logloss:0.598151	validation_1-logloss:0.598155
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.520352	validation_1-logloss:0.520359
[2]	validation_0-logloss:0.455476	validation_1-logloss:0.455486
[3]	validation_0-logloss:0.400638	validation_1-logloss:0.400651
[4]	validation_0-logloss:0.353793	validation_1-logloss:0.353808
[5]	validation_0-logloss:0.31344	validation_1-logloss:0.313458
[6]	validation_0-logloss:0.278441	validation_1-logloss:0.278462
[7]	validation_0-logloss:0.247915	validation_1-logloss:0.247938
[8]	validation_0-logloss:0.221165	validation_1-logloss:0.22119
[9]	validation_0-logloss:0.197632	validation_1-logloss:0.197659
[10]	validation_0-logloss:0.176857	validation_1-logloss:0.176887
[11]	validation_0-logloss:0.158465	validation_1-logloss:0.158496
[12]	validation_0-logloss:0.142141	validation_1-l

  8%|▊         | 2/24 [03:30<39:22, 107.38s/it]

ind_cco_fin_ult1
[0]	validation_0-logloss:0.604221	validation_1-logloss:0.604186
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.531364	validation_1-logloss:0.53132
[2]	validation_0-logloss:0.470484	validation_1-logloss:0.470429
[3]	validation_0-logloss:0.419246	validation_1-logloss:0.419195
[4]	validation_0-logloss:0.376088	validation_1-logloss:0.376048
[5]	validation_0-logloss:0.338449	validation_1-logloss:0.338407
[6]	validation_0-logloss:0.305929	validation_1-logloss:0.305883
[7]	validation_0-logloss:0.277441	validation_1-logloss:0.277382
[8]	validation_0-logloss:0.252633	validation_1-logloss:0.252585
[9]	validation_0-logloss:0.230971	validation_1-logloss:0.230924
[10]	validation_0-logloss:0.21193	validation_1-logloss:0.211877
[11]	validation_0-logloss:0.195085	validation_1-logloss:0.19503
[12]	validation_0-logloss:0.180219	validation_1-log

 12%|█▎        | 3/24 [35:45<3:49:30, 655.73s/it]

ind_cder_fin_ult1
[0]	validation_0-logloss:0.598333	validation_1-logloss:0.598329
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.520682	validation_1-logloss:0.520674
[2]	validation_0-logloss:0.455936	validation_1-logloss:0.455924
[3]	validation_0-logloss:0.401211	validation_1-logloss:0.401196
[4]	validation_0-logloss:0.354466	validation_1-logloss:0.354447
[5]	validation_0-logloss:0.314205	validation_1-logloss:0.314183
[6]	validation_0-logloss:0.27929	validation_1-logloss:0.279266
[7]	validation_0-logloss:0.248842	validation_1-logloss:0.248814
[8]	validation_0-logloss:0.222169	validation_1-logloss:0.222139
[9]	validation_0-logloss:0.198707	validation_1-logloss:0.198673
[10]	validation_0-logloss:0.178003	validation_1-logloss:0.177967
[11]	validation_0-logloss:0.159679	validation_1-logloss:0.159639
[12]	validation_0-logloss:0.143419	validation_1-

 17%|█▋        | 4/24 [38:14<2:47:51, 503.59s/it]

ind_cno_fin_ult1
[0]	validation_0-logloss:0.601354	validation_1-logloss:0.601404
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.526187	validation_1-logloss:0.526267
[2]	validation_0-logloss:0.463374	validation_1-logloss:0.463477
[3]	validation_0-logloss:0.410318	validation_1-logloss:0.410456
[4]	validation_0-logloss:0.365134	validation_1-logloss:0.365295
[5]	validation_0-logloss:0.326307	validation_1-logloss:0.3265
[6]	validation_0-logloss:0.292509	validation_1-logloss:0.292719
[7]	validation_0-logloss:0.263053	validation_1-logloss:0.263289
[8]	validation_0-logloss:0.2373	validation_1-logloss:0.237553
[9]	validation_0-logloss:0.214713	validation_1-logloss:0.214991
[10]	validation_0-logloss:0.194764	validation_1-logloss:0.195075
[11]	validation_0-logloss:0.177239	validation_1-logloss:0.177593
[12]	validation_0-logloss:0.161615	validation_1-logl

 21%|██        | 5/24 [1:00:47<4:00:12, 758.56s/it]

ind_ctju_fin_ult1
[0]	validation_0-logloss:0.598223	validation_1-logloss:0.598219
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.520482	validation_1-logloss:0.520474
[2]	validation_0-logloss:0.45564	validation_1-logloss:0.455629
[3]	validation_0-logloss:0.400833	validation_1-logloss:0.400819
[4]	validation_0-logloss:0.354015	validation_1-logloss:0.353999
[5]	validation_0-logloss:0.313684	validation_1-logloss:0.313666
[6]	validation_0-logloss:0.278704	validation_1-logloss:0.278685
[7]	validation_0-logloss:0.248195	validation_1-logloss:0.248174
[8]	validation_0-logloss:0.221461	validation_1-logloss:0.221439
[9]	validation_0-logloss:0.197943	validation_1-logloss:0.197921
[10]	validation_0-logloss:0.177175	validation_1-logloss:0.177152
[11]	validation_0-logloss:0.158794	validation_1-logloss:0.15877
[12]	validation_0-logloss:0.142482	validation_1-l

 25%|██▌       | 6/24 [1:03:11<2:52:10, 573.94s/it]

ind_ctma_fin_ult1
[0]	validation_0-logloss:0.598589	validation_1-logloss:0.598618
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.521149	validation_1-logloss:0.521204
[2]	validation_0-logloss:0.456577	validation_1-logloss:0.456655
[3]	validation_0-logloss:0.402004	validation_1-logloss:0.402102
[4]	validation_0-logloss:0.355401	validation_1-logloss:0.355518
[5]	validation_0-logloss:0.315394	validation_1-logloss:0.315525
[6]	validation_0-logloss:0.280776	validation_1-logloss:0.280931
[7]	validation_0-logloss:0.250371	validation_1-logloss:0.250541
[8]	validation_0-logloss:0.223742	validation_1-logloss:0.223926
[9]	validation_0-logloss:0.20042	validation_1-logloss:0.200616
[10]	validation_0-logloss:0.179731	validation_1-logloss:0.17994
[11]	validation_0-logloss:0.161424	validation_1-logloss:0.161645
[12]	validation_0-logloss:0.14528	validation_1-lo

 29%|██▉       | 7/24 [1:05:50<2:07:23, 449.62s/it]

ind_ctop_fin_ult1
[0]	validation_0-logloss:0.598583	validation_1-logloss:0.598596
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.521132	validation_1-logloss:0.521157
[2]	validation_0-logloss:0.456554	validation_1-logloss:0.45659
[3]	validation_0-logloss:0.401989	validation_1-logloss:0.402034
[4]	validation_0-logloss:0.355381	validation_1-logloss:0.355436
[5]	validation_0-logloss:0.3154	validation_1-logloss:0.31545
[6]	validation_0-logloss:0.280566	validation_1-logloss:0.280626
[7]	validation_0-logloss:0.250195	validation_1-logloss:0.250264
[8]	validation_0-logloss:0.223595	validation_1-logloss:0.223673
[9]	validation_0-logloss:0.200207	validation_1-logloss:0.200293
[10]	validation_0-logloss:0.179577	validation_1-logloss:0.179672
[11]	validation_0-logloss:0.161324	validation_1-logloss:0.161426
[12]	validation_0-logloss:0.14513	validation_1-logl

 33%|███▎      | 8/24 [1:08:26<1:36:24, 361.52s/it]

ind_ctpp_fin_ult1
[0]	validation_0-logloss:0.598407	validation_1-logloss:0.598402
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.52083	validation_1-logloss:0.52082
[2]	validation_0-logloss:0.45614	validation_1-logloss:0.456125
[3]	validation_0-logloss:0.40146	validation_1-logloss:0.40144
[4]	validation_0-logloss:0.354813	validation_1-logloss:0.354775
[5]	validation_0-logloss:0.314575	validation_1-logloss:0.314533
[6]	validation_0-logloss:0.27969	validation_1-logloss:0.279644
[7]	validation_0-logloss:0.249271	validation_1-logloss:0.249221
[8]	validation_0-logloss:0.222622	validation_1-logloss:0.222569
[9]	validation_0-logloss:0.199184	validation_1-logloss:0.199128
[10]	validation_0-logloss:0.178498	validation_1-logloss:0.178438
[11]	validation_0-logloss:0.16019	validation_1-logloss:0.160117
[12]	validation_0-logloss:0.14395	validation_1-logloss

 38%|███▊      | 9/24 [1:10:38<1:13:08, 292.55s/it]

ind_deco_fin_ult1
[0]	validation_0-logloss:0.598731	validation_1-logloss:0.598703
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.521604	validation_1-logloss:0.521571
[2]	validation_0-logloss:0.457293	validation_1-logloss:0.457255
[3]	validation_0-logloss:0.402939	validation_1-logloss:0.402897
[4]	validation_0-logloss:0.356519	validation_1-logloss:0.356473
[5]	validation_0-logloss:0.316339	validation_1-logloss:0.31627
[6]	validation_0-logloss:0.281499	validation_1-logloss:0.281407
[7]	validation_0-logloss:0.25112	validation_1-logloss:0.251008
[8]	validation_0-logloss:0.224507	validation_1-logloss:0.224378
[9]	validation_0-logloss:0.201256	validation_1-logloss:0.201135
[10]	validation_0-logloss:0.18058	validation_1-logloss:0.180441
[11]	validation_0-logloss:0.162398	validation_1-logloss:0.162254
[12]	validation_0-logloss:0.146147	validation_1-lo

 42%|████▏     | 10/24 [1:17:04<1:14:49, 320.68s/it]

ind_deme_fin_ult1
[0]	validation_0-logloss:0.598994	validation_1-logloss:0.599024
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.521886	validation_1-logloss:0.521944
[2]	validation_0-logloss:0.457602	validation_1-logloss:0.457687
[3]	validation_0-logloss:0.403286	validation_1-logloss:0.403394
[4]	validation_0-logloss:0.356909	validation_1-logloss:0.357039
[5]	validation_0-logloss:0.316975	validation_1-logloss:0.317126
[6]	validation_0-logloss:0.282365	validation_1-logloss:0.282536
[7]	validation_0-logloss:0.2522	validation_1-logloss:0.252391
[8]	validation_0-logloss:0.225799	validation_1-logloss:0.226006
[9]	validation_0-logloss:0.202587	validation_1-logloss:0.202811
[10]	validation_0-logloss:0.182125	validation_1-logloss:0.182366
[11]	validation_0-logloss:0.164028	validation_1-logloss:0.164285
[12]	validation_0-logloss:0.147982	validation_1-l

 46%|████▌     | 11/24 [1:21:18<1:05:08, 300.64s/it]

ind_dela_fin_ult1
[0]	validation_0-logloss:0.599425	validation_1-logloss:0.599445
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.523386	validation_1-logloss:0.523462
[2]	validation_0-logloss:0.45928	validation_1-logloss:0.459373
[3]	validation_0-logloss:0.405746	validation_1-logloss:0.405885
[4]	validation_0-logloss:0.360073	validation_1-logloss:0.360254
[5]	validation_0-logloss:0.320121	validation_1-logloss:0.320315
[6]	validation_0-logloss:0.285513	validation_1-logloss:0.285719
[7]	validation_0-logloss:0.255379	validation_1-logloss:0.255597
[8]	validation_0-logloss:0.228997	validation_1-logloss:0.229227
[9]	validation_0-logloss:0.205821	validation_1-logloss:0.206062
[10]	validation_0-logloss:0.185744	validation_1-logloss:0.186017
[11]	validation_0-logloss:0.167655	validation_1-logloss:0.167938
[12]	validation_0-logloss:0.151631	validation_1-

 50%|█████     | 12/24 [1:32:07<1:21:01, 405.16s/it]

ind_ecue_fin_ult1
[0]	validation_0-logloss:0.599173	validation_1-logloss:0.599224
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.522211	validation_1-logloss:0.522308
[2]	validation_0-logloss:0.45806	validation_1-logloss:0.458201
[3]	validation_0-logloss:0.403853	validation_1-logloss:0.404039
[4]	validation_0-logloss:0.357789	validation_1-logloss:0.357987
[5]	validation_0-logloss:0.317892	validation_1-logloss:0.318131
[6]	validation_0-logloss:0.28333	validation_1-logloss:0.283611
[7]	validation_0-logloss:0.253332	validation_1-logloss:0.253624
[8]	validation_0-logloss:0.227153	validation_1-logloss:0.227469
[9]	validation_0-logloss:0.203917	validation_1-logloss:0.204271
[10]	validation_0-logloss:0.183433	validation_1-logloss:0.183827
[11]	validation_0-logloss:0.165312	validation_1-logloss:0.16574
[12]	validation_0-logloss:0.149264	validation_1-lo

 54%|█████▍    | 13/24 [1:39:47<1:17:18, 421.64s/it]

ind_fond_fin_ult1
[0]	validation_0-logloss:0.598452	validation_1-logloss:0.598441
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.521176	validation_1-logloss:0.521156
[2]	validation_0-logloss:0.456467	validation_1-logloss:0.456438
[3]	validation_0-logloss:0.401783	validation_1-logloss:0.401744
[4]	validation_0-logloss:0.355276	validation_1-logloss:0.355229
[5]	validation_0-logloss:0.315026	validation_1-logloss:0.314972
[6]	validation_0-logloss:0.280125	validation_1-logloss:0.280062
[7]	validation_0-logloss:0.249698	validation_1-logloss:0.249627
[8]	validation_0-logloss:0.22304	validation_1-logloss:0.222961
[9]	validation_0-logloss:0.199727	validation_1-logloss:0.199641
[10]	validation_0-logloss:0.179023	validation_1-logloss:0.17893
[11]	validation_0-logloss:0.160702	validation_1-logloss:0.160602
[12]	validation_0-logloss:0.144452	validation_1-l

 58%|█████▊    | 14/24 [1:42:38<57:43, 346.34s/it]  

ind_hip_fin_ult1
[0]	validation_0-logloss:0.598274	validation_1-logloss:0.598287
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.520475	validation_1-logloss:0.520483
[2]	validation_0-logloss:0.455603	validation_1-logloss:0.455605
[3]	validation_0-logloss:0.400768	validation_1-logloss:0.400764
[4]	validation_0-logloss:0.353929	validation_1-logloss:0.35392
[5]	validation_0-logloss:0.313635	validation_1-logloss:0.313632
[6]	validation_0-logloss:0.278633	validation_1-logloss:0.278625
[7]	validation_0-logloss:0.248106	validation_1-logloss:0.248092
[8]	validation_0-logloss:0.221355	validation_1-logloss:0.221338
[9]	validation_0-logloss:0.197821	validation_1-logloss:0.197798
[10]	validation_0-logloss:0.177077	validation_1-logloss:0.177057
[11]	validation_0-logloss:0.158715	validation_1-logloss:0.158698
[12]	validation_0-logloss:0.142388	validation_1-l

 62%|██████▎   | 15/24 [1:44:56<42:36, 284.07s/it]

ind_plan_fin_ult1
[0]	validation_0-logloss:0.598162	validation_1-logloss:0.598172
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.520373	validation_1-logloss:0.52039
[2]	validation_0-logloss:0.455505	validation_1-logloss:0.45553
[3]	validation_0-logloss:0.400673	validation_1-logloss:0.400705
[4]	validation_0-logloss:0.353834	validation_1-logloss:0.353873
[5]	validation_0-logloss:0.313485	validation_1-logloss:0.313531
[6]	validation_0-logloss:0.278491	validation_1-logloss:0.278543
[7]	validation_0-logloss:0.247979	validation_1-logloss:0.248035
[8]	validation_0-logloss:0.221233	validation_1-logloss:0.221296
[9]	validation_0-logloss:0.197703	validation_1-logloss:0.197771
[10]	validation_0-logloss:0.176939	validation_1-logloss:0.177011
[11]	validation_0-logloss:0.158549	validation_1-logloss:0.158627
[12]	validation_0-logloss:0.142228	validation_1-l

 67%|██████▋   | 16/24 [1:47:33<32:45, 245.71s/it]

ind_pres_fin_ult1
[0]	validation_0-logloss:0.598177	validation_1-logloss:0.598181
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.520398	validation_1-logloss:0.520407
[2]	validation_0-logloss:0.455539	validation_1-logloss:0.455552
[3]	validation_0-logloss:0.400716	validation_1-logloss:0.400732
[4]	validation_0-logloss:0.354434	validation_1-logloss:0.354454
[5]	validation_0-logloss:0.314015	validation_1-logloss:0.314037
[6]	validation_0-logloss:0.279464	validation_1-logloss:0.279488
[7]	validation_0-logloss:0.248826	validation_1-logloss:0.248852
[8]	validation_0-logloss:0.222457	validation_1-logloss:0.222489
[9]	validation_0-logloss:0.198784	validation_1-logloss:0.198817
[10]	validation_0-logloss:0.178331	validation_1-logloss:0.178367
[11]	validation_0-logloss:0.159783	validation_1-logloss:0.159821
[12]	validation_0-logloss:0.143726	validation_1

 71%|███████   | 17/24 [1:51:04<27:26, 235.24s/it]

ind_reca_fin_ult1
[0]	validation_0-logloss:0.599839	validation_1-logloss:0.599838
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.523436	validation_1-logloss:0.523434
[2]	validation_0-logloss:0.459755	validation_1-logloss:0.459753
[3]	validation_0-logloss:0.405962	validation_1-logloss:0.40596
[4]	validation_0-logloss:0.360052	validation_1-logloss:0.36005
[5]	validation_0-logloss:0.320546	validation_1-logloss:0.320545
[6]	validation_0-logloss:0.286316	validation_1-logloss:0.286318
[7]	validation_0-logloss:0.256506	validation_1-logloss:0.25651
[8]	validation_0-logloss:0.230421	validation_1-logloss:0.230424
[9]	validation_0-logloss:0.207566	validation_1-logloss:0.207578
[10]	validation_0-logloss:0.187407	validation_1-logloss:0.187428
[11]	validation_0-logloss:0.169564	validation_1-logloss:0.169586
[12]	validation_0-logloss:0.153747	validation_1-lo

 75%|███████▌  | 18/24 [2:01:32<35:18, 353.14s/it]

ind_tjcr_fin_ult1
[0]	validation_0-logloss:0.602333	validation_1-logloss:0.602307
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.527939	validation_1-logloss:0.527893
[2]	validation_0-logloss:0.466556	validation_1-logloss:0.466433
[3]	validation_0-logloss:0.414065	validation_1-logloss:0.413947
[4]	validation_0-logloss:0.369291	validation_1-logloss:0.369169
[5]	validation_0-logloss:0.330783	validation_1-logloss:0.330659
[6]	validation_0-logloss:0.29741	validation_1-logloss:0.297289
[7]	validation_0-logloss:0.268382	validation_1-logloss:0.268252
[8]	validation_0-logloss:0.242998	validation_1-logloss:0.242859
[9]	validation_0-logloss:0.220705	validation_1-logloss:0.220568
[10]	validation_0-logloss:0.201106	validation_1-logloss:0.200962
[11]	validation_0-logloss:0.183772	validation_1-logloss:0.183627
[12]	validation_0-logloss:0.168611	validation_1-

 79%|███████▉  | 19/24 [2:09:05<31:56, 383.22s/it]

ind_valo_fin_ult1
[0]	validation_0-logloss:0.598487	validation_1-logloss:0.598507
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.521077	validation_1-logloss:0.521097
[2]	validation_0-logloss:0.456532	validation_1-logloss:0.456552
[3]	validation_0-logloss:0.401859	validation_1-logloss:0.401898
[4]	validation_0-logloss:0.355161	validation_1-logloss:0.355218
[5]	validation_0-logloss:0.315037	validation_1-logloss:0.315096
[6]	validation_0-logloss:0.280154	validation_1-logloss:0.28023
[7]	validation_0-logloss:0.249825	validation_1-logloss:0.249904
[8]	validation_0-logloss:0.223167	validation_1-logloss:0.223262
[9]	validation_0-logloss:0.19973	validation_1-logloss:0.199841
[10]	validation_0-logloss:0.179049	validation_1-logloss:0.179174
[11]	validation_0-logloss:0.160745	validation_1-logloss:0.160885
[12]	validation_0-logloss:0.144509	validation_1-l

 83%|████████▎ | 20/24 [2:11:32<20:49, 312.34s/it]

ind_viv_fin_ult1
[0]	validation_0-logloss:0.598154	validation_1-logloss:0.598153
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.520358	validation_1-logloss:0.520356
[2]	validation_0-logloss:0.455483	validation_1-logloss:0.455481
[3]	validation_0-logloss:0.400646	validation_1-logloss:0.400643
[4]	validation_0-logloss:0.353803	validation_1-logloss:0.353799
[5]	validation_0-logloss:0.313452	validation_1-logloss:0.313447
[6]	validation_0-logloss:0.278454	validation_1-logloss:0.278449
[7]	validation_0-logloss:0.247929	validation_1-logloss:0.247924
[8]	validation_0-logloss:0.22118	validation_1-logloss:0.221174
[9]	validation_0-logloss:0.197647	validation_1-logloss:0.197641
[10]	validation_0-logloss:0.176873	validation_1-logloss:0.176866
[11]	validation_0-logloss:0.158843	validation_1-logloss:0.158835
[12]	validation_0-logloss:0.142479	validation_1-l

 88%|████████▊ | 21/24 [2:14:04<13:12, 264.22s/it]

ind_nomina_ult1
[0]	validation_0-logloss:0.60162	validation_1-logloss:0.601663
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.52668	validation_1-logloss:0.526764
[2]	validation_0-logloss:0.464381	validation_1-logloss:0.46451
[3]	validation_0-logloss:0.411852	validation_1-logloss:0.411992
[4]	validation_0-logloss:0.366783	validation_1-logloss:0.366964
[5]	validation_0-logloss:0.328311	validation_1-logloss:0.32854
[6]	validation_0-logloss:0.29476	validation_1-logloss:0.295007
[7]	validation_0-logloss:0.265491	validation_1-logloss:0.265766
[8]	validation_0-logloss:0.239898	validation_1-logloss:0.240205
[9]	validation_0-logloss:0.217509	validation_1-logloss:0.217851
[10]	validation_0-logloss:0.197814	validation_1-logloss:0.198169
[11]	validation_0-logloss:0.180429	validation_1-logloss:0.180796
[12]	validation_0-logloss:0.165018	validation_1-loglos

 92%|█████████▏| 22/24 [2:24:48<12:36, 378.09s/it]

ind_nom_pens_ult1
[0]	validation_0-logloss:0.601636	validation_1-logloss:0.601691
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.526726	validation_1-logloss:0.526825
[2]	validation_0-logloss:0.464726	validation_1-logloss:0.46485
[3]	validation_0-logloss:0.411991	validation_1-logloss:0.412149
[4]	validation_0-logloss:0.36696	validation_1-logloss:0.367146
[5]	validation_0-logloss:0.328521	validation_1-logloss:0.328728
[6]	validation_0-logloss:0.294968	validation_1-logloss:0.295201
[7]	validation_0-logloss:0.265741	validation_1-logloss:0.265997
[8]	validation_0-logloss:0.2402	validation_1-logloss:0.240475
[9]	validation_0-logloss:0.217783	validation_1-logloss:0.218074
[10]	validation_0-logloss:0.198071	validation_1-logloss:0.198378
[11]	validation_0-logloss:0.180686	validation_1-logloss:0.181008
[12]	validation_0-logloss:0.1653	validation_1-loglo

 96%|█████████▌| 23/24 [2:39:39<08:52, 532.14s/it]

ind_recibo_ult1
[0]	validation_0-logloss:0.604993	validation_1-logloss:0.60497
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.533617	validation_1-logloss:0.533532
[2]	validation_0-logloss:0.47335	validation_1-logloss:0.47326
[3]	validation_0-logloss:0.422493	validation_1-logloss:0.422395
[4]	validation_0-logloss:0.379158	validation_1-logloss:0.379048
[5]	validation_0-logloss:0.341949	validation_1-logloss:0.341846
[6]	validation_0-logloss:0.309957	validation_1-logloss:0.309871
[7]	validation_0-logloss:0.28196	validation_1-logloss:0.281882
[8]	validation_0-logloss:0.257527	validation_1-logloss:0.257454
[9]	validation_0-logloss:0.236125	validation_1-logloss:0.236055
[10]	validation_0-logloss:0.217258	validation_1-logloss:0.217182
[11]	validation_0-logloss:0.201099	validation_1-logloss:0.201014
[12]	validation_0-logloss:0.186587	validation_1-loglo

100%|██████████| 24/24 [2:55:41<00:00, 439.25s/it]


## 提交结果

In [14]:
# 提交结果：选取（加权？）概率最大(top7?)的新产品
pre_month_col=['1_'+x for x in target_raw_cols] # 上个月的产品购买记录列
pre_month_product=X_TRAIN[["ncodpers"]+pre_month_col]
result=pd.merge(result, pre_month_product, on=['ncodpers'], how='left') #当月产品购买概率预测+上个月产品购买记录

def func(x): #0->1;1->0
  if int(x)==0:
    return 1
  else:
    return 0

def drop_old_product(result,target_raw_cols): # 将上个月购买过的产品购买概率记为0
  for col in target_raw_cols:
    result['1_'+col]=result[col].apply(func) 
    result[col]=result[col]*result['1_'+col]
  return result

result=drop_old_product(result,target_raw_cols)

added_products_array=np.array(result[target_raw_cols])

target_raw_array = np.array(target_raw_cols)    # list->array
added_products_array = np.argsort(added_products_array, axis = 1) # 每一行排序（从小到大）
added_products_array = np.fliplr(added_products_array)[:, :7] # 反转排序结果，选取top7
added_products = [" ".join(list(target_raw_array[prod])) for prod in added_products_array] # 以空格连接这7项产品

result["added_products"]=added_products
result=result.drop(target_raw_cols+pre_month_col, axis=1) # 丢弃上个月的购买历史记录及当月购买产品概率预测

print(result.head())
print(result.tail())
# 保存提交结果
result.to_csv('./result.csv', index=False)

   ncodpers                                     added_products
0     15889  ind_cco_fin_ult1 ind_ctpp_fin_ult1 ind_valo_fi...
1   1170544  ind_cco_fin_ult1 ind_recibo_ult1 ind_reca_fin_...
2   1170545  ind_cco_fin_ult1 ind_recibo_ult1 ind_nomina_ul...
3   1170547  ind_cco_fin_ult1 ind_nom_pens_ult1 ind_nomina_...
4   1170548  ind_cco_fin_ult1 ind_nom_pens_ult1 ind_nomina_...
        ncodpers                                     added_products
929610    660237  ind_ecue_fin_ult1 ind_cno_fin_ult1 ind_recibo_...
929611    660238  ind_cco_fin_ult1 ind_pres_fin_ult1 ind_reca_fi...
929612    660240  ind_cco_fin_ult1 ind_ctop_fin_ult1 ind_deme_fi...
929613    660243  ind_cco_fin_ult1 ind_reca_fin_ult1 ind_viv_fin...
929614    660248  ind_ctop_fin_ult1 ind_cco_fin_ult1 ind_pres_fi...
