In [1]:
# 模型训练数据
# 1. 用户侧：用户画像属性（用户性别、年龄、所在省市、使用设备及系统）
# 2. 新闻侧：新闻的创建时间、题目、所属一级、二级类别，图片个数以及关键词
def proccess(file):
    """Load and clean one of the two raw feature tables.

    The file is read from ``file_path + file`` (``file_path`` is a module-level
    prefix) as a tab-separated table. Two file names are recognised:

    * ``"user_info_data_5w.csv"`` -- user table. ``age`` and ``gender`` are
      passed through ``get_pro_age``; missing ``province``, ``city``,
      ``device`` and ``os`` values are forward-filled from the previous row.
    * ``"doc_info.txt"`` -- news table. Rows without a usable ``ctime`` are
      dropped, ``ctime`` is converted to a timestamp, ``sub_cate`` and
      ``img_num`` are normalised by ``pro_sub_cate`` / ``photoNums``, a
      ``title_len`` column is derived and missing ``cate`` values become
      ``'其他'``.

    Args:
        file: Name of the file to load.

    Returns:
        The cleaned ``DataFrame``; ``None`` if ``file`` is neither of the two
        recognised names.
    """
    if file == "user_info_data_5w.csv":
        data = pd.read_csv(file_path + file, sep="\t", index_col=0)

        # NOTE(review): gender is parsed with get_pro_age as well -- presumably
        # both columns share the same raw encoding; confirm against the helper.
        data["age"] = data["age"].map(get_pro_age)
        data["gender"] = data["gender"].map(get_pro_age)

        # Forward-fill gaps. Series.ffill() replaces the deprecated
        # fillna(method='ffill') spelling and behaves the same.
        for column in ("province", "city", "device", "os"):
            data[column] = data[column].ffill()
        return data

    elif file == "doc_info.txt":
        # NOTE(review): read_csv treats the first line as a header before the
        # columns are renamed below -- if the file has no header row, that
        # first record is lost. Confirm against the raw file.
        data = pd.read_csv(file_path + file, sep="\t")
        data.columns = ["article_id", "title", "ctime", "img_num", "cate", "sub_cate", "key_words"]
        select_column = ["article_id", "title_len", "ctime", "img_num", "cate", "sub_cate", "key_words"]

        # Drop news without a creation time and the known dirty value 'Android'.
        # .copy() makes the filtered frame independent so the column
        # assignments below do not write into a slice of the original.
        has_valid_ctime = data["ctime"].notna() & (data["ctime"] != 'Android')
        data = data[has_valid_ctime].copy()

        # Keep only the first 10 characters of the raw value and interpret
        # them as an epoch timestamp in seconds.
        data["ctime"] = data["ctime"].astype('str').apply(lambda x: int(x[:10]))
        data["ctime"] = pd.to_datetime(data["ctime"], unit='s', errors='coerce')

        # These columns contain literal "nan" strings and other abnormal
        # values; cast to str first so the helpers always receive text.
        data["sub_cate"] = data["sub_cate"].astype(str).apply(pro_sub_cate)
        data["img_num"] = data["img_num"].astype(str).apply(photoNums)

        # Non-string titles (e.g. NaN) count as length 0.
        data["title_len"] = data["title"].apply(lambda x: len(x) if isinstance(x, str) else 0)
        data["cate"] = data["cate"].fillna('其他')

        return data[select_column]

    # Unrecognised file name: nothing is loaded.
    return None

In [2]:
# 构造训练样本
# 根据用户的交互日志中前六天的数据作为训练集，第七天的数据作为测试集，来构造模型的训练测试样本
def dealsample(file, doc_data, user_data, s_data_str="2021-06-24 00:00:00", e_data_str="2021-06-30 23:59:59", neg_num=5):
    """Build the train/test samples from the user interaction log.

    The log is read from ``file_path + file`` (tab-separated). Interactions
    exposed between ``s_data_str`` and ``e_data_str`` are kept. The final
    24 hours of that window form the test set, everything before it the
    training set (with the defaults: six training days, one test day).

    Training negatives are the union of (a) a random subset of exposed but
    unclicked interactions, at most as many as there are positives, and
    (b) generated negatives from ``negSample_like_word2vec``, cached in
    ``file_path + "neg_sample.pkl"``.

    Args:
        file: Name of the interaction log file.
        doc_data: News table with at least ``article_id`` and ``ctime``.
        user_data: User table with at least ``user_id``.
        s_data_str: Start of the sample window, ``"%Y-%m-%d %H:%M:%S"``.
        e_data_str: End of the sample window, same format.
        neg_num: Passed to ``negSample_like_word2vec`` when no cached
            negatives exist.

    Returns:
        A ``DataFrame`` of the shuffled training samples followed by the test
        samples, with columns ``user_id``, ``article_id``, ``expo_time`` and
        ``click``. Generated negatives carry no ``expo_time``.
    """
    time_format = "%Y-%m-%d %H:%M:%S"

    # Keep the first 10 characters of the raw exposure time and interpret
    # them as an epoch timestamp in seconds.
    data = pd.read_csv(file_path + file, sep="\t", index_col=0)
    data["expo_time"] = data["expo_time"].astype('str').apply(lambda x: int(x[:10]))
    data["expo_time"] = pd.to_datetime(data["expo_time"], unit='s', errors='coerce')

    s_date = datetime.datetime.strptime(s_data_str, time_format)
    t_date = datetime.datetime.strptime(e_data_str, time_format)  # end of test window
    e_date = t_date + datetime.timedelta(days=-1)                 # end of train window

    # Interactions needed for either training or testing.
    all_data_tmp = data[(data["expo_time"] >= s_date) & (data["expo_time"] <= t_date)]

    # Attach news info so exposure and creation time sit side by side; the
    # inner join also drops interactions whose article is not in doc_data.
    all_data_tmp = all_data_tmp.join(doc_data.set_index("article_id"), on="article_id", how='inner')

    # Some interactions have a creation time later than the exposure time;
    # these are erroneous and are removed.
    all_data_tmp = all_data_tmp[all_data_tmp["ctime"] <= all_data_tmp["expo_time"]]

    # Training rows: exposed inside the training window, and the article must
    # not have been created inside the test window (avoids leakage).
    in_train_window = (all_data_tmp["expo_time"] >= s_date) & (all_data_tmp["expo_time"] <= e_date)
    train_data = all_data_tmp[in_train_window]
    train_data = train_data[train_data["ctime"] <= e_date]

    print("有效的样本数：", train_data["expo_time"].count())

    # Generated negatives, cached on disk.
    # NOTE: the cache is reused whenever the file exists and is non-empty,
    # regardless of neg_num, the date window or the input tables -- delete the
    # file to force regeneration.
    # NOTE: read_pickle executes pickle data; only load a cache file you trust.
    neg_cache_path = file_path + "neg_sample.pkl"
    if os.path.exists(neg_cache_path) and os.path.getsize(neg_cache_path):
        neg_samples = pd.read_pickle(neg_cache_path)
    else:
        # Restrict the candidate articles to those created on or after a
        # fixed date before sampling.
        candidate_start = datetime.datetime.strptime("2021-06-01 00:00:00", time_format)
        doc_data_tmp = doc_data[doc_data["ctime"] >= candidate_start]
        neg_samples = negSample_like_word2vec(
            train_data,
            doc_data_tmp[["article_id"]].values,
            user_data[["user_id"]].values,
            neg_num=neg_num,
        )
        neg_samples = pd.DataFrame(neg_samples, columns=["user_id", "article_id", "click"])
        neg_samples.to_pickle(neg_cache_path)

    # Positive samples: clicked exposures.
    train_pos_samples = train_data[train_data["click"] == 1][["user_id", "article_id", "expo_time", "click"]]

    # Exposed-but-unclicked negatives, down-sampled to the number of
    # positives. The sample size is capped at the population size because
    # DataFrame.sample raises ValueError when asked for more rows than exist.
    neg_samples_df = train_data[train_data["click"] == 0][["user_id", "article_id", "click"]]
    n_exposed_neg = min(len(train_pos_samples), len(neg_samples_df))
    train_neg_samples = pd.concat([neg_samples_df.sample(n=n_exposed_neg), neg_samples], axis=0)

    print("训练集正样本数：", train_pos_samples["click"].count())
    print("训练集负样本数：", train_neg_samples["click"].count())

    train_data_df = pd.concat([train_neg_samples, train_pos_samples], axis=0)
    train_data_df = train_data_df.sample(frac=1)  # shuffle

    print("训练集总样本数：", train_data_df["click"].count())

    # Test rows: everything exposed after the training window.
    in_test_window = (all_data_tmp["expo_time"] > e_date) & (all_data_tmp["expo_time"] <= t_date)
    test_data_df = all_data_tmp[in_test_window][["user_id", "article_id", "expo_time", "click"]]

    print("测试集总样本数：", test_data_df["click"].count())

    all_data_df = pd.concat([train_data_df, test_data_df], axis=0)

    print("总样本数：", all_data_df["click"].count())

    return all_data_df
