In [38]:
import gc
import os
import time

import pandas as pd
from functools import reduce
import json
import logging

# Both directories live next to the current working directory's parent:
#   <parent of cwd>/data     - the CSV files read by this notebook
#   <parent of cwd>/dataset  - created up front alongside it
project_root = os.path.dirname(os.getcwd())
data_path = os.path.join(project_root, 'data')
output_path = os.path.join(project_root, 'dataset')
for directory in (data_path, output_path):
    # exist_ok=True replaces the exists()/mkdir() pair and its check-then-create race.
    os.makedirs(directory, exist_ok=True)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("interlist_final")
formatter = logging.Formatter('%(asctime)s|%(name)-12s+ %(levelname)-8s++%(message)s')

# logging.getLogger() returns the same logger object every time this cell runs,
# so unconditionally calling addHandler() would attach one more FileHandler per
# run and write every message to base-log.log several times. Reuse the handler
# that is already attached, if any.
file_handlers = [h for h in logger.handlers if isinstance(h, logging.FileHandler)]
if file_handlers:
    handler1 = file_handlers[0]
else:
    handler1 = logging.FileHandler("base-log.log")
    handler1.setLevel(logging.INFO)
    handler1.setFormatter(formatter)
    logger.addHandler(handler1)

logger.info(data_path)
logger.info(output_path)
# Load the raw user-action log. `on_bad_lines='warn'` drops malformed rows with a
# warning, which is what `error_bad_lines=False` did; that keyword was deprecated
# in pandas 1.3 and removed in pandas 2.0.
user_act = pd.read_csv(os.path.join(data_path, 'user_action.csv'), sep=',', on_bad_lines='warn', encoding='utf-8',
                       header=0)

# Columns that are not used by the rest of this notebook.
df1 = user_act.drop(columns=['id',
                             'device_id', 'os', 'os_version', 'version', 'system', 'platform', 'pg_short_url',
                             'log_time', 'cal_dt', 'duration', 'log_id'])

# Interaction label -> the event_id values that map to it.
FOLLOW_EVENT_IDS = {
    'thumb': (254, 248),
    'comment': (257, 249),
    'forward': (258, 256),
    'detail': (262, 263, 264, 310),
}
for follow_label, event_ids in FOLLOW_EVENT_IDS.items():
    df1.loc[df1['event_id'].isin(event_ids), 'follow'] = follow_label

# Rows whose event_id matched none of the groups above are still NaN here;
# they (and any literal 0) are labelled 'neg'.
df1['follow'] = df1['follow'].fillna(0).replace(0, 'neg')

# event_data is stored as JSON text; decode it so the payload can be inspected.
df1['event_data'] = df1['event_data'].apply(json.loads)

def process(event_data):
    """Extract the ``content_id`` from one decoded ``event_data`` payload.

    Args:
        event_data: The decoded JSON value for one row. Only ``dict`` payloads
            are inspected.

    Returns:
        The value stored under ``'content_id'``, or ``0`` when the payload is
        not a dict or the dict has no ``'content_id'`` key.
    """
    if not isinstance(event_data, dict):
        return 0
    # A dict without the key previously fell off the end of the function and
    # returned None, unlike the non-dict case; return the same 0 placeholder.
    return event_data.get('content_id', 0)


df1['event_data'] = df1['event_data'].apply(process)
# Missing ids are NaN after to_numeric; fill them with the integer 0 (not the
# string '0') so the column stays numeric before the int64 cast.
df1['event_data'] = pd.to_numeric(df1['event_data']).fillna(0).astype('int64')
df1['content_id'] = df1['event_data']

df1 = df1.sort_values(by=['user_id', 'created_at'], ascending=True)

gbr = df1.groupby('user_id')  # group the rows per user; the grouping key is 'user_id'

is_negative = df1['follow'] == 'neg'

logger.info('-' * 5 + 'process sample' + '-' * 5)

# Build the de-duplicated frames directly. Calling drop_duplicates(inplace=True)
# on a slice of df1 is what triggers pandas' SettingWithCopyWarning.
# Positives: keep the last row per (user, content, interaction type).
df_pos = df1[~is_negative].drop_duplicates(subset=['user_id', 'content_id', 'follow'], keep='last')
# Negatives: keep the last row per (user, content).
df_neg = df1[is_negative].drop_duplicates(subset=['user_id', 'content_id'], keep='last')
df_dic = df_neg['user_id'].value_counts().to_dict()

# Scale every user's negative count by the global positive/negative ratio.
# With no negatives at all there is nothing to sample, so avoid dividing by zero.
ratio = len(df_pos) / len(df_neg) if len(df_neg) else 0.0
# A quota can never exceed the rows the user actually has (matters when ratio > 1).
df_sample = {
    user_id: min(neg_count, round(neg_count * ratio))
    for user_id, neg_count in df_dic.items()
}
# Total number of negative rows that will be drawn. Deliberately not named
# `sum`, which would shadow the builtin for every later cell.
neg_sample_total = sum(df_sample.values())

def typicalsamling(group, typicalNDict, random_state=None):
    """Randomly sample one group, with the sample size looked up by group name.

    Written to be passed to ``DataFrameGroupBy.apply``, which sets
    ``group.name`` to the group's key before calling this function.

    Args:
        group: The rows of a single group; must expose ``.name`` and ``.sample``.
        typicalNDict: Mapping from group name to the number of rows to draw.
        random_state: Optional seed forwarded to ``group.sample``. The default
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        A sample of ``group`` containing the requested number of rows, capped
        at the number of rows the group has.

    Raises:
        KeyError: If ``group.name`` is not a key of ``typicalNDict``.
    """
    requested = typicalNDict[group.name]
    # sample() draws without replacement and raises ValueError when asked for
    # more rows than exist, so cap the request at the group size.
    n = min(requested, len(group))
    return group.sample(n=n, random_state=random_state)


# Draw each user's quota of negative rows. group_keys=False keeps the sampled
# rows on their original index; the default prepends user_id as an extra index
# level, which would then be concatenated with df_pos's single-level index.
df_neg_sample = df_neg.groupby('user_id', group_keys=False).apply(typicalsamling, df_sample)

# Positives plus the sampled negatives, ordered per user by creation time.
frames = [df_pos, df_neg_sample]
result = pd.concat(frames).sort_values(by=['user_id', 'created_at'], ascending=True)

2022-03-05 21:20:15,078 - interlist_final - INFO - /Users/starry/Documents/Code/ai/KYRS/data
2022-03-05 21:20:15,079 - interlist_final - INFO - /Users/starry/Documents/Code/ai/KYRS/dataset


  exec(code_obj, self.user_global_ns, self.user_ns)
2022-03-05 21:21:41,971 - interlist_final - INFO - -----process sample-----
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [39]:

# `on_bad_lines='warn'` drops malformed rows with a warning, which is what
# `error_bad_lines=False` did; that keyword was removed in pandas 2.0.
content = pd.read_csv(os.path.join(data_path, '帖子.csv'), sep=',', on_bad_lines='warn', encoding='utf-8', header=0)
topic = pd.read_csv(os.path.join(data_path, '话题.csv'), sep=',', on_bad_lines='warn', encoding='utf-8', header=0)
# Align the key names with the interaction frame, then keep only the columns merged below.
content = content.rename(columns={'subject_id': 'topic_id', 'id': 'content_id'})
content = content[['content_id', 'topic_id', 'desc']]
topic = topic[['topic_id', 'genre_id', 'introduction']]
add_content = pd.merge(result, content, on='content_id', how='left')
add_topic = pd.merge(add_content, topic, on='topic_id', how='left')

# One indicator column per interaction label. Reindexing to the full label list
# guarantees every column selected below exists even when a label never occurs
# in the data (a plain get_dummies would omit it and the selection would raise
# KeyError). dtype='uint8' keeps numeric 0/1 indicators on every pandas version.
FOLLOW_LABELS = ['comment', 'forward', 'thumb', 'detail', 'neg']
follow_dummies = pd.get_dummies(add_topic['follow'], dtype='uint8').reindex(columns=FOLLOW_LABELS, fill_value=0)
df_final = add_topic.join(follow_dummies)
# Release the intermediates; they are not referenced again in this cell.
del add_content, add_topic, content, topic, df_neg_sample, df_neg, df_pos, result, follow_dummies
gc.collect()
df_final = df_final[
    ['user_id', 'topic_id', 'desc', 'genre_id', 'introduction', 'comment', 'forward', 'thumb', 'detail', 'neg',
     'follow', 'created_at']]

# rep is True for non-'neg' rows that share (user_id, desc) with at least one
# other non-'neg' row. 'neg' rows are left out of the duplicate check and are
# filled with False by the reindex, so the column is boolean throughout.
positive_rows = df_final[df_final['follow'] != 'neg']
df_final['rep'] = (
    positive_rows.duplicated(subset=['user_id', 'desc'], keep=False)
    .reindex(df_final.index, fill_value=False)
)


In [40]:


def _merge_duplicate_label(flag, user_id, desc, column, current, frame=None):
    """Collapse one indicator column across rows sharing ``user_id`` and ``desc``.

    Args:
        flag: The row's ``rep`` marker. Only a boolean True triggers the merge.
        user_id: The row's user id.
        desc: The row's ``desc`` value.
        column: Name of the indicator column to collapse.
        current: The row's current value in ``column``.
        frame: DataFrame to search. Defaults to the module-level ``df_final``.

    Returns:
        ``current`` unchanged when the row is not flagged or when no row in
        ``frame`` matches; otherwise ``1`` if the matching rows' values in
        ``column`` sum to at least 1, else ``0``.
    """
    # Accept NumPy booleans as well as Python bool: `flag is True` is False for
    # numpy.bool_, which would silently skip the merge.
    if not (pd.api.types.is_bool(flag) and flag):
        return current
    if frame is None:
        frame = df_final
    matches = frame.loc[(frame['user_id'] == user_id) & (frame['desc'] == desc), column]
    if matches.empty:
        # A NaN desc never compares equal to itself, so nothing matches; keep
        # the row's own value instead of overwriting it with the empty sum (0).
        return current
    return int(min(matches.sum(), 1))


def de_duplicate_comment(flag, user_id, desc, comment, frame=None):
    """Return the merged 'comment' indicator for a row; see ``_merge_duplicate_label``."""
    return _merge_duplicate_label(flag, user_id, desc, 'comment', comment, frame)


def de_duplicate_forward(flag, user_id, desc, forward, frame=None):
    """Return the merged 'forward' indicator for a row; see ``_merge_duplicate_label``."""
    return _merge_duplicate_label(flag, user_id, desc, 'forward', forward, frame)


def de_duplicate_thumb(flag, user_id, desc, thumb, frame=None):
    """Return the merged 'thumb' indicator for a row; see ``_merge_duplicate_label``."""
    return _merge_duplicate_label(flag, user_id, desc, 'thumb', thumb, frame)


def de_duplicate_detail(flag, user_id, desc, detail, frame=None):
    """Return the merged 'detail' indicator for a row; see ``_merge_duplicate_label``."""
    return _merge_duplicate_label(flag, user_id, desc, 'detail', detail, frame)


logger.info('-' * 5 + 'process comment' + '-' * 5)
# NOTE(review): everything after this point only sees the first PREVIEW_ROWS
# rows - this looks like a debugging limit; confirm before using the result
# as the full dataset.
PREVIEW_ROWS = 500
# .copy() detaches the subset from the larger frame so that later column
# assignments write to an independent DataFrame rather than to a slice.
df_final = df_final.head(PREVIEW_ROWS).copy()
df_final

2022-03-05 21:22:06,772 - interlist_final - INFO - -----process comment-----


Unnamed: 0,user_id,topic_id,desc,genre_id,introduction,comment,forward,thumb,detail,neg,follow,created_at,rep
0,0,0.0,如何应对“难搞”的人？\n\n回忆一下，你生命中遇到过的那些称得上是“难搞”的人，身上是不是...,,,1,0,0,0,0,comment,1635990068272,True
1,0,0.0,"Stop shout out the window, please. I am enough...",,,0,0,0,0,1,neg,1635998426910,False
2,0,1014.0,打开了新世界的大门，果然很解压😂\n先戳左半边，再戳右半边，美图xx拼图再贴纸描边（但只能贴...,33.0,“ 面对压力，你可以尝试读书、冥想、绘画、唱歌、聊天， 以及，有着迷之乐趣的——捏泡泡！”\...,0,0,0,1,0,detail,1636006093810,False
3,0,1014.0,小兔竟然get了跟我一样的点！\n我之前闲的无聊的时候就用戳泡泡当像素画来用，感觉跟玩拼图的...,33.0,“ 面对压力，你可以尝试读书、冥想、绘画、唱歌、聊天， 以及，有着迷之乐趣的——捏泡泡！”\...,0,0,0,1,0,detail,1636006096673,False
4,0,0.0,当知道他有了新恋情，感觉松了一口气\n以为是不再在乎了\n但昨天梦见他了，却有点不想醒过来\...,,,0,0,0,0,1,neg,1636011117859,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0,0.0,同事：你有过心动的男生吗？\n我：当然有过，不过后来就不觉得心动了。\n同事：那当时心动的原...,,,0,0,0,1,0,detail,1637101070209,False
496,0,0.0,麻了 麻的非常彻底\n其实是为了纪念一下我开天辟地第一回晚上放学回家能写的进去点作业的时候）...,,,0,0,0,0,1,neg,1637101316641,False
497,0,1024.0,1. 给月季花们进行第二次打药\n2. 周末前找到小伙伴一起玩剧本杀\n3. 本周阅读完红杉...,24.0,“有些事取决于你，有些事则不然。”\n\n失控往往带来令人暴躁的结果，活着的终极目标是「更好...,0,0,0,1,0,detail,1637102024261,False
498,0,1006.0,作为毛茸茸本茸，小兔当仁不让的来为大家介绍本期【一起翻翻乐】的共练活动啦！本期主题：摸摸身边...,33.0,“毛茸茸真是世界上最最最治愈的存在啦！”\n\n也许是一只可爱的小猫咪，一个毛绒玩偶熊，一条...,0,0,0,1,0,detail,1637102046342,False


In [41]:
# Each step rewrites one indicator column row by row with its de_duplicate_*
# function and then logs the banner announcing the next step (the 'comment'
# banner is logged before this cell runs, so the last step logs nothing).
dedup_steps = (
    ('comment', de_duplicate_comment, 'process forward'),
    ('forward', de_duplicate_forward, 'process thumb'),
    ('thumb', de_duplicate_thumb, 'process detail'),
    ('detail', de_duplicate_detail, None),
)
for label_column, dedup_func, next_banner in dedup_steps:
    df_final[label_column] = df_final.apply(
        lambda row: dedup_func(row['rep'], row['user_id'], row['desc'], row[label_column]), axis=1)
    if next_banner is not None:
        logger.info('-' * 5 + next_banner + '-' * 5)

2022-03-05 21:22:06,821 - interlist_final - INFO - -----process forward-----
2022-03-05 21:22:06,850 - interlist_final - INFO - -----process thumb-----
2022-03-05 21:22:06,876 - interlist_final - INFO - -----process detail-----


In [42]:
# Cast the four interaction indicator columns to int64, then display the frame.
for flag_column in ('detail', 'comment', 'forward', 'thumb'):
    df_final[flag_column] = df_final[flag_column].astype('int64')
df_final

Unnamed: 0,user_id,topic_id,desc,genre_id,introduction,comment,forward,thumb,detail,neg,follow,created_at,rep
0,0,0.0,如何应对“难搞”的人？\n\n回忆一下，你生命中遇到过的那些称得上是“难搞”的人，身上是不是...,,,1,0,0,1,0,comment,1635990068272,True
1,0,0.0,"Stop shout out the window, please. I am enough...",,,0,0,0,0,1,neg,1635998426910,False
2,0,1014.0,打开了新世界的大门，果然很解压😂\n先戳左半边，再戳右半边，美图xx拼图再贴纸描边（但只能贴...,33.0,“ 面对压力，你可以尝试读书、冥想、绘画、唱歌、聊天， 以及，有着迷之乐趣的——捏泡泡！”\...,0,0,0,1,0,detail,1636006093810,False
3,0,1014.0,小兔竟然get了跟我一样的点！\n我之前闲的无聊的时候就用戳泡泡当像素画来用，感觉跟玩拼图的...,33.0,“ 面对压力，你可以尝试读书、冥想、绘画、唱歌、聊天， 以及，有着迷之乐趣的——捏泡泡！”\...,0,0,0,1,0,detail,1636006096673,False
4,0,0.0,当知道他有了新恋情，感觉松了一口气\n以为是不再在乎了\n但昨天梦见他了，却有点不想醒过来\...,,,0,0,0,0,1,neg,1636011117859,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0,0.0,同事：你有过心动的男生吗？\n我：当然有过，不过后来就不觉得心动了。\n同事：那当时心动的原...,,,0,0,0,1,0,detail,1637101070209,False
496,0,0.0,麻了 麻的非常彻底\n其实是为了纪念一下我开天辟地第一回晚上放学回家能写的进去点作业的时候）...,,,0,0,0,0,1,neg,1637101316641,False
497,0,1024.0,1. 给月季花们进行第二次打药\n2. 周末前找到小伙伴一起玩剧本杀\n3. 本周阅读完红杉...,24.0,“有些事取决于你，有些事则不然。”\n\n失控往往带来令人暴躁的结果，活着的终极目标是「更好...,0,0,0,1,0,detail,1637102024261,False
498,0,1006.0,作为毛茸茸本茸，小兔当仁不让的来为大家介绍本期【一起翻翻乐】的共练活动啦！本期主题：摸摸身边...,33.0,“毛茸茸真是世界上最最最治愈的存在啦！”\n\n也许是一只可爱的小猫咪，一个毛绒玩偶熊，一条...,0,0,0,1,0,detail,1637102046342,False
