In [1]:
import numpy as np
import pandas as pd
from mlaas_tools2.db_tool import DatabaseConnections
from mlaas_tools2.config_info import ConfigPass
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import DateType, StructType, StructField, StringType, LongType, DoubleType
from pyspark.sql.functions import create_map, lit
from pyspark.sql.functions import expr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime,  timedelta
import os
# Widen pandas' display limits so long strings and long frames are shown in full.
pd.options.display.max_colwidth = 1000
pd.options.display.max_rows = 1000

# DB連線

In [2]:
# from mlaas_tools.config_build import config_set
# from mlaas_tools.config_info import ConfigPass
# from mlaas_tools.db_tool import DatabaseConnections
# # config_set()
# configs = ConfigPass()._configsection
# conns = DatabaseConnections(configs)
# # rawdata_conn = conns.get_rawdata_db_conn()
# # feature_conn = conns.get_feature_db_conn()
# raw_conn = conns.get_rawdata_db_conn()
# raw_cur = raw_conn.cursor()
# ftr_conn = conns.get_feature_db_conn()
# ftr_cur = ftr_conn.cursor()

# Spark 連線

In [3]:
# Spark / S3A settings for this session.
# The S3A access key, secret key and endpoint can be supplied through the
# S3A_ACCESS_KEY / S3A_SECRET_KEY / S3A_ENDPOINT environment variables so they
# do not have to be edited in (or committed with) the notebook. The previous
# literals are kept as fallbacks so existing runs behave exactly as before.
# NOTE(review): the fallback literals are still credentials in source —
# TODO remove them once the environment variables are provisioned.
default = {
    "spark.driver.memory": '16g',
    "fs.s3a.access.key": os.environ.get("S3A_ACCESS_KEY", "smartchannel"),
    "fs.s3a.secret.key": os.environ.get("S3A_SECRET_KEY", "smartchannel"),
    "fs.s3a.endpoint": os.environ.get("S3A_ENDPOINT", "http://10.240.205.23:9000"),
    "fs.s3a.connection.ssl.enabled": False,
    "fs.s3a.path.style.access": True,
}

# Build (or reuse) the session with every entry of `default` applied.
spark = SparkSession.builder.config(
    conf=SparkConf().setAppName("T").setAll(default.items())
).getOrCreate()

# Last expression of the cell: display the session summary.
spark

In [None]:
# Display the active SparkSession again (same expression as the end of the previous cell).
spark

# 從feature group 以及 layer1 讀資料

In [5]:
# Date window (yyyymmdd strings): training range and the single test day.
# NOTE: the same three names are assigned again in the next cells of this
# notebook; on a top-to-bottom run the last assignment wins.
train_start_date, train_end_date, test_date = '20211217', '20211225', '20211226'
# test_date = '20220131'

In [4]:
# Date window (yyyymmdd strings): training range and the single test day.
# NOTE: these names are also assigned in the neighbouring cells of this
# notebook; on a top-to-bottom run the last assignment wins.
train_start_date, train_end_date, test_date = '20211217', '20211226', '20211227'
# test_date = '20220131'

In [26]:
# Date window (yyyymmdd strings): training range and the single test day.
# NOTE: this is the last of the visible cells assigning these names, so these
# are the values in effect after a top-to-bottom run.
train_start_date, train_end_date, test_date = '20220227', '20220227', '20220228'
# test_date = '20220131'

## 1. item-subtag (sc_item)

In [17]:
# Layer-1 item table, restricted to the smart_channel service and to the
# inclusive window [train_start_date, test_date].
item_sdf = (
    spark.read
    .parquet("s3a://df-smart-channel/recsys-dataset/beta_v4/layer1/sc_item")
    .filter(
        (col('service') == 'smart_channel')
        & col("date").between(train_start_date, test_date)
    )
    # .select(['item_id', 'date','subtag_list'])
)

In [18]:
# One row per (item_id, date, subtag entry); the exploded element lands in the
# default output column named `col`.
item_subtag_exploded = item_sdf.select(
    F.col('item_id'),
    F.col('date'),
    F.explode(F.col('subtag_list')),
)

In [19]:
# Keep only subtags whose Chinese description starts with '03_', then flatten
# the three subtag fields next to item_id and date.
item_subtag_details = (
    item_subtag_exploded
    .filter(F.col('col.subtag_chinese_desc').startswith('03_'))
    .select(
        'item_id',
        'date',
        'col.subtag',
        'col.subtag_chinese_desc',
        'col.subtag_eng_desc',
    )
)

In [20]:
# Preview the first five rows of the flattened item/subtag frame.
item_subtag_details.show(5)

+--------------------+--------+--------------------+--------------------+------------------+
|             item_id|    date|              subtag| subtag_chinese_desc|   subtag_eng_desc|
+--------------------+--------+--------------------+--------------------+------------------+
|8236E47D-EC12-4B4...|20220319|{D4B0AD09-7BF0-49...|03_理財意圖[360標籤]|03_wealth_view_ind|
|8236E47D-EC12-4B4...|20220319|{198F529D-0928-4B...|03_投資意圖[360標籤]| 03_investment_ind|
|8236E47D-EC12-4B4...|20220319|{31E61F5A-CCDC-43...|           03_純存戶|   03_account_only|
|B105ECE3-B0EA-4F0...|20220319|{270BB666-1CC2-4F...|     03_行銀活躍用戶|  03_mobile_active|
|B105ECE3-B0EA-4F0...|20220319|{D4B0AD09-7BF0-49...|03_理財意圖[360標籤]|03_wealth_view_ind|
+--------------------+--------+--------------------+--------------------+------------------+
only showing top 5 rows



In [21]:
# List the distinct dates present after filtering (collected to the driver as Row objects).
item_subtag_details.select('date').distinct().collect()

[Row(date=20220320), Row(date=20220319)]

In [9]:
# Narrow the frame to the three columns that get exported.
item_subtag_net = item_subtag_details.select(
    F.col('item_id'),
    F.col('date'),
    F.col('subtag_eng_desc'),
)

In [10]:
# Preview the first five rows of the narrowed item/subtag frame.
item_subtag_net.show(5)

+--------------------+--------+------------------+
|             item_id|    date|   subtag_eng_desc|
+--------------------+--------+------------------+
|8236E47D-EC12-4B4...|20211219|03_wealth_view_ind|
|8236E47D-EC12-4B4...|20211219| 03_investment_ind|
|8236E47D-EC12-4B4...|20211219|   03_account_only|
|0266753B-F0CD-460...|20211219|03_wealth_view_ind|
|0266753B-F0CD-460...|20211219|     03_no_fc_cust|
+--------------------+--------+------------------+
only showing top 5 rows



In [11]:
# Item/subtag snapshots: one for the last training day, one for the test day.
item_subtag_net_train = item_subtag_net.where(F.col('date') == train_end_date)
item_subtag_net_test = item_subtag_net.where(F.col('date') == test_date)

In [12]:
# Materialise both item/subtag snapshots as pandas DataFrames on the driver.
item_subtag_df_train = item_subtag_net_train.toPandas()
# Fix: the test frame was previously built from the *train* slice (copy-paste
# slip), which left item_subtag_net_test unused and exported train-day tags
# under the test date.
item_subtag_df_test = item_subtag_net_test.toPandas()

In [39]:
def export_file(df, test_date, file_name, base_dir='..//..//..//data//preprocessed'):
    """Write ``df`` as a CSV file under a per-date directory.

    The file is written to ``<base_dir>//date=<test_date>/<file_name>`` and the
    resulting path is printed.

    Args:
        df: Object exposing ``to_csv(path, index=False)`` (a pandas DataFrame
            at the call sites in this notebook).
        test_date: Value interpolated into the ``date=<test_date>`` directory name.
        file_name: Name of the CSV file inside that directory.
        base_dir: Parent directory of the per-date directory. Defaults to the
            path that was previously hard-coded, so existing calls are unchanged.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
    """
    dir_name = f'date={test_date}'
    dir_path = f'{base_dir}//{dir_name}'
    # Create the *target* directory, including missing parents. The earlier
    # os.mkdir(dir_name) created `date=...` in the current working directory
    # instead, so the write below failed whenever dir_path did not exist yet.
    os.makedirs(dir_path, exist_ok=True)
    file_path = os.path.join(dir_path, file_name)
    print(file_path)
    df.to_csv(file_path, index=False)

In [41]:
# Persist the item/subtag frame under the test-date directory.
export_file(
    df=item_subtag_df_test,
    test_date=test_date,
    file_name='item_subtag.csv',
)

..//..//..//data//preprocessed//date=20211227/item_subtag.csv


## 2. user-item (context)

In [27]:
# Layer-1 context table for the inclusive window [train_start_date, test_date].
context_sdf = (
    spark.read
    .parquet("s3a://df-smart-channel/recsys-dataset/beta_v4/layer1/context")
    .filter(col("date").between(train_start_date, test_date))
)

In [43]:
# context_sdf_df = (
#     spark.read.parquet(
#         "/home/jovyan/df-smart-channel/recsys-dataset/beta_v2/layer1/context"
#     ).where(
#         (col("date")>=train_start_date) & (col("date")<=test_date)
#     )
# )

In [31]:
# Preview the first five rows of the flattened context frame.
# NOTE(review): flatten_clean_context_sdf is only assigned in a later cell of
# this notebook, so this cell fails on a fresh top-to-bottom run.
flatten_clean_context_sdf.show(5)

+--------------------+--------+--------------------+----------------------------+-----+----+--------+----------+-------+---------+----------+-------------+-------------+------------+-------------+-------------+----------+---------+-------+----+--------+-------+--------------------+----------------+-----------+--------------+---------------+---------------------+-----------+
|             item_id|    date|             cust_no|hits_eventinfo_eventcategory|click|show|main_tag|main_image|service|parent_id|product_id|button_link_1|button_link_2|product_link|button_text_1|button_text_2|main_title|sub_title|sub_tag|type|click_30|show_30|maintag_chinese_desc|maintag_eng_desc|click_ratio|show_by_person|click_by_person|click_by_person_ratio|subtag_list|
+--------------------+--------+--------------------+----------------------------+-----+----+--------+----------+-------+---------+----------+-------------+-------------+------------+-------------+-------------+----------+---------+-------+----+--

In [33]:
# Display the schema of a five-column projection; the result is not assigned.
# NOTE(review): flatten_clean_context_sdf is only assigned in a later cell of
# this notebook, so this cell depends on out-of-order execution.
flatten_clean_context_sdf.select(F.col('item_id'), F.col('cust_no'), F.col('date'),
                              F.col('click'), F.col('show')
                              )

DataFrame[item_id: string, cust_no: string, date: int, click: bigint, show: bigint]

In [34]:
# Flatten the raw context frame: explode item_click_list, keep smart_channel
# events, keep the earliest event per (cust_no, date, item_id), attach summed
# click/show counts and a per-(cust_no, date) click rank.
# NOTE(review): the output saved with this cell ends in an AnalysisException
# ('Cannot resolve column name "eventdatetime"'); confirm that the exploded
# item_click_list struct really carries visitdatetime/eventdatetime in the
# dataset version being read — TODO confirm.

# Drop context rows that have no click records
clean_context_sdf = context_sdf.where(col('item_click_list').isNotNull())
print('去除沒有點擊紀錄後context的數目:', clean_context_sdf.count())
# Explode item_click_list into one row per list element
clean_context_sdf = clean_context_sdf.select(clean_context_sdf.cust_no, 
                                             clean_context_sdf.date,
                                             F.explode(clean_context_sdf.item_click_list))
# Select columns: cust_no, date and every field of the exploded struct
flatten_clean_context_sdf = clean_context_sdf.select(F.col('cust_no'), F.col('date'), F.col('col.*'))
print('展開item後context的數目:', flatten_clean_context_sdf.count())
# Keep only smart_channel events
flatten_clean_context_sdf = flatten_clean_context_sdf.where(col('hits_eventinfo_eventcategory')=='smart_channel')
print('選 df_smart_channel後context的數目:', flatten_clean_context_sdf.count())
# Left-join with the item frame (its own click/show columns are dropped first)
flatten_clean_context_sdf = flatten_clean_context_sdf.join(item_sdf.drop('click', 'show'), on=['item_id', 'date'], how='left')
# Filter out public content (the filter below is commented out, so the count
# printed next is taken without it)
# flatten_clean_context_sdf = flatten_clean_context_sdf.where(col('service')!='public_content')
print('過濾掉公版後context的數目:', flatten_clean_context_sdf.count())
flatten_clean_context_sdf = flatten_clean_context_sdf.select(F.col('item_id'), F.col('cust_no'), F.col('date'),
                               F.col('visitdatetime'), F.col('eventdatetime'), F.col('click'), F.col('show')
                              )
# Find the first click or impression (the same item may be clicked several times in one day)
flatten_clean_context_sdf = flatten_clean_context_sdf.withColumn("day_order", F.row_number().over(Window.partitionBy(['cust_no', 'date', 'item_id']).orderBy(flatten_clean_context_sdf['eventdatetime'])))
# Total clicks and impressions of each customer on the same item within a day
cust_click_bydate = flatten_clean_context_sdf.groupby(['cust_no', 'date', 'item_id']).sum('click').withColumnRenamed('sum(click)', 'click')
cust_show_bydate = flatten_clean_context_sdf.groupby(['cust_no', 'date', 'item_id']).sum('show').withColumnRenamed('sum(show)', 'show')
# Keep the earliest event per (cust_no, date, item_id), then attach the daily totals
flatten_clean_context_sdf = flatten_clean_context_sdf.where(col('day_order')==1).drop('click', 'show', 'day_order')
flatten_clean_context_sdf = flatten_clean_context_sdf.join(cust_click_bydate, on=['cust_no', 'date', 'item_id'], how='left')
flatten_clean_context_sdf = flatten_clean_context_sdf.join(cust_show_bydate, on=['cust_no', 'date', 'item_id'], how='left')
# Click ranking: rank only rows with clicks (author's note: 15674 customers had clicks; max rank was 2)
click_sdf = flatten_clean_context_sdf.where(col('click')>0)
click_sdf = click_sdf.withColumn("rank", F.row_number().over(Window.partitionBy(['cust_no', 'date']).orderBy(click_sdf['eventdatetime'])))
flatten_clean_context_sdf = flatten_clean_context_sdf.join(click_sdf.select('click', 'cust_no', 'date', 'item_id', 'rank'), on=['click', 'cust_no', 'date', 'item_id'], how='left')
print('Total Context number:', flatten_clean_context_sdf.count())

去除沒有點擊紀錄後context的數目: 277585
展開item後context的數目: 582249
選 df_smart_channel後context的數目: 445448
過濾掉公版後context的數目: 445448


AnalysisException: Cannot resolve column name "eventdatetime" among (item_id, cust_no, date, click, show);

In [None]:
# Split by date: everything up to and including train_end_date is training
# data; only rows dated exactly test_date form the test set.
context_sdf_train = flatten_clean_context_sdf.where(F.col('date') <= train_end_date)
context_sdf_test = flatten_clean_context_sdf.where(F.col('date') == test_date)

In [None]:
# Distinct customers seen in the training window, collected to the driver as Row objects.
uniqueUsersObserved = (
    context_sdf_train
    .select(F.col('cust_no'))
    .distinct()
    .collect()
)

In [None]:
# Plain Python list of the customer ids held in the collected rows.
unique_users_list = [row['cust_no'] for row in uniqueUsersObserved]

In [None]:
# unique_users_list

In [56]:
# Drop test rows whose customer was never observed in training.
# NOTE(review): isin() embeds the whole Python list in the query plan; a
# left-semi join against the training customers may scale better — TODO evaluate.
context_sdf_test = context_sdf_test.where(F.col('cust_no').isin(unique_users_list))

In [57]:
# Collect both context frames to the driver as pandas DataFrames.
# NOTE(review): the output saved with this cell is a Py4JNetworkError (lost
# connection to the Java server) — possibly the driver ran out of memory while
# collecting; TODO confirm before relying on this step.
context_df_train = context_sdf_train.toPandas()
context_df_test = context_sdf_test.toPandas()

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1207, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1212, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:46523)
Traceback (most recent call last):
  File "/usr/local/spark/python/pyspark/sql/dataframe.py", line 596, in collec

Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:46523)

In [None]:
# Persist the train and test context frames under the test-date directory.
export_file(df=context_df_train, test_date=test_date, file_name='context_train.csv')
export_file(df=context_df_test, test_date=test_date, file_name='context_test.csv')

## 3. user-subtag (user)

In [None]:
# Layer-1 user table for the inclusive window [train_end_date, test_date],
# narrowed to the customer key, the date and the exported feature columns.
# NOTE(review): this path reads beta_v2 while the item and context cells read
# beta_v4 — confirm this is intended.
user_sdf = (
    spark.read
    .parquet("s3a://df-smart-channel/recsys-dataset/beta_v2/layer1/user")
    .filter(col("date").between(train_end_date, test_date))
    #col("date") == train_end_date | col("date") == test_date
    .drop('click', 'show')
    .select(
        'cust_no', 'date',
        'mobile_login_90', 'dd_my', 'dd_md',
        'onlymd_ind', 'onlycc_ind', 'efingo_card_ind',
        'cl_cpa_amt', 'fc_ind',
    )
)
# .filter(user_sdf.date.isin(uniqueUsersInContext))
print('User Sdf數目:', user_sdf.count())

In [None]:
# Rebuild the plain list of customer ids from the collected rows
# (same expression as in the context section of this notebook).
unique_users_list = [row['cust_no'] for row in uniqueUsersObserved]

In [None]:
# Keep only users whose cust_no appears in unique_users_list.
user_sdf_sub = user_sdf.where(F.col('cust_no').isin(unique_users_list))

In [None]:
# user_df_train = user_sdf.filter(user_sdf.date == train_end_date).toPandas()
# Collect the test-day user features to the driver as a pandas DataFrame.
user_df_test = (
    user_sdf_sub
    .where(F.col('date') == test_date)
    .toPandas()
)

In [None]:
# Persist the test-day user features under the test-date directory.
export_file(
    df=user_df_test,
    test_date=test_date,
    file_name='user_subtag.csv',
)