In [1]:
import numpy as np
import pandas as pd
import math
import os
from sklearn.preprocessing import LabelEncoder
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 200)

In [2]:
# Interaction log used by every cell below. The file name suggests it spans
# 12/17 through 01/31; the shorter 1217-1230 extract is kept for reference only.
# context = pd.read_csv('..//data/preprocessed//context_1217_1230.csv')
context_csv = '/home/jovyan/df-smart-channel/graph/data/preprocessed/context_1217_0131.csv'
context = pd.read_csv(context_csv)

In [3]:
# Map raw customer / item identifiers to dense integer codes.
# One encoder is fitted per column so that both mappings stay available for
# `inverse_transform`; refitting a single shared encoder on the items would
# discard the customer mapping.
cust_encoder = LabelEncoder()
item_encoder = LabelEncoder()
context['cust_no'] = cust_encoder.fit_transform(context['cust_no'])
context['item_id'] = item_encoder.fit_transform(context['item_id'])
# Backward-compatible alias: `le` previously ended up fitted on `item_id`.
le = item_encoder

In [4]:
# Distinct encoded customer ids and item ids present in the log.
user_list, item_list = (context[column].unique() for column in ('cust_no', 'item_id'))

In [5]:
# Number of entries in `user_list` (distinct customers when run in cell order).
user_list.shape[0]

576702

## Splitting

In [6]:
# Split boundaries as YYYYMMDD integers, the same form that is compared
# against `context.date` when building the train / test windows.
train_start, train_end = 20211217, 20211225
test_end = 20211226
test_date = test_end

In [7]:
## old version
# def splitting(user_list, train_start, test_date):
#     context_train = context_click[(context_click.date >= train_start) & (context_click.date < test_date)][['cust_no', 'item_id', 'show', 'click']]
#     context_test = context_click[context_click.date == test_date][['cust_no', 'item_id', 'show', 'click']]
    
#     cust_item_pair_train = context_train.groupby(['cust_no', 'item_id'])['click'].sum().reset_index()
#     cust_item_pair_test = context_test.groupby(['cust_no', 'item_id'])['click'].sum().reset_index()
    
#     # cust_item_pair_train = cust_item_pair_train[cust_item_pair_train.click > 0]
#     # cust_item_pair_test = cust_item_pair_test[cust_item_pair_test.click > 0]
    
#     trainUniqueUsers = set(cust_item_pair_train.cust_no.unique())
#     # testUniqueUsers = set(cust_item_pair_test.cust_no.unique())
#     cust_item_pair_test = cust_item_pair_test[cust_item_pair_test.cust_no.isin(trainUniqueUsers)]
    
#     items_by_cust_train = cust_item_pair_train.groupby('cust_no')['item_id'].unique()
#     items_by_cust_test = cust_item_pair_test.groupby('cust_no')['item_id'].unique()
    
#     cust_items_dict_train = dict(items_by_cust_train.apply(list))
#     cust_items_dict_test = dict(items_by_cust_test.apply(list))
#     return cust_items_dict_train, cust_items_dict_test

In [7]:
# Keep `user_list` as the array of distinct customer ids. Binding it to the
# whole `context` frame would silently change what `len(user_list)` reports
# (rows instead of customers) for any cell executed afterwards.
user_list = context.cust_no.unique()
train_start = 20211217
test_date = 20211226

In [8]:
def _unique_pairs(rows):
    """Return the distinct (cust_no, item_id) pairs of `rows`, sorted by both keys.

    Rows with a missing key are dropped, which is what grouping by the two
    columns would keep as well.
    """
    return (
        rows[['cust_no', 'item_id']]
        .dropna()
        .drop_duplicates()
        .sort_values(['cust_no', 'item_id'])
        .reset_index(drop=True)
    )


def _items_by_customer(pairs):
    """Collapse (cust_no, item_id) rows into ``{cust_no: [item_id, ...]}``."""
    return dict(pairs.groupby('cust_no')['item_id'].unique().apply(list))


def splitting(user_list, train_start, test_date, data=None):
    """Build per-customer item lists for a date-based train / test split.

    The training window is ``train_start <= date < test_date``; the test set is
    the single day ``date == test_date``, restricted to customers that occur in
    the training window. Every logged row is used, regardless of its `click`
    value.

    Parameters
    ----------
    user_list : any
        Unused; kept so existing callers keep working.
    train_start : int
        First date (inclusive) of the training window, comparable to `date`.
    test_date : int
        Test day; also the exclusive upper bound of the training window.
    data : pandas.DataFrame, optional
        Frame with `cust_no`, `item_id` and `date` columns. Defaults to the
        notebook-level `context` frame.

    Returns
    -------
    (dict, dict)
        ``(train, test)`` dictionaries mapping each customer id to the list of
        distinct item ids seen in the respective window, ordered by item id.
        Every training customer has a train entry (its list holds NaN when no
        valid item id exists); customers without test rows have no test entry.
    """
    if data is None:
        data = context  # notebook-level interaction log

    in_train = (data['date'] >= train_start) & (data['date'] < test_date)
    # `.loc` with a mask returns a fresh frame, so later transforms never
    # write into a view of `data` (no SettingWithCopyWarning).
    train_rows = data.loc[in_train, ['cust_no', 'item_id']]
    train_users = train_rows['cust_no'].unique()

    in_test = (data['date'] == test_date) & data['cust_no'].isin(train_users)
    test_rows = data.loc[in_test, ['cust_no', 'item_id']]

    # Left merge keeps one entry per training customer even if none of its
    # rows carries a usable item id.
    train_pairs = pd.DataFrame({'cust_no': train_users}).merge(
        _unique_pairs(train_rows), on='cust_no', how='left'
    )
    # Test rows are already limited to training customers, so the distinct
    # pairs can be used directly; item ids are returned as integers.
    test_pairs = _unique_pairs(test_rows).astype({'item_id': int})

    return _items_by_customer(train_pairs), _items_by_customer(test_pairs)

In [9]:
def _write_interactions(file_path, pairs_dict):
    """Write one ``<cust_no> <item_id> ...`` line per customer; return the line count.

    NaN item ids are skipped, so a customer without any valid item still gets
    a line that holds only its id. Every line is newline-terminated.
    """
    counts = 0
    # The context manager closes the handle even if a write fails.
    with open(file_path, 'w', encoding='utf-8') as file:
        for key, value in pairs_dict.items():
            fields = ['%s' % key]
            fields.extend('%s' % int(item) for item in value if not math.isnan(item))
            file.write(' '.join(fields) + '\n')
            counts += 1
    return counts


def export_file(date, pairs_dict_train, pairs_dict_test):
    """Write the train / test dictionaries to ``date=<date>/train.txt`` and ``test.txt``.

    Parameters
    ----------
    date : str
        Label used to build the output directory name ``date=<date>``
        (relative to the current working directory).
    pairs_dict_train, pairs_dict_test : dict
        Mapping of customer id to an iterable of numeric item ids.

    For each file the target path and the number of written lines are printed.
    Existing files are overwritten.
    """
    dir_name = f'date={date}'
    os.makedirs(dir_name, exist_ok=True)

    outputs = (('train.txt', pairs_dict_train), ('test.txt', pairs_dict_test))
    for file_name, pairs_dict in outputs:
        file_path = os.path.join(dir_name, file_name)
        # Report the path itself rather than the repr of the file object.
        print(f'write to file path: {file_path}')
        print(_write_interactions(file_path, pairs_dict))

In [None]:
# Export one train/test split per evaluation day. Stepping through YYYYMMDD
# integers with `range` only yields valid dates while all of them share a month.
first_test_day, last_test_day = 20211226, 20211231
for test_date in range(first_test_day, last_test_day + 1):
    cust_items_dict_train, cust_items_dict_test = splitting(user_list, 20211217, test_date)
    export_file(str(test_date), cust_items_dict_train, cust_items_dict_test)

In [10]:
# Sanity check for a single evaluation day: how many customers land in each
# side of the split (the export itself is deliberately skipped here).
test_date = 20211226
cust_items_dict_train, cust_items_dict_test = splitting(user_list, 20211217, test_date)
# export_file(date = str(test_date), pairs_dict_train = cust_items_dict_train, pairs_dict_test = cust_items_dict_test )
len_tr = len(cust_items_dict_train)
len_te = len(cust_items_dict_test)
print(f'{len_tr}\n{len_te}\n---')

450820
54045
---


In [11]:
# NOTE(review): hard-coded counts. 8852 matches the first train count printed by
# the older, commented-out loop at the end of the notebook, and
# 441968 == 450820 - 8852 (450820 being the train count printed above) —
# presumably clicking vs. non-clicking customers; confirm before relying on it.
8852/441968

0.020028599355609456

In [9]:
# For every evaluation day: build the split, write it to disk and report the
# number of customers on each side. The integer `range` over YYYYMMDD values
# is only valid because all days fall in the same month.
for test_date in range(20211226, 20211231 + 1):
    cust_items_dict_train, cust_items_dict_test = splitting(user_list, 20211217, test_date)
    export_file(str(test_date), cust_items_dict_train, cust_items_dict_test)
    len_tr, len_te = len(cust_items_dict_train), len(cust_items_dict_test)
    print(len_tr, len_te, '---', sep='\n')

write to file path: <_io.TextIOWrapper name='date=20211226/train.txt' mode='w' encoding='UTF-8'>
450820
write to file path: <_io.TextIOWrapper name='date=20211226/test.txt' mode='w' encoding='UTF-8'>
54045
450820
54045
---
write to file path: <_io.TextIOWrapper name='date=20211227/train.txt' mode='w' encoding='UTF-8'>
471569
write to file path: <_io.TextIOWrapper name='date=20211227/test.txt' mode='w' encoding='UTF-8'>
89054
471569
89054
---
write to file path: <_io.TextIOWrapper name='date=20211228/train.txt' mode='w' encoding='UTF-8'>
499850
write to file path: <_io.TextIOWrapper name='date=20211228/test.txt' mode='w' encoding='UTF-8'>
99623
499850
99623
---
write to file path: <_io.TextIOWrapper name='date=20211229/train.txt' mode='w' encoding='UTF-8'>
528515
write to file path: <_io.TextIOWrapper name='date=20211229/test.txt' mode='w' encoding='UTF-8'>
98595
528515
98595
---
write to file path: <_io.TextIOWrapper name='date=20211230/train.txt' mode='w' encoding='UTF-8'>
553810
writ

In [12]:
# for test_date in range(20211226, 20211231+1):
#     cust_items_dict_train, cust_items_dict_test = splitting(user_list = user_list, train_start = 20211217, test_date = test_date) 
#     len_tr = len(list(cust_items_dict_train.keys()))
#     len_te = len(list(cust_items_dict_test.keys()))
#     print(len_tr)
#     print(len_te)
#     print('---')

8852
50
---
9606
105
---
10964
124
---
12372
114
---
13693
64
---
14486
47
---
