# Import & Load

In [None]:
import learning2read
learning2read.reload_all()
from learning2read.b04 import *
Data = DataMgr(PATH_MAC['data'], verbose=1)
File = PathMgr(PATH_MAC['cache'])

In [None]:
raw_train = Data('btrain')
raw_test = Data('btest')
raw_implicit = Data('implicit')
raw_user = Data('user')
raw_book = Data('book')
raw_dataset = {
    'raw_train' : raw_train,
    'raw_test' : raw_test,
    'raw_implicit' : raw_implicit,
    'raw_user' : raw_user,
    'raw_book' : raw_book,
}

In [None]:
m = 1
raw_dataset_truncated = {
    'raw_train' : raw_train.sample(int(5000*m)),
    'raw_test' : raw_test.sample(int(2500*m)),
    'raw_implicit' : raw_implicit.sample(int(10000*m)),
    'raw_user' : raw_user.sample(int(5000*m)),
    'raw_book' : raw_book.sample(int(5000*m)),
}

# Proc40072

In [None]:
proc40072 = [
    {
        'class' : 'learning2read.preprocessing.TotalDataFrame',
        'output' : 'df_total',
        'input_data' : ['raw_train', 'raw_test', 'raw_implicit'],
    },
    {
        'class' : 'learning2read.preprocessing.CleanRawUser',
        'output' : 'user_info',
        'input_data' : ['df_total', 'raw_user'],
        'na_policy' : 'median',
    },
    {
        'class' : 'learning2read.preprocessing.CleanRawBook',
        'output' : 'book_info',
        'input_data' : ['df_total', 'raw_book'],
        'na_policy' : 'median',
    },
    {
        'class' : 'learning2read.preprocessing.UserRatingSqueeze',
        'output' : 'user_rating',
        'input_data' : 'df_total',
        'filter_num' : 3,
        'statistics' : ['num','quantile11','mean','mode','std','skew','kurtosis'],
#         'statistics' : ['num','mean','mode','std','skew','kurtosis'],
        'na_policy' : 'median',
    },
    {
        'class' : 'learning2read.preprocessing.UserPadding',
        'output' : 'user_rating',
        'input_data' : ['df_total', 'user_rating'],
        'isna_name' : 'User_no_book',
        'na_policy' : 'median',
    },
    {
        'class' : 'learning2read.preprocessing.BookRatingSqueeze',
        'output' : 'book_rating',
        'input_data' : 'df_total',
        'filter_num' : 3,
        'statistics' : ['num','quantile11','mean','mode','std','skew','kurtosis'],
#         'statistics' : ['num','mean','mode','std','skew','kurtosis'],
        'na_policy' : 'median',
    },
    {
        'class' : 'learning2read.preprocessing.BookPadding',
        'output' : 'book_rating',
        'input_data' : ['df_total', 'book_rating'],
        'isna_name' : 'ISBN_no_user',
        'na_policy' : 'median',
    },
    {
        'class'  : 'learning2read.b04.BookVectorPow2AutoEncoder',
        'output' : 'book_vector',
        'input_data' : 'df_total',
        'domain_filter_num' : 2, # book with >=2 users
        'codomain_filter_num' : 400, # user with >=400 books
        'param' : {
            'code_length' : 32, 
            'activation' : 'SELU', 
            'solver' : 'Adam', 
            'learning_rate' : 0.01,
            'epochs' : 20,
            'random_state' : 1,
        },
    },
    {
        'class'  : 'learning2read.b04.UserVectorPow2AutoEncoder',
        'output' : 'user_vector',
        'input_data' : 'df_total',
        'domain_filter_num' : 2,
        'codomain_filter_num' : 200,
        'param' : {
            'code_length' : 32, 
            'activation' : 'SELU', 
            'solver' : 'Adam', 
            'learning_rate' : 0.01,
            'epochs' : 20,
            'random_state' : 1,
        },
    },
    {
        'class'  : 'learning2read.preprocessing.UserBookTable',
        'output' : 'df_total_features', # (X,y={-1,0,1,2,...,10})
        'input_data' : ['df_total', 'user_rating', 'book_rating', 'user_info', 'book_info', 'user_vector', 'book_vector'],
        'na_policy' : None, # should fill it before training
    },
]

# CV Procs

In [121]:
# reload
import learning2read
learning2read.reload_all()
from learning2read.b04 import *

K_fold = 5

FG = FileGen(
    save=lambda pid,data:save_pickle(File("proc40072_5fold_%d"%(pid)), data),
    raw_dataset=raw_dataset_truncated,
    proc=proc40072,
    K_fold=K_fold,
    seed_fold=1
)

from multiprocessing import Pool
pool = Pool()
pool.map_async(FG.cv_preapre, range(K_fold))
# p0=FG.cv_preapre(4)
# p0

<multiprocessing.pool.MapResult at 0x1a21ed2080>

In [122]:
File()

['.DS_Store',
 'track1_lgbm_tuned_3_058754.csv',
 't',
 'track1_lgbm_tuned_2_059275.csv',
 'P6_6.pk',
 't1.csv',
 'track1_lgbm_tuned.csv',
 'p4004_done',
 'df_total_selected']

# DEV Code (Old)

In [97]:
# P0 = ProcValidation.run(
#     raw_dataset_truncated,
#     proc40072,
#     0,
#     5,
#     1,
#     verbose = True
# )['output']

run_id(1): {'class': 'learning2read.preprocessing.CleanRawUser', 'output': 'user_info', 'input_data': ['df_total', 'raw_user'], 'na_policy': 'median'}
run_id(2): {'class': 'learning2read.preprocessing.CleanRawBook', 'output': 'book_info', 'input_data': ['df_total', 'raw_book'], 'na_policy': 'median'}
run_id(3): {'class': 'learning2read.preprocessing.UserRatingSqueeze', 'output': 'user_rating', 'input_data': 'df_total', 'filter_num': 3, 'statistics': ['num', 'quantile11', 'mean', 'mode', 'std', 'skew', 'kurtosis'], 'na_policy': 'median'}
run_id(4): {'class': 'learning2read.preprocessing.UserPadding', 'output': 'user_rating', 'input_data': ['df_total', 'user_rating'], 'isna_name': 'User_no_book', 'na_policy': 'median'}
run_id(5): {'class': 'learning2read.preprocessing.BookRatingSqueeze', 'output': 'book_rating', 'input_data': 'df_total', 'filter_num': 3, 'statistics': ['num', 'quantile11', 'mean', 'mode', 'std', 'skew', 'kurtosis'], 'na_policy': 'median'}
run_id(6): {'class': 'learning2r

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [None]:
# P = Procedure(proc40072,verbose=True)
# P.load_data(raw_dataset_truncated)
# # P.load_data(raw_dataset)
# P.run_id(0)
# def validation_set(P, i=0, K=5):
#     df = P.var['df_total']
#     df_train = df.loc[df['Book-Rating']>0, :]
#     N = df_train.shape[0]
#     idx_valid = IndexFold(K, N, 1)[i]
#     idx_valid = df_train.iloc[idx_valid, :].index
#     y_valid = df.loc[idx_valid, 'Book-Rating']
#     df_sub = df.iloc[:, :]
#     df_sub.loc[idx_valid, 'Book-Rating'] = -2
#     P.var['df_total'] = df_sub.iloc[:, :]
#     return (P, y_valid)
# P, y_valid = validation_set(P, 1, 5)

# P.run()

# RF = learning2read.preprocessing.RowFilter
# df_train = RF.run(P.var['df_total_features'],r"lambda df:df['Book-Rating']>0",)['output']
# df_valid = RF.run(P.var['df_total_features'],r"lambda df:df['Book-Rating']==-2",)['output']
# df_valid.loc[y_valid.index, 'Book-Rating'] = y_valid

# P.var['df_total_features']

In [92]:
# {
#     'class' : 'learning2read.b04.ProcValidation',
#     'output' : 'P',
#     'input_data' : 'raw_dataset',
#     'proc' : proc40072, # run_id(0) -> hold y_valid -> run all
#     'K_fold' : 5,
#     'i_fold' : 0,
#     'seed_fold' : 1,
# }
# # PV1.load_data({'raw_dataset': raw_dataset})
# # PV1.var['P'].var['df_train']
# # PV1.var['P'].var['df_valid']
# pass

In [None]:
import homework
from homework import *
reload(homework)
pass