# Imports

In [325]:
# Session setup: reload the project package, pull its public names into the
# notebook namespace, then build the two path helpers used by later cells.
import learning2read
# NOTE(review): presumably re-imports the package's submodules so that source
# edits are picked up without restarting the kernel - confirm in learning2read.
learning2read.reload_all()
# DataMgr, PathMgr and PATH_MAC are not imported explicitly anywhere in this
# notebook; presumably they come from this star import - TODO confirm.
from learning2read.b04 import *
# `Data` is called with a short dataset key (e.g. Data('btrain')) in the
# "Load Data" cell below.
Data = DataMgr(PATH_MAC['data'], verbose=1)
# `File` is built from the cache path; it is not used in the visible cells.
File = PathMgr(PATH_MAC['cache'])

# Load Data

In [326]:
# Load the five raw tables once, keyed by the names the procedure steps refer
# to in their 'input_data' entries. The dict literal evaluates its values top
# to bottom, so the files are read in the order listed.
raw_dataset = {
    'raw_train' : Data('btrain'),
    'raw_test' : Data('btest'),
    'raw_implicit' : Data('implicit'),
    'raw_user' : Data('user'),
    'raw_book' : Data('book'),
}

# Also expose each table under its own variable name for direct use in cells.
raw_train = raw_dataset['raw_train']
raw_test = raw_dataset['raw_test']
raw_implicit = raw_dataset['raw_implicit']
raw_user = raw_dataset['raw_user']
raw_book = raw_dataset['raw_book']

/Users/qtwu/Downloads/data/book_ratings_train.csv
/Users/qtwu/Downloads/data/book_ratings_test.csv
/Users/qtwu/Downloads/data/implicit_ratings.csv
/Users/qtwu/Downloads/data/users.csv
/Users/qtwu/Downloads/data/books.csv


## Truncated Data For Testing

In [327]:
# Down-sampled copies of the raw tables for quick end-to-end test runs.
# `m` scales every sample size at once.
m = 1
# Fixed seed so the truncated dataset is identical on every run; without it
# each execution of this cell draws a different subset.
sample_seed = 1


def _sample_rows(frame, n_rows, seed=sample_seed):
    """Return up to `n_rows` randomly chosen rows of `frame`, reproducibly.

    The request is capped at len(frame) because sampling without replacement
    raises when asked for more rows than the table holds (which can happen
    once `m` is increased).

    Assumes `frame` is a pandas DataFrame (it must support `len()` and
    `.sample(n, random_state=...)`) - TODO confirm against DataMgr.
    """
    return frame.sample(min(n_rows, len(frame)), random_state=seed)


raw_dataset_truncated = {
    'raw_train' : _sample_rows(raw_train, 5000*m),
    'raw_test' : _sample_rows(raw_test, 2500*m),
    'raw_implicit' : _sample_rows(raw_implicit, 10000*m),
    'raw_user' : _sample_rows(raw_user, 5000*m),
    'raw_book' : _sample_rows(raw_book, 5000*m),
}

# Procedure Definition

## Preprocessing

In [328]:
# Preprocessing procedure for experiment 4007.
#
# Every step is a plain dict: 'class' is the dotted path of the step to run,
# 'input_data' names the variable(s) it reads, 'output' names the variable it
# writes, and any remaining keys are step-specific options.
# The builders below return a fresh dict (and fresh nested list/dict) on every
# call, so no two steps share mutable state.

def _clean_raw_step(class_name, output, raw_key):
    """Step that cleans one raw side table (users or books)."""
    return {
        'class' : 'learning2read.preprocessing.' + class_name,
        'output' : output,
        'input_data' : ['df_total', raw_key],
        'na_policy' : 'median',
    }


def _rating_squeeze_step(class_name, output):
    """Step that computes per-user or per-book rating statistics."""
    return {
        'class' : 'learning2read.preprocessing.' + class_name,
        'output' : output,
        'input_data' : 'df_total',
        'filter_num' : 3,
        'statistics' : ['num','quantile11','mean','mode','std','skew','kurtosis'],
        # alternative without the quantile features:
        # 'statistics' : ['num','mean','mode','std','skew','kurtosis'],
        'na_policy' : 'median',
    }


def _padding_step(class_name, rating_key, isna_name):
    """Padding step; it reads `rating_key` and writes back to the same key."""
    return {
        'class' : 'learning2read.preprocessing.' + class_name,
        'output' : rating_key,
        'input_data' : ['df_total', rating_key],
        'isna_name' : isna_name,
        'na_policy' : 'median',
    }


def _pow2_autoencoder_step(class_name, output, codomain_filter_num):
    """Auto-encoder embedding step; both encoders use the same hyper-parameters."""
    return {
        'class' : 'learning2read.b04.' + class_name,
        'output' : output,
        'input_data' : 'df_total',
        'domain_filter_num' : 2,
        'codomain_filter_num' : codomain_filter_num,
        'param' : {
            'code_length' : 32,
            'activation' : 'SELU',
            'solver' : 'Adam',
            'learning_rate' : 0.01,
            'epochs' : 20,
            'random_state' : 1,
        },
    }


proc4007 = [
    {
        'class' : 'learning2read.preprocessing.TotalDataFrame',
        'output' : 'df_total',
        'input_data' : ['raw_train', 'raw_test', 'raw_implicit'],
    },
    _clean_raw_step('CleanRawUser', 'user_info', 'raw_user'),
    _clean_raw_step('CleanRawBook', 'book_info', 'raw_book'),
    _rating_squeeze_step('UserRatingSqueeze', 'user_rating'),
    _padding_step('UserPadding', 'user_rating', 'User_no_book'),
    _rating_squeeze_step('BookRatingSqueeze', 'book_rating'),
    _padding_step('BookPadding', 'book_rating', 'ISBN_no_user'),
    # domain: book with >=2 users; codomain: user with >=400 books
    _pow2_autoencoder_step('BookVectorPow2AutoEncoder', 'book_vector', 400),
    _pow2_autoencoder_step('UserVectorPow2AutoEncoder', 'user_vector', 200),
    {
        'class'  : 'learning2read.preprocessing.UserBookTable',
        'output' : 'df_total_features', # (X,y={-1,0,1,2,...,10})
        'input_data' : [
            'df_total',
            'user_rating', 'book_rating',
            'user_info', 'book_info',
            'user_vector', 'book_vector',
        ],
        'na_policy' : None, # should fill it before training
    },
]

## Learn & Output

In [323]:
# Learn & output steps appended to the preprocessing procedure:
# split the feature table into train/test rows, fit the LightGBM regressor on
# the train rows, then produce the two submission outputs from the test rows.
# NOTE(review): 'func' is lambda *source text*; presumably RowFilter evaluates
# it and keeps the rows where it is True - confirm in learning2read.
_learn_output_steps = [
    {
        'class'  : 'learning2read.preprocessing.RowFilter',
        'output' : 'df_train',
        'input_data' : 'df_total_features',
        'func' : r"lambda df : df['Book-Rating']>0",
    },
    {
        'class'  : 'learning2read.b04.LightGBMRegressor',
        'output' : 'model_lgbm',
        'input_data' : 'df_train',
        'param' : { # tune by linux10, used in 4006
            'num_leaves' : 149,
            'learning_rate' : 0.220460,
            'n_estimators' : 121,
            'min_child_samples' : 49,
            'seed' : 1,
        }
    },
    {
        'class'  : 'learning2read.preprocessing.RowFilter',
        'output' : 'df_test',
        'input_data' : 'df_total_features',
        'func' : r"lambda df : df['Book-Rating']<0",
    },
    {
        'class'  : 'learning2read.submission.Track1',
        'output' : 'track1',
        'input_data' : ['df_test', 'model_lgbm'],
    },
    {
        'class'  : 'learning2read.submission.Track2',
        'output' : 'track2',
        'input_data' : ['df_test', 'model_lgbm'],
    },
]

# A bare proc4007.extend(...) appends the same five steps again every time this
# cell is re-run. Only add the steps that are not in the procedure yet, so the
# cell is idempotent while the first run behaves exactly as before.
proc4007.extend([step for step in _learn_output_steps if step not in proc4007])

## [DEV] Preprocessing Proc

In [235]:
# NOTE(review): disabled development variant of proc4007, kept for reference.
# Lines with a single '#' form a reduced procedure (TotalDataFrame and
# CleanRawBook only); lines with '# #' were already disabled inside it.
# Uncommenting this cell would REPLACE the full proc4007 defined above.
# proc4007 = [
#     {
#         'class' : 'learning2read.preprocessing.TotalDataFrame',
#         'output' : 'df_total',
#         'input_data' : ['raw_train', 'raw_test', 'raw_implicit'],
#     },
# #     {
# #         'class' : 'learning2read.preprocessing.CleanRawUser',
# #         'output' : 'user_info',
# #         'input_data' : ['df_total', 'raw_user'],
# #         'na_policy' : 'median',
# #     },
#     {
#         'class' : 'learning2read.preprocessing.CleanRawBook',
#         'output' : 'book_info',
#         'input_data' : ['df_total', 'raw_book'],
#         'na_policy' : 'median',
#     },
# #     {
# #         'class'  : 'learning2read.preprocessing.UserBookTable',
# #         'output' : 'df_total_features', # (X,y={-1,0,1,2,...,10})
# #         'input_data' : ['df_total', 'book_info'],
# #         'na_policy' : None, # should fill it before training
# #     },
# ]

## DataFrame View of proc4007

In [329]:
# Tabular overview of the procedure: one row per step, showing only which
# class runs, what it reads and what it writes.
pd.DataFrame(proc4007).loc[:, ['class', 'input_data', 'output']]

Unnamed: 0,class,input_data,output
0,learning2read.preprocessing.TotalDataFrame,"[raw_train, raw_test, raw_implicit]",df_total
1,learning2read.preprocessing.CleanRawUser,"[df_total, raw_user]",user_info
2,learning2read.preprocessing.CleanRawBook,"[df_total, raw_book]",book_info
3,learning2read.preprocessing.UserRatingSqueeze,df_total,user_rating
4,learning2read.preprocessing.UserPadding,"[df_total, user_rating]",user_rating
5,learning2read.preprocessing.BookRatingSqueeze,df_total,book_rating
6,learning2read.preprocessing.BookPadding,"[df_total, book_rating]",book_rating
7,learning2read.b04.BookVectorPow2AutoEncoder,df_total,book_vector
8,learning2read.b04.UserVectorPow2AutoEncoder,df_total,user_vector
9,learning2read.preprocessing.UserBookTable,"[df_total, user_rating, book_rating, user_info...",df_total_features


# Run

In [330]:
# reload
# Re-import the project package before running. The order matters:
# reload_all() must come before the imports that follow it.
# NOTE(review): presumably this makes the star import and `Procedure` pick up
# freshly edited source - confirm what reload_all() actually reloads.
import learning2read
learning2read.reload_all()
from learning2read.b04 import *

from learning2read.proc import Procedure
# Build the procedure from the step list and feed it the raw tables.
P = Procedure(proc4007,verbose=True)
# Quick run on the down-sampled tables; swap the two load_data lines below to
# run on the full dataset instead.
P.load_data(raw_dataset_truncated)
# P.load_data(raw_dataset)
# Executes the steps; the outputs are later read from P.var (see "Submit").
P.run()

run_id(0): {'class': 'learning2read.preprocessing.TotalDataFrame', 'output': 'df_total', 'input_data': ['raw_train', 'raw_test', 'raw_implicit']}
run_id(1): {'class': 'learning2read.preprocessing.CleanRawUser', 'output': 'user_info', 'input_data': ['df_total', 'raw_user'], 'na_policy': 'median'}
run_id(2): {'class': 'learning2read.preprocessing.CleanRawBook', 'output': 'book_info', 'input_data': ['df_total', 'raw_book'], 'na_policy': 'median'}
run_id(3): {'class': 'learning2read.preprocessing.UserRatingSqueeze', 'output': 'user_rating', 'input_data': 'df_total', 'filter_num': 3, 'statistics': ['num', 'quantile11', 'mean', 'mode', 'std', 'skew', 'kurtosis'], 'na_policy': 'median'}
run_id(4): {'class': 'learning2read.preprocessing.UserPadding', 'output': 'user_rating', 'input_data': ['df_total', 'user_rating'], 'isna_name': 'User_no_book', 'na_policy': 'median'}
run_id(5): {'class': 'learning2read.preprocessing.BookRatingSqueeze', 'output': 'book_rating', 'input_data': 'df_total', 'filte

last_done_proc_id = 9
   len(proc_list) = 10
var : dict_keys(['raw_train', 'raw_test', 'raw_implicit', 'raw_user', 'raw_book', 'df_total', 'user_info', 'book_info', 'user_rating', 'book_rating', 'book_vector', 'user_vector', 'df_total_features'])

# Submit

In [9]:
# Write the Track 1 predictions to a submission CSV (no header, no index).
Doc = PathMgr(r"~/mlfinal/")
# 'track1' only exists in P.var after the Track1 step of the "Learn & Output"
# section has been run; fail with an actionable message instead of a bare
# KeyError when the procedure was run with the preprocessing steps only.
if 'track1' not in P.var:
    raise KeyError(
        "'track1' is not in P.var: add the Learn & Output steps to proc4007 "
        "and run the procedure before writing the submission file"
    )
t1 = pd.DataFrame(P.var['track1'])
# pandas documents `header` and `index` as booleans; pass False explicitly
# rather than relying on None being treated as falsy.
t1.to_csv(Doc('t1_4007_use_4006_lgbm.csv'), header=False, index=False)

# Homework Package :p

In [2]:
# Load (and refresh) the helper package used by the markdown/MathJax cells.
import importlib

import homework
# Reload BEFORE the star import: names bound by `from homework import *` keep
# pointing at the objects of the module as it was when the import ran, so
# importing first and reloading afterwards would leave stale names in the
# notebook namespace. importlib.reload is used explicitly because `reload` is
# not a builtin in Python 3.
importlib.reload(homework)
from homework import *

<IPython.core.display.Javascript object>

$\def \t {\text}
\def \sp {\space}
\def \spc {\space\space\space}
\def \spe {\space\space\space\space\space}
\def \bc {\because}
\def \A {\mathcal{A}}
\def \H {\mathcal{H}}
\def \m {\mathcal{m}}
\def \R {\mathbb{R}}
\def \N {\mathbb{N}}
\def \Z {\mathbb{Z}}
\def \Q {\mathbb{Q}}
\def \C {\mathbb{C}}
\def \say {\underbrace}
\def \L {\mathbb{L}}
\def \P {\mathbb{P}}
\def \F {\mathbb{F}}
\def \sbc {\spc\because}
\def \sbe {\spe\because}
\def \defeq {\overset{def}{=}}
\def \ctr {\rightarrow\leftarrow}
\def \st {\t{ such that }}
\def \inner#1 #2{\langle #1 \mid #2 \rangle}
\def \norm#1{\lVert #1 \rVert}
\def \bm#1 {\begin{bmatrix} #1 \end{bmatrix}}
\def \pt {\partial}
\def \ppt#1 #2{\frac{\pt #1}{\pt #2}}
\def\l {\mathscr{l}}
\def \vphi {\varphi}$
