In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from sklearn.model_selection import KFold

In [3]:
from fastai.callbacks.tracker import *
from fastai.text import *
from fastai.tabular import *
from fastai_tab_text import *

In [4]:
def reset_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and,
    when CUDA is available, every CUDA device), and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    random.seed(seed)
    # NOTE: the interpreter reads PYTHONHASHSEED only at start-up, so this does
    # not change string hashing in the running kernel; it is only inherited by
    # interpreters started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmark mode on, the cuDNN auto-tuner may select different
    # kernels from run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
reset_seed()

In [5]:
mercari_path = Path('data/')

In [7]:
from fastai.utils.collect_env import show_install
show_install()



```text
=== Software === 
python        : 3.7.1
fastai        : 1.0.51
fastprogress  : 0.1.19
torch         : 1.0.0
nvidia driver : 410.104
torch cuda    : 9.0.176 / is available
torch cudnn   : 7401 / is enabled

=== Hardware === 
nvidia gpus   : 1
torch devices : 1
  - gpu0      : 8116MB | GeForce GTX 1080

=== Environment === 
platform      : Linux-4.15.0-47-generic-x86_64-with-debian-stretch-sid
distro        : #50~16.04.1-Ubuntu SMP Fri Mar 15 16:06:21 UTC 2019
conda env     : python37
python        : /home/quantran/anaconda3/envs/python37/bin/python
sys.path      : 
/home/quantran/kwon/kaggle/mercari
/home/quantran/anaconda3/envs/python37/lib/python37.zip
/home/quantran/anaconda3/envs/python37/lib/python3.7
/home/quantran/anaconda3/envs/python37/lib/python3.7/lib-dynload
/home/quantran/anaconda3/envs/python37/lib/python3.7/site-packages
/home/quantran/anaconda3/envs/python37/lib/python3.7/site-packages/IPython/extensions
/home/quantran/.ipython
```

Please make sure to include 

# Prepare data

In [6]:
def preprocess_text_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Derive category levels and a single combined text field from raw listing columns.

    ``df`` is modified in place: ``category_name`` is overwritten with its
    space-joined form, and ``category1``..``category3`` and ``text`` are added.
    The returned frame is a column subset of that modified frame.

    Args:
        df: Frame with the columns ``name``, ``category_name``, ``brand_name``,
            ``item_description``, ``shipping``, ``item_condition_id`` and ``price``.

    Returns:
        Frame with the columns ``category1``, ``category2``, ``category3``,
        ``brand_name``, ``text``, ``shipping``, ``item_condition_id``, ``price``.
        Missing or empty category levels are NaN; ``brand_name`` keeps its
        original missing values (only the text field uses the 'No brand' filler).
    """
    # A missing category becomes '//' so that it splits into three empty levels.
    raw_category = df['category_name'].fillna('//')

    for level in range(3):
        # Pad the split result so a category with fewer than three '/'-separated
        # levels yields missing values instead of raising IndexError.
        part = raw_category.apply(lambda x, i=level: (x.split('/') + ['', ''])[i].strip())
        df[f'category{level + 1}'] = part.where(part != '', np.nan)

    joined = raw_category.apply(lambda x: ' '.join(x.split('/')).strip())
    # let this info in when concatenating text for RNN
    df['category_name'] = joined.where(joined != '', 'No category')

    df_bn_fillna = df['brand_name'].fillna('No brand')
    df['text'] = (df['name'].fillna('No name') + '. ' + df_bn_fillna + '. ' +
                  df['category_name'] + '. ' + df['item_description'].fillna('No description'))
    return df[['category1','category2','category3','brand_name', 'text', 'shipping', 'item_condition_id','price']]

def preprocess_all(sample=None):
    """Load the train and test TSVs, preprocess them jointly and split them back apart.

    Training rows whose price is not strictly positive are dropped. Test rows
    are given a NaN ``price`` so they can be told apart again after both sets
    are concatenated and passed through ``preprocess_text_cols`` together.

    Args:
        sample: Optional int ``n``. When truthy, only the training rows labelled
            ``0..n-1`` are kept, reordered by a seed-42 permutation. This is
            the first ``n`` rows shuffled, not a random subset of the whole
            training set, and it reseeds NumPy's global RNG as a side effect.

    Returns:
        ``(train_df, test_df)``; ``test_df`` has its ``price`` column removed.
    """
    train = pd.read_table(mercari_path/'train.tsv').drop('train_id',axis=1)
    # Move ``price`` to the last column so train and test share a column order.
    price = train.price.values
    train=train.drop('price',axis=1)
    train['price']=price

    test = pd.read_table(mercari_path/'test_stg2.tsv').drop('test_id',axis=1)
    # ``np.nan`` is the canonical spelling; the ``np.NAN`` alias was removed in NumPy 2.0.
    test['price'] = np.nan
    train = train[train['price'] > 0].reset_index(drop=True)
    all_df = pd.concat([train,test],axis=0).reset_index(drop=True)
    # Release the separate frames before the memory-heavy text preprocessing.
    del train
    del test
    gc.collect()

    all_df = preprocess_text_cols(all_df)
    # Training rows are exactly those with a known price.
    train_df = all_df[~all_df.price.isnull()]
    test_df = all_df[all_df.price.isnull()]
    del all_df
    gc.collect()

    if sample:
        np.random.seed(42)
        sample = np.random.permutation(sample)
        train_df = train_df.loc[sample].reset_index(drop=True)

    test_df= test_df.drop('price',axis=1)
    return train_df,test_df

def preprocess_train(sample=None):
    """Load ``train.tsv`` and run it through ``preprocess_text_cols``.

    Unlike ``preprocess_all``, no rows are filtered on price here.

    Args:
        sample: Optional int ``n``. When truthy, only the rows labelled
            ``0..n-1`` are kept, reordered by a seed-42 permutation
            (NumPy's global RNG is reseeded as a side effect).

    Returns:
        The preprocessed training frame.
    """
    raw = pd.read_table(mercari_path/'train.tsv').drop('train_id',axis=1)

    # Re-attach ``price`` so that it ends up as the last column.
    target = raw.price.values
    raw = raw.drop('price',axis=1)
    raw['price'] = target

    if sample:
        np.random.seed(42)
        row_order = np.random.permutation(sample)
        raw = raw.loc[row_order].reset_index(drop=True)

    return preprocess_text_cols(raw)
def get_val_idxs(train,n_splits=20):
    """Return the train/validation row positions of the first shuffled K-fold split.

    Args:
        train: Data to split; only passed to ``KFold.split``.
        n_splits: Number of folds; the validation part is one fold of the data.

    Returns:
        ``(train_idxs, valid_idxs)`` for the first fold of a ``KFold`` created
        with ``shuffle=True`` and ``random_state=42``.
    """
    np.random.seed(42)  # also reseeds NumPy's global RNG as a side effect
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    first_fold = next(splitter.split(train))
    return first_fold[0], first_fold[1]

In [7]:
n=1482535 # train shape

In [8]:
gc.collect()

7

In [13]:
# train_df,test_df = preprocess_all(int(0.01*n))

# train_df.shape,test_df.shape

In [9]:
# train_df = preprocess_train(int(0.01*n))

In [8]:
train_df = preprocess_train()
train_df.shape

(1482535, 8)

In [20]:
train_df.columns

Index(['category1', 'category2', 'category3', 'brand_name', 'text', 'shipping',
       'item_condition_id', 'price'],
      dtype='object')

In [10]:
# NOTE: this cell is not idempotent - running it a second time applies log1p
# again to the already log-transformed prices.
train_df.price = np.log1p(train_df['price']) # so we can use MSE in NN

In [23]:
train_df.head()

Unnamed: 0,category1,category2,category3,brand_name,text,shipping,item_condition_id,price
0,Beauty,Makeup,Lips,SeneGence,LipSense bundle FOR Moomoo's sister ;). SeneGe...,0,1,4.094345
1,Women,Shoes,Pumps,,Silver Prom Shoes. No brand. Women Shoes Pumps...,1,2,2.772589
2,Women,Pants,Casual Pants,Eileen Fisher,Eileen Fisher organic cotton pants 1X. Eileen ...,0,1,4.204693
3,Women,Tops & Blouses,T-Shirts,Adidas,Adorable Adidas Crop Top White for Karen. Adid...,0,2,3.044522
4,Women,Tops & Blouses,"Tank, Cami",Brandy Melville,Brandy Melville Front-Tie Tank Top. Brandy Mel...,0,1,2.564949


In [27]:
train_df.text.head()[1]

'Silver Prom Shoes. No brand. Women Shoes Pumps. Only worn once, about a 3 inch heel'

In [11]:
# Column roles for the tabular + text model.
cat_names=['category1','category2','category3','brand_name','shipping']
# Everything that is neither categorical, the target ('price') nor the text
# column is treated as continuous. Iterating over the DataFrame columns keeps
# the order stable; a set difference would order the names by string hash,
# which can differ from one kernel to the next.
cont_names= [col for col in train_df.columns if col not in cat_names and col not in ('price','text')]
print(f'# of continuous feas: {len(cont_names)}')
print(f'# of categorical feas: {len(cat_names)}')
dep_var = 'price'
procs = [FillMissing,Categorify, Normalize]

txt_cols=['text']

# Sanity check: every column is categorical, continuous, the target or the text column.
len(cat_names) + len(cont_names) + 2 == train_df.shape[1]

# of continuous feas: 1
# of categorical feas: 5


True

In [12]:
train_idxs,val_idxs = get_val_idxs(train_df,n_splits=20)
train_idxs,val_idxs
train_idxs.shape,val_idxs.shape

(array([    0,     1,     2,     3, ..., 14821, 14822, 14823, 14824]),
 array([   19,    27,    47,    99, ..., 14766, 14777, 14791, 14815]))

((14083,), (742,))

In [17]:
def get_tabulartext_databunch(bs=100,val_idxs=val_idxs,path=mercari_path):
    """Build the mixed tabular + text DataBunch from the notebook-level ``train_df``.

    Reads the globals ``train_df``, ``cat_names``, ``cont_names``, ``txt_cols``,
    ``procs`` and ``dep_var``. The vocabulary is taken from the saved
    language-model data ``data_lm.pkl`` under ``path``.

    Args:
        bs: Batch size.
        val_idxs: Row indices used as the validation set. The default is bound
            when this function is defined, so re-run this cell after
            recomputing the global ``val_idxs``.
        path: Directory holding ``data_lm.pkl``; also passed as the item list path.

    Returns:
        The DataBunch produced by the ``TabularTextList`` pipeline.
    """
    lm_data = load_data(path, 'data_lm.pkl', bs=bs)
    pad_collate = partial(mixed_tabular_pad_collate, pad_idx=1, pad_first=True)
    reset_seed()

    items = TabularTextList.from_df(train_df, cat_names, cont_names, txt_cols,
                                    vocab=lm_data.vocab, procs=procs, path=path)
    split_items = items.split_by_idx(val_idxs)
    labelled = split_items.label_from_df(cols=dep_var)
    # A test set is not attached here; the disabled step was:
    # labelled.add_test(TabularTextList.from_df(test_df, cat_names, cont_names, txt_cols,path=path))
    return labelled.databunch(bs=bs,collate_fn=pad_collate, no_check=False)

In [14]:
# Name of the saved encoder that both learner factories below load via
# ``load_encoder`` (presumably the fine-tuned language-model encoder - confirm).
encoder_name = 'bs60-awdlstm-enc-stage2'
def get_tabtext_lr_find(data,params,seed=42):
    """Create a tabular+text learner without the model-saving callback.

    Args:
        data: DataBunch passed to ``tabtext_learner``.
        params: Extra keyword arguments forwarded to ``tabtext_learner``.
        seed: Seed passed to ``reset_seed`` before the learner is built.

    Returns:
        The learner in fp32, with the encoder named by ``encoder_name`` loaded.
    """
    reset_seed(seed)
    learner = tabtext_learner(
        data,
        AWD_LSTM,
        metrics=[root_mean_squared_error],
        **params
    )
    # The encoder is loaded while the learner is in fp16, then it is cast back.
    learner = learner.to_fp16()
    learner.load_encoder(encoder_name)
    return learner.to_fp32()

def get_tabulartext_learner(data,params,seed=42):
    """Create the tabular+text learner used for training.

    A ``SaveModelCallback`` is attached that monitors
    ``root_mean_squared_error`` in 'min' mode and saves under the name
    'best_nn' on every improvement.

    Args:
        data: DataBunch passed to ``tabtext_learner``.
        params: Extra keyword arguments forwarded to ``tabtext_learner``.
        seed: Seed passed to ``reset_seed`` before the learner is built.

    Returns:
        The learner in fp32, with the encoder named by ``encoder_name`` loaded.
    """
    reset_seed(seed)
    save_best = partial(SaveModelCallback, monitor='root_mean_squared_error',
                        mode='min', every='improvement', name='best_nn')
    learner = tabtext_learner(data, AWD_LSTM,
                              metrics=[root_mean_squared_error],
                              callback_fns=[save_best],
                              **params)
    learner = learner.to_fp16()  # because the language model is trained in fp16
    learner.load_encoder(encoder_name)
    return learner.to_fp32()

# Training - stage 1 (train head)

In [15]:
# Keyword arguments forwarded to ``tabtext_learner`` (via **params) by the
# learner factories. The per-key notes are assumptions about
# ``fastai_tab_text.tabtext_learner`` - TODO confirm against that module.
params={
    # presumably the hidden layer sizes of the tabular part - confirm
    'layers':[500,400,200],
    'bptt':70,
    # 20 * bptt = 1400; presumably the maximum number of tokens kept per text - confirm
    'max_len':20*70,
    'drop_mult': 1., # drop_mult: multiply to different dropouts in AWD LSTM
    # presumably sizes / dropouts of the linear layers on top of the RNN output - confirm
    'lin_ftrs': [300],
    'ps_lin_ftrs': [0],
    'ps': [0.001,0,0],
    'emb_drop': 0.,
    # NOTE(review): the target is log1p(price); if y_range bounds the model
    # output, an upper bound of 6 caps predictions at expm1(6) ~= 402 in
    # original price units - confirm that no training target exceeds it.
    'y_range': [0,6],
    'use_bn': True,    
}
# Batch size used when building the DataBunch.
bs=100

## Get tabular text databunch

In [18]:
data = get_tabulartext_databunch(bs=bs)

In [19]:
data.show_batch()

category1,category2,category3,brand_name,shipping,item_condition_id,target
#na#,#na#,#na#,FOREVER 21,0,0.1012,3.218876
Vintage & Collectibles,Serving,Mug,#na#,0,-1.0018,2.8332133
Women,Dresses,Knee-Length,H&M,1,-1.0018,2.8332133
Beauty,Skin Care,Face,Origins,0,-1.0018,2.7080503
Women,Sweaters,Crewneck,PINK,0,1.2042,3.0910425


text,target
"xxeos(3) xxup(6) purple(287) listing(262) xxup(6) iced(4084) xxup(6) 0(217) xxup(6) to(24) xxup(6) n't(118) ,(10) xxrep(7) purple(287) listing(262) ,(10) xxup(6) brand(12) sugar(1104) ,(10) xxup(6) skinny(331) use(215) iced(4084) 0(217) ,(10) xxup(6) home(57) they(148) ,(10) xxup(6) not(74) shipping(41)",3.218876
"xxeos(3) xxup(6) tag(270) xxup(6) xs(273) xxup(6) swamp(8561) yoga(743) ,(10) xxup(6) brand(12) and(13) ,(10) xxup(6) cell(143) the(17) xxup(6) 100(175) xxup(6) stored(2245) xxup(6) yoga(743) ,(10) xxup(6) and(13) in(22) xxup(6) makeup(54) of(28) -(18) soon(1521) ask(197) pink(38) t(70) got(595) ,(10) :(31) daily(517) up(114) true(499) /(29) -(18) xxrep(7) lancôme(3278) box(99) too(211) slip(647) generation(1607) ,(10) xxup(6) 39(2926) no(11) :(31) really(458) came(1270) up(114) it(34) athletic(66) men(64) lips(258) no(11) a(20) sand(2126) with(25) household(1997) size(23) women(14) see(200) strap(489) pink(38) t(70) me(93) keep(600) 's(26) 6(88) brown(289) ,(10) xxup(6) do(109) pink(38) accessories(49) wax(1118) size(23) are(48) lines(1876) :(31) tags(105) deals(945) with(25) light(178) it(34) ](42) fits(224) #(134) ;(414) please(94) come(360) size(23) fees(1432)",2.8332133
"xxeos(3) xxup(6) casual(987) month(854) 18(390) so(125) &(16) &(16) xxup(6) to(24) shoes(61) women(14) ,(10) xxrep(7) loss(1717) xxup(6) !(15) xxup(6) been(171) xxup(6) phone(226) for(19) xxup(6) excellent(204) ,(10) xxup(6) casual(987) month(854) 18(390) so(125) &(16) &(16) xxup(6) to(24) shoes(61) women(14) shipped(493) save(177) ,(10) xxup(6) have(77) /(29) includes(240) to(24) ,(10) xxup(6) -(18) shipped(493) to(24) of(28) athletic(66) bluetooth(866) skirt(431) ,(10) xxup(6) let(413)",2.8332133
"xxeos(3) xxup(6) finished(4118) nordstrom(1763) jenner(1630) 2016(1321) firm(120) ,(10) xxup(6) finished(4118) ,(10) xxup(6) rm(45) xxup(6) back(168) xxup(6) skin(167) xxup(6) just(133) ,(10) xxup(6) in(22) will(69)",2.7080503
"xxeos(3) xxup(6) suede(795) ((39) ,(10) xxrep(7) ((39) ,(10) xxup(6) !(15) xxup(6) both(264) xxup(6) suede(795) ,(10) xxup(6)",3.0910425


## Get tabular text learner

In [16]:
learn = get_tabulartext_learner(data,params,seed=42).to_fp32()
gc.collect()

153

In [17]:
learn.model

SequentialMultipleInput(
  (0): MultiBatchMixEncoder(
    (module): AWD_LSTM(
      (encoder): Embedding(33781, 400, padding_idx=1)
      (encoder_dp): EmbeddingDropout(
        (emb): Embedding(33781, 400, padding_idx=1)
      )
      (rnns): ModuleList(
        (0): WeightDropout(
          (module): LSTM(400, 1150, batch_first=True)
        )
        (1): WeightDropout(
          (module): LSTM(1150, 1150, batch_first=True)
        )
        (2): WeightDropout(
          (module): LSTM(1150, 400, batch_first=True)
        )
      )
      (input_dp): RNNDropout()
      (hidden_dps): ModuleList(
        (0): RNNDropout()
        (1): RNNDropout()
        (2): RNNDropout()
      )
    )
  )
  (1): PoolingLinearTabularTextClassifier(
    (rnn_lin_layers): Sequential(
      (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Dropout(p=0.4)
      (2): Linear(in_features=1200, out_features=300, bias=True)
      (3): ReLU(inplace)
      (4): Bat

In [18]:
# train on all data from training. Took a whole night
learn.fit_one_cycle(6,max_lr=1e-02,pct_start=0.3,moms=(0.8,0.7))

epoch,train_loss,valid_loss,root_mean_squared_error,time
0,0.308574,0.297962,0.543225,1:08:28
1,0.322636,0.325656,0.565951,1:08:32
2,0.308093,0.320362,0.561940,1:08:28
3,0.301868,0.287178,0.532738,1:08:28
4,0.287716,0.280896,0.527114,1:08:32
5,0.283715,0.277816,0.523942,1:08:35


Better model found at epoch 0 with root_mean_squared_error value: 0.543224573135376.
Better model found at epoch 3 with root_mean_squared_error value: 0.5327377915382385.
Better model found at epoch 4 with root_mean_squared_error value: 0.5271139144897461.
Better model found at epoch 5 with root_mean_squared_error value: 0.5239419341087341.


In [19]:
# learn.save('full2-stage1')

# Training - stage 2 (unfreeze and train all layers)

In [None]:
# _=learn.load('full2-stage1')

In [20]:
# learn.unfreeze()
# learn.fit_one_cycle(2,max_lr=slice(?,?),pct_start=0.3,moms=(0.8,0.7))

# learn.save('full2-unfreeze')

# Test prediction

In [20]:
# # not enough memory for this task
# params={
#     'layers':[500,400,200],
#     'bptt':70,
#     'max_len':20*70,
#     'drop_mult': 1., # drop_mult: multiply to different dropouts in AWD LSTM
#     'lin_ftrs': [300],
#     'ps_lin_ftrs': [0],
#     'ps': [0.001,0,0],
#     'emb_drop': 0.,
#     'y_range': [0,6],
#     'use_bn': True,    
# }
# bs=100

# data = get_tabulartext_databunch(bs=bs)

# learn = get_tabulartext_learner(data,params,seed=42).to_fp32()
# gc.collect()

# _=learn.load('full2-stage1')

# test_pred=np.squeeze(to_np(learn.get_preds(DatasetType.Test)[0]))