In [None]:
!nvidia-smi

In [1]:
import torch
import os
import pandas as pd

from deepmatcher.data.custom_dataset import DeepMatcherDataset
from deepmatcher.data.encoder.text_encoders import HFTextEncoder
from deepmatcher.optim import Optimizer

from torch.utils.data import DataLoader
from deepmatcher.models.core import MatchingModel

In [2]:
dataset_dir = "sample_data/itunes-amazon/"
train_df = pd.read_csv(os.path.join(dataset_dir, 'train.csv'))
print('train:', train_df.shape)
val_df = pd.read_csv(os.path.join(dataset_dir, 'validation.csv'))
print('val:', val_df.shape)
test_df = pd.read_csv(os.path.join(dataset_dir, 'train.csv'))
print('test:', test_df.shape)

train_df.head()

train: (323, 18)
val: (108, 18)
test: (323, 18)


Unnamed: 0,id,label,left_Song_Name,left_Artist_Name,left_Album_Name,left_Genre,left_Price,left_CopyRight,left_Time,left_Released,right_Song_Name,right_Artist_Name,right_Album_Name,right_Genre,right_Price,right_CopyRight,right_Time,right_Released
0,448,0,Baby When the Light ( David Guetta & Fred Rist...,David Guetta,Pop Life ( Extended Version ) [ Bonus Version ],"Dance , Music , Rock , Pop , House , Electroni...",$ 1.29,‰ ãÑ 2007 Gum Records,6:17,18-Sep-07,Revolver ( Madonna Vs. David Guetta Feat . Lil...,David Guetta,One Love ( Deluxe Version ),Dance & Electronic,$ 1.29,( C ) 2014 Swedish House Mafia Holdings Ltd ( ...,3:18,"August 21 , 2009"
1,287,1,Outversion,Mark Ronson,Version,"Pop , Music , R&B / Soul,Soul,Dance,Rock,Jazz,...",$ 0.99,2007 Mark Ronson under exclusive license to SO...,1:50,10-Jul-07,Outversion,Mark Ronson,Version [ Explicit ],Pop,$ 0.99,( c ) 2011 J'adore Records,1:50,"July 10 , 2007"
2,534,0,Peer Pressure ( feat . Traci Nelson ),Snoop Dogg,Doggumentary,"Hip-Hop/Rap , Music , Rock , Gangsta Rap , Wes...",$ 1.29,"‰ ãÑ 2011 Capitol Records , LLC . All rights r...",4:07,29-Mar-11,Boom ( ( Feat . T-Pain ) [ Edited ] ),Snoop Dogg,Doggumentary [ Edited ],"Rap & Hip-Hop , West Coast",$ 1.29,"( C ) 2011 Capitol Records , LLC",3:50,"March 29 , 2011"
3,181,1,Stars Come Out ( Tim Mason Remix ),Zedd,Stars Come Out ( Remixes ) - EP,"Dance , Music , Electronic , House",$ 1.29,2012 Dim Mak Inc.,5:49,20-May-14,Stars Come Out ( Dillon Francis Remix ),Zedd,Stars Come Out [ Dillon Francis Remix ],Dance & Electronic,$ 1.29,2012 Dim Mak Inc.,4:08,"May 20 , 2014"
4,485,0,Jump ( feat . Nelly Furtado ),Flo Rida,R.O.O.T.S. ( Deluxe Version ),"Hip-Hop/Rap , Music",$ 1.29,‰ ãÑ 2009 Atlantic Recording Corporation for t...,3:28,30-Mar-09,"Yayo [ Feat . Brisco , Billy Blue , Ball Greez...",Flo Rida,R.O.O.T.S. ( Route Of Overcoming The Struggle ...,Rap & Hip-Hop,$ 1.29,"( C ) 2012 Motown Records , a Division of UMG ...",7:53,"March 30 , 2009"


In [3]:
text_transformer = HFTextEncoder('sentence-transformers/paraphrase-albert-small-v2', max_length=16, truncation=True, padding='max_length', trainable=True)

In [4]:
label_column = 'label'
text_columns = ['Song_Name', 'Artist_Name', 'Album_Name', 'Genre', 'Price', 'CopyRight', 'Time', 'Released']
image_col = ''
tokenizer = text_transformer.tokenizer

train_dataset = DeepMatcherDataset(
    data_df=train_df,
    label_col=label_column,
    text_cols=text_columns,
    image_col=image_col,
    tokenizer=tokenizer
)

val_dataset = DeepMatcherDataset(
    data_df=val_df,
    label_col=label_column,
    text_cols=text_columns,
    image_col=image_col,
    tokenizer=tokenizer
)

test_dataset = DeepMatcherDataset(
    data_df=test_df,
    label_col=label_column,
    text_cols=text_columns,
    image_col=image_col,
    tokenizer=tokenizer
)


received data_df : (323, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 323 entries, 0 to 322
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 323 non-null    int64 
 1   label              323 non-null    int64 
 2   left_Song_Name     323 non-null    object
 3   left_Artist_Name   323 non-null    object
 4   left_Album_Name    323 non-null    object
 5   left_Genre         323 non-null    object
 6   left_Price         323 non-null    object
 7   left_CopyRight     323 non-null    object
 8   left_Time          323 non-null    object
 9   left_Released      312 non-null    object
 10  right_Song_Name    323 non-null    object
 11  right_Artist_Name  323 non-null    object
 12  right_Album_Name   323 non-null    object
 13  right_Genre        323 non-null    object
 14  right_Price        323 non-null    object
 15  right_CopyRight    323 non-null    object
 16  right_Time     

In [5]:
for batch in train_dataset:
    break
batch

{'attrs': defaultdict(dict,
             {'Song_Name': {'left_': {'input_ids': tensor([   2, 1578,   76,   14,  471,   13,    5,  684, 4835, 8232,  279, 4250,
                         761, 9959, 5039,    3]),
                'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])},
               'right_': {'input_ids': tensor([    2, 18497,    13,     5, 12257,  4611,     9,   684,  4835,  8232,
                         9806,    13,     9,  9672,    13,     3]),
                'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}},
              'Artist_Name': {'left_': {'input_ids': tensor([   2,  684, 4835, 8232,    3,    0,    0,    0,    0,    0,    0,    0,
                           0,    0,    0,    0]),
                'token_type_ids': tensor([0, 0, 0, 0

In [6]:
batch['attrs']['Song_Name']['left_']['input_ids'].shape

torch.Size([16])

In [7]:
model = MatchingModel(
    text_encoder=text_transformer,
    attr_summarizer=None,
    attr_condense_factor='auto',
    attr_comparator='concat',
    attr_merge='concat',
    classifier='2-layer-highway',
    hidden_size=300
    )

In [8]:
model

MatchingModel(
  (text_encoder): HFTextEncoder(
    (model): AlbertModel(
      (embeddings): AlbertEmbeddings(
        (word_embeddings): Embedding(30000, 128, padding_idx=0)
        (position_embeddings): Embedding(512, 128)
        (token_type_embeddings): Embedding(2, 128)
        (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0, inplace=False)
      )
      (encoder): AlbertTransformer(
        (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
        (albert_layer_groups): ModuleList(
          (0): AlbertLayerGroup(
            (albert_layers): ModuleList(
              (0): AlbertLayer(
                (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (attention): AlbertAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True)
                

In [9]:
init_batch = next(iter(DataLoader(train_dataset, batch_size=2)))['attrs'] ## draw a batch to initialize the lazy model 
model.initialize(train_dataset, init_batch=init_batch)
model



MatchingModel(
  (text_encoder): HFTextEncoder(
    (model): AlbertModel(
      (embeddings): AlbertEmbeddings(
        (word_embeddings): Embedding(30000, 128, padding_idx=0)
        (position_embeddings): Embedding(512, 128)
        (token_type_embeddings): Embedding(2, 128)
        (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0, inplace=False)
      )
      (encoder): AlbertTransformer(
        (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
        (albert_layer_groups): ModuleList(
          (0): AlbertLayerGroup(
            (albert_layers): ModuleList(
              (0): AlbertLayer(
                (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (attention): AlbertAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True)
                

In [12]:
optimizer = Optimizer(lr=1e-3, lr_decay=1e-1, start_decay_at=3)

In [11]:
model.run_train(
    train_dataset,
    val_dataset,
    best_save_path='best_mode.pth',
    batch_size=16,
    epochs=15,
    
    )

* Number of trainable parameters: 12960586
===>  TRAIN Epoch 1


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:08


Finished Epoch 1 || Run Time:    8.4 | Load Time:    0.3 || F1:  21.57 | Prec:  44.00 | Rec:  14.29 || Ex/s:  37.26

===>  EVAL Epoch 1


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 1 || Run Time:    2.2 | Load Time:    0.2 || F1:  51.43 | Prec:  81.82 | Rec:  37.50 || Ex/s:  44.72

Updating Learning Rate: 8.000e-04 Acc: 51.42856979370117 Epoch: 1
* Best F1: tensor(51.4286, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 2


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 2 || Run Time:    6.5 | Load Time:    0.3 || F1:  67.20 | Prec:  87.50 | Rec:  54.55 || Ex/s:  47.40

===>  EVAL Epoch 2


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 2 || Run Time:    2.2 | Load Time:    0.1 || F1:  65.22 | Prec:  68.18 | Rec:  62.50 || Ex/s:  46.78

Updating Learning Rate: 6.400e-04 Acc: 65.2173843383789 Epoch: 2
* Best F1: tensor(65.2174, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 3


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 3 || Run Time:    6.6 | Load Time:    0.3 || F1:  82.43 | Prec:  85.92 | Rec:  79.22 || Ex/s:  46.69

===>  EVAL Epoch 3


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 3 || Run Time:    2.2 | Load Time:    0.1 || F1:  73.47 | Prec:  72.00 | Rec:  75.00 || Ex/s:  46.15

Updating Learning Rate: 5.120e-04 Acc: 73.46939086914062 Epoch: 3
* Best F1: tensor(73.4694, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 4


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 4 || Run Time:    6.6 | Load Time:    0.3 || F1:  92.62 | Prec:  95.83 | Rec:  89.61 || Ex/s:  46.78

===>  EVAL Epoch 4


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 4 || Run Time:    2.2 | Load Time:    0.1 || F1:  72.34 | Prec:  73.91 | Rec:  70.83 || Ex/s:  46.75

Updating Learning Rate: 4.096e-04 Acc: 72.3404312133789 Epoch: 4
---------------------

===>  TRAIN Epoch 5


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 5 || Run Time:    6.5 | Load Time:    0.3 || F1:  96.64 | Prec: 100.00 | Rec:  93.51 || Ex/s:  47.23

===>  EVAL Epoch 5


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 5 || Run Time:    2.2 | Load Time:    0.1 || F1:  72.34 | Prec:  73.91 | Rec:  70.83 || Ex/s:  46.16

Updating Learning Rate: 3.277e-04 Acc: 72.3404312133789 Epoch: 5
---------------------

===>  TRAIN Epoch 6


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 6 || Run Time:    6.8 | Load Time:    0.3 || F1:  99.35 | Prec:  98.72 | Rec: 100.00 || Ex/s:  45.40

===>  EVAL Epoch 6


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 6 || Run Time:    2.2 | Load Time:    0.1 || F1:  73.91 | Prec:  77.27 | Rec:  70.83 || Ex/s:  46.85

Updating Learning Rate: 2.621e-04 Acc: 73.91304016113281 Epoch: 6
* Best F1: tensor(73.9130, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 7


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 7 || Run Time:    6.6 | Load Time:    0.3 || F1: 100.00 | Prec: 100.00 | Rec: 100.00 || Ex/s:  46.83

===>  EVAL Epoch 7


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 7 || Run Time:    2.3 | Load Time:    0.1 || F1:  71.11 | Prec:  76.19 | Rec:  66.67 || Ex/s:  44.62

Updating Learning Rate: 2.097e-04 Acc: 71.1111068725586 Epoch: 7
---------------------

===>  TRAIN Epoch 8


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 8 || Run Time:    6.5 | Load Time:    0.3 || F1:  99.35 | Prec: 100.00 | Rec:  98.70 || Ex/s:  47.12

===>  EVAL Epoch 8


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 8 || Run Time:    2.1 | Load Time:    0.1 || F1:  69.57 | Prec:  72.73 | Rec:  66.67 || Ex/s:  47.54

Updating Learning Rate: 1.678e-04 Acc: 69.5652084350586 Epoch: 8
---------------------

===>  TRAIN Epoch 9


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 9 || Run Time:    6.5 | Load Time:    0.3 || F1: 100.00 | Prec: 100.00 | Rec: 100.00 || Ex/s:  47.22

===>  EVAL Epoch 9


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 9 || Run Time:    2.2 | Load Time:    0.1 || F1:  69.57 | Prec:  72.73 | Rec:  66.67 || Ex/s:  47.38

Updating Learning Rate: 1.342e-04 Acc: 69.5652084350586 Epoch: 9
---------------------

===>  TRAIN Epoch 10


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 10 || Run Time:    6.5 | Load Time:    0.3 || F1: 100.00 | Prec: 100.00 | Rec: 100.00 || Ex/s:  47.65

===>  EVAL Epoch 10


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 10 || Run Time:    2.2 | Load Time:    0.1 || F1:  69.57 | Prec:  72.73 | Rec:  66.67 || Ex/s:  47.55

Updating Learning Rate: 1.074e-04 Acc: 69.5652084350586 Epoch: 10
---------------------

===>  TRAIN Epoch 11


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:07


Finished Epoch 11 || Run Time:    7.1 | Load Time:    0.3 || F1: 100.00 | Prec: 100.00 | Rec: 100.00 || Ex/s:  43.73

===>  EVAL Epoch 11


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 11 || Run Time:    2.3 | Load Time:    0.1 || F1:  72.34 | Prec:  73.91 | Rec:  70.83 || Ex/s:  44.39

Updating Learning Rate: 8.590e-05 Acc: 72.3404312133789 Epoch: 11
---------------------

===>  TRAIN Epoch 12


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 12 || Run Time:    6.5 | Load Time:    0.3 || F1: 100.00 | Prec: 100.00 | Rec: 100.00 || Ex/s:  47.37

===>  EVAL Epoch 12


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 12 || Run Time:    2.2 | Load Time:    0.1 || F1:  69.57 | Prec:  72.73 | Rec:  66.67 || Ex/s:  47.41

Updating Learning Rate: 6.872e-05 Acc: 69.5652084350586 Epoch: 12
---------------------

===>  TRAIN Epoch 13


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 13 || Run Time:    6.6 | Load Time:    0.3 || F1: 100.00 | Prec: 100.00 | Rec: 100.00 || Ex/s:  46.47

===>  EVAL Epoch 13


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 13 || Run Time:    2.2 | Load Time:    0.1 || F1:  69.57 | Prec:  72.73 | Rec:  66.67 || Ex/s:  47.69

Updating Learning Rate: 5.498e-05 Acc: 69.5652084350586 Epoch: 13
---------------------

===>  TRAIN Epoch 14


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 14 || Run Time:    6.6 | Load Time:    0.3 || F1: 100.00 | Prec: 100.00 | Rec: 100.00 || Ex/s:  46.50

===>  EVAL Epoch 14


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 14 || Run Time:    2.2 | Load Time:    0.1 || F1:  69.57 | Prec:  72.73 | Rec:  66.67 || Ex/s:  47.36

Updating Learning Rate: 4.398e-05 Acc: 69.5652084350586 Epoch: 14
---------------------

===>  TRAIN Epoch 15


0% [████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 15 || Run Time:    6.6 | Load Time:    0.3 || F1: 100.00 | Prec: 100.00 | Rec: 100.00 || Ex/s:  46.90

===>  EVAL Epoch 15


0% [█] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


Finished Epoch 15 || Run Time:    2.3 | Load Time:    0.1 || F1:  72.34 | Prec:  73.91 | Rec:  70.83 || Ex/s:  43.77

Updating Learning Rate: 3.518e-05 Acc: 72.3404312133789 Epoch: 15
---------------------

Loading best model...
Training done.


tensor(73.9130, device='cuda:0')

In [None]:
model.run_eval(test_dataset)