In [1]:
import sys

# Extra import search location (the parent directory, relative to the
# notebook's working directory) registered before `distillation` is imported.
module_path = ".."

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

from distillation import distill

In [2]:
import warnings

# Ignore all Python warnings. This is deliberately executed before the
# `transformers` import below, so the filter is already installed when that
# import runs.
warnings.filterwarnings("ignore")

import transformers
from transformers.utils import logging

# Restrict transformers' own logger to ERROR-level messages.
logging.set_verbosity_error()

### Loop no.1

In [3]:
# Loop 1: Bi-to-Cross Encoder Distillation
#
# Notes on distill() arguments:
#   direction:        "bi2cross" or "cross2bi"
#   path_dataset:     path of dataset, default: STS Benchmark Train Set
#   path_bi_model:    path of distilled Bi-Encoder; if NOT set, initialize
#                     with 'base_bi_lm'
#   path_cross_model: path of distilled Cross-Encoder; if NOT set, initialize
#                     with 'base_cross_lm'
#   base_bi_lm:       default: "princeton-nlp/unsup-simcse-roberta-base"
#   base_cross_lm:    default: "roberta-base"
#   device_name:      device for training, default: "cpu"
#
# Neither model path is passed in this first step.
distill(
    direction="bi2cross",
    n_loop=1,
    device_name="cuda:3",
    # Training (distillation) hyperparameters
    hyperparams=dict(
        batch_size=16,
        accum_steps=2,
        lr=3e-5,
        epochs=5,
        loss_func="BCE",  # training loss: "BCE" or "MSE"
    ),
)

0th Bi-Encoder
-----
[[1.         0.81180199]
 [0.81180199 1.        ]]
SpearmanrResult(correlation=0.8009763763165447, pvalue=4.38233081262766e-309)
=====
0th Cross-Encoder
-----
[[1.         0.15960314]
 [0.15960314 1.        ]]
SpearmanrResult(correlation=0.14341094701933937, pvalue=8.874086483411752e-08)

Pseudo-Labeling..
Done!

Training..
5749 data proceesed
Done!

1th (Newly Distilled) Cross-Encoder
-----
[[1.         0.82434503]
 [0.82434503 1.        ]]
SpearmanrResult(correlation=0.830729034235989, pvalue=0.0)


In [4]:
# Loop 1: Cross-to-Bi Encoder Distillation
# No Bi-Encoder checkpoint is passed here; only the Cross-Encoder path is set.
distill(
    direction="cross2bi",
    n_loop=1,
    # Trained (distilled) Cross-Encoder checkpoint (loop 1, per the file name)
    path_cross_model="../model/cross-encoder_distilled_loop1_epoch5of5.pth",
    device_name="cuda:3",
    # Training (distillation) hyperparameters
    hyperparams=dict(
        batch_size=16,
        accum_steps=1,
        lr=3e-7,
        epochs=1,
        loss_func="MSE",  # training loss: "BCE" or "MSE"
    ),
)

0th Bi-Encoder
-----
[[1.         0.81180199]
 [0.81180199 1.        ]]
SpearmanrResult(correlation=0.8009763763165447, pvalue=4.38233081262766e-309)
=====
1th Cross-Encoder
-----
[[1.         0.82434503]
 [0.82434503 1.        ]]
SpearmanrResult(correlation=0.830729034235989, pvalue=0.0)

Pseudo-Labeling..
Done!

Training..
5749 data processed
Done!

1th (Newly Distilled) Bi-Encoder
-----
[[1.         0.81621777]
 [0.81621777 1.        ]]
SpearmanrResult(correlation=0.8098011102455234, pvalue=3.44e-321)


### Loop no.2

In [5]:
# Loop 2: Bi-to-Cross Encoder Distillation
distill(
    direction="bi2cross",
    n_loop=2,
    # Trained (distilled) Bi-Encoder checkpoint (loop 1, per the file name)
    path_bi_model="../model/bi-encoder_distilled_loop1_epoch1of1.pth",
    # Trained (distilled) Cross-Encoder checkpoint (loop 1, per the file name)
    path_cross_model="../model/cross-encoder_distilled_loop1_epoch5of5.pth",
    device_name="cuda:3",
    # Training (distillation) hyperparameters
    hyperparams=dict(
        batch_size=16,
        accum_steps=2,
        lr=1e-5,
        epochs=5,
        loss_func="BCE",  # training loss: "BCE" or "MSE"
    ),
)

1th Bi-Encoder
-----
[[1.         0.81621777]
 [0.81621777 1.        ]]
SpearmanrResult(correlation=0.8098011102455234, pvalue=3.44e-321)
=====
1th Cross-Encoder
-----
[[1.         0.82434503]
 [0.82434503 1.        ]]
SpearmanrResult(correlation=0.830729034235989, pvalue=0.0)

Pseudo-Labeling..
Done!

Training..
5749 data proceesed
Done!

2th (Newly Distilled) Cross-Encoder
-----
[[1.         0.81966953]
 [0.81966953 1.        ]]
SpearmanrResult(correlation=0.8344422038243441, pvalue=0.0)


In [6]:
# Loop 2: Cross-to-Bi Encoder Distillation
distill(
    direction="cross2bi",
    n_loop=2,
    # Trained (distilled) Bi-Encoder checkpoint (loop 1, per the file name)
    path_bi_model="../model/bi-encoder_distilled_loop1_epoch1of1.pth",
    # Trained (distilled) Cross-Encoder checkpoint (loop 2, per the file name)
    path_cross_model="../model/cross-encoder_distilled_loop2_epoch5of5.pth",
    device_name="cuda:3",
    # Training (distillation) hyperparameters
    hyperparams=dict(
        batch_size=16,
        accum_steps=1,
        lr=3e-8,
        epochs=1,
        loss_func="MSE",  # training loss: "BCE" or "MSE"
    ),
)

1th Bi-Encoder
-----
[[1.         0.81621777]
 [0.81621777 1.        ]]
SpearmanrResult(correlation=0.8098011102455234, pvalue=3.44e-321)
=====
2th Cross-Encoder
-----
[[1.         0.81966953]
 [0.81966953 1.        ]]
SpearmanrResult(correlation=0.8344422038243441, pvalue=0.0)

Pseudo-Labeling..
Done!

Training..
5749 data processed
Done!

2th (Newly Distilled) Bi-Encoder
-----
[[1.         0.81584442]
 [0.81584442 1.        ]]
SpearmanrResult(correlation=0.8101051454555238, pvalue=1.285e-321)


### Loop no.3

In [7]:
# Loop 3: Bi-to-Cross Encoder Distillation
distill(
    direction="bi2cross",
    n_loop=3,
    # Trained (distilled) Bi-Encoder checkpoint (loop 2, per the file name)
    path_bi_model="../model/bi-encoder_distilled_loop2_epoch1of1.pth",
    # Trained (distilled) Cross-Encoder checkpoint (loop 2, per the file name)
    path_cross_model="../model/cross-encoder_distilled_loop2_epoch5of5.pth",
    device_name="cuda:3",
    # Training (distillation) hyperparameters
    hyperparams=dict(
        batch_size=16,
        accum_steps=2,
        lr=7e-7,
        epochs=5,
        loss_func="BCE",  # training loss: "BCE" or "MSE"
    ),
)

2th Bi-Encoder
-----
[[1.         0.81584442]
 [0.81584442 1.        ]]
SpearmanrResult(correlation=0.8101051454555238, pvalue=1.285e-321)
=====
2th Cross-Encoder
-----
[[1.         0.81966953]
 [0.81966953 1.        ]]
SpearmanrResult(correlation=0.8344422038243441, pvalue=0.0)

Pseudo-Labeling..
Done!

Training..
5749 data proceesed
Done!

3th (Newly Distilled) Cross-Encoder
-----
[[1.         0.81990786]
 [0.81990786 1.        ]]
SpearmanrResult(correlation=0.8356097036108407, pvalue=0.0)


In [8]:
# Loop 3: Cross-to-Bi Encoder Distillation
distill(
    direction="cross2bi",
    n_loop=3,
    # Trained (distilled) Bi-Encoder checkpoint (loop 2, per the file name)
    path_bi_model="../model/bi-encoder_distilled_loop2_epoch1of1.pth",
    # Trained (distilled) Cross-Encoder checkpoint (loop 3, per the file name)
    path_cross_model="../model/cross-encoder_distilled_loop3_epoch5of5.pth",
    device_name="cuda:3",
    # Training (distillation) hyperparameters
    hyperparams=dict(
        batch_size=16,
        accum_steps=1,
        lr=1e-8,
        epochs=1,
        loss_func="MSE",  # training loss: "BCE" or "MSE"
    ),
)

2th Bi-Encoder
-----
[[1.         0.81584442]
 [0.81584442 1.        ]]
SpearmanrResult(correlation=0.8101051454555238, pvalue=1.285e-321)
=====
3th Cross-Encoder
-----
[[1.         0.81990786]
 [0.81990786 1.        ]]
SpearmanrResult(correlation=0.8356097036108407, pvalue=0.0)

Pseudo-Labeling..
Done!

Training..
5749 data processed
Done!

3th (Newly Distilled) Bi-Encoder
-----
[[1.         0.81561413]
 [0.81561413 1.        ]]
SpearmanrResult(correlation=0.8102031033209242, pvalue=9.3e-322)
