In [5]:
import torch
import random
import pandas as pd
import json

from torch.utils.data import Dataset

import torch
import random
import numpy as np
import torch.backends.cudnn as cudnn

from tqdm import tqdm
from collections import defaultdict

from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.nn import CosineEmbeddingLoss

from sentence_transformers import SentenceTransformer, InputExample, losses
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Seed every random number generator this notebook relies on (Python's
# `random`, NumPy, and PyTorch on CPU and CUDA) from one shared value.
seed = 42
random.seed(seed)
np.random.seed(seed)
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed)

# Ask cuDNN for deterministic kernels and turn off its benchmark-based
# algorithm selection.
cudnn.deterministic = True
cudnn.benchmark = False
# Author's note: results vary by roughly 3-4% depending on the seed.

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print('device:', device)

device: cuda:0


In [7]:
print('-'*10)
print('Data Loading Start!!')
print('-'*10)

## dataset
path = "./aug_data.jsonl"

# Number of negative documents drawn per query.
HARD_NEGATIVES_PER_QUERY = 20   # sampled from the query's own domain
SOFT_NEGATIVES_PER_QUERY = 5    # sampled from all other domains

# Read the JSON-lines file. The encoding is pinned to UTF-8 so the result does
# not depend on the platform's locale default, and blank lines (e.g. a trailing
# newline at the end of the file) are skipped instead of crashing json.loads.
with open(path, encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]
data = pd.DataFrame(records)
print('origin_data:', data.shape)

# Only string contents may be used as documents. Positives are filtered
# row-by-row in the loop below; this applies the same rule to the pools that
# negatives are sampled from, so a missing/NaN content can never become a
# training text.
doc_rows = data[data['content'].map(lambda c: isinstance(c, str))]

# domain -> (unique same-domain contents, unique other-domain contents).
# Filled lazily, once per domain, instead of re-filtering the whole frame
# twice for every row. Order of first appearance is kept so that seeded
# sampling stays reproducible.
negative_pools = {}

query = []
doc = []
label = []
for q, d, domain in zip(data['question'], data['content'], data['domain']):  # domain = topic
    # Skip rows whose question or content is not text.
    if not isinstance(q, str) or not isinstance(d, str):
        continue

    if domain not in negative_pools:
        in_domain = doc_rows['domain'] == domain
        negative_pools[domain] = (
            list(pd.unique(doc_rows.loc[in_domain, 'content'])),
            list(pd.unique(doc_rows.loc[~in_domain, 'content'])),
        )
    same_domain_docs, other_domain_docs = negative_pools[domain]

    # Positive pair
    query.append(q)
    doc.append(d)
    label.append(1)

    # Hard negatives: documents from the same domain. The positive document
    # may be drawn too and is dropped, so a query gets up to
    # HARD_NEGATIVES_PER_QUERY (sometimes one fewer) hard negatives.
    hard_k = min(HARD_NEGATIVES_PER_QUERY, len(same_domain_docs))
    for negative in random.sample(same_domain_docs, k=hard_k):
        if negative == d:
            continue
        query.append(q)
        doc.append(negative)
        label.append(-1)

    # Soft negatives: documents from any other domain. The sample size is
    # clamped because random.sample raises ValueError when asked for more
    # items than the population holds. A document whose text equals the
    # positive is skipped so the same pair is never labelled both +1 and -1.
    soft_k = min(SOFT_NEGATIVES_PER_QUERY, len(other_domain_docs))
    for negative in random.sample(other_domain_docs, k=soft_k):
        if negative == d:
            continue
        query.append(q)
        doc.append(negative)
        label.append(-1)

# Build the training examples from the query, doc and label lists.
train_dataset = [
    InputExample(texts=[q, d], label=float(l))
    for q, d, l in zip(query, doc, label)
]
print('Training Data Amount:', len(train_dataset))

print('-'*10)
print('Data Loading Complete!!')
print('-'*10)

----------
Data Loading Start!!
----------
origin_data: (12816, 4)
Training Data Amount: 323280
----------
Data Loading Complete!!
----------


In [14]:
# Load the starting checkpoint (by its name, presumably the model saved after
# an earlier fine-tuning epoch — confirm against the run history).
model = SentenceTransformer("sroberta_finetuned_epoch_1")

## data loader: shuffled mini-batches over the examples built in the data cell
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

## loss: cosine-similarity objective bound to the model being trained
train_loss = losses.CosineSimilarityLoss(model)

print('-'*10, 'Training Start!!', '-'*10, sep='\n')

# Tune the model: one pass over the data with 100 warm-up steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
)

print('-'*10, 'Training Complete!!', '-'*10, sep='\n')

----------
Training Start!!
----------


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20205 [00:00<?, ?it/s]

----------
Training Complete!!
----------


In [15]:
# Write the fine-tuned model to a new directory so the checkpoint it was
# loaded from is left untouched.
model_path = './sroberta_finetuned_epoch_2'
model.save(path=model_path)