In [5]:
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics import f1_score, classification_report
import matplotlib.pyplot as plt
import logging
logging.basicConfig(level=logging.ERROR)
from ast import literal_eval
from torch import cuda
import json
device = 'cuda:0' if cuda.is_available() else 'cpu'

class Dataset_768(Dataset):
	"""Dataset over precomputed embedding vectors and their multi-hot labels.

	``dataframe`` must expose a ``vector`` column (one embedding per row) and
	an ``intent`` column (one multi-hot label list per row).
	"""

	def __init__(self, dataframe):
		self.data = dataframe
		self.vector = dataframe.vector
		self.targets = self.data.intent

	def __len__(self):
		# DataLoader's default sampler needs the dataset size.
		return len(self.vector)

	def __getitem__(self, index):
		# Return the single row at `index` (positional), not the whole column.
		return {
			'vector': torch.tensor(self.vector.iloc[index], dtype=torch.float),
			'targets': torch.tensor(self.targets.iloc[index], dtype=torch.int)
		}

# Define Model

class DistilBERTClass(nn.Module):
	"""Pretrained DistilBERT encoder followed by a 768 -> 64 -> num_intents head."""

	def __init__(self, num_intents):
		super().__init__()
		# Attribute names (l1, fc1, fc2) define the state_dict keys of saved
		# checkpoints, so they are kept exactly as they were.
		self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
		hidden_layers = [nn.Linear(768, 64), nn.BatchNorm1d(64), nn.ReLU()]
		self.fc1 = nn.Sequential(*hidden_layers)
		self.fc2 = nn.Sequential(nn.Linear(64, num_intents))

	def forward(self, input_ids, attention_mask):
		encoded = self.l1(input_ids=input_ids, attention_mask=attention_mask)
		# First element of the encoder output, then position 0 of every sequence.
		first_position = encoded[0][:, 0]
		return self.fc2(self.fc1(first_position))

class DistilBERTClass_noFinetune(torch.nn.Module):
    """Pretrained DistilBERT encoder that returns the position-0 hidden state."""

    def __init__(self):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")

    def forward(self, input_ids, attention_mask):
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # last_hidden_state is (probably) the same as encoder_out[0].
        return encoder_out.last_hidden_state[:, 0]

class MLP_768(nn.Module):
	"""Two-layer perceptron head: 768 -> 64 -> num_intents with a sigmoid output."""

	def __init__(self, num_intents):
		super().__init__()
		self.fc1 = nn.Sequential(nn.Linear(768, 64), nn.BatchNorm1d(64), nn.ReLU())
		self.fc2 = nn.Linear(64, num_intents)
		self.sigmoid = nn.Sigmoid()

	def forward(self, x):
		# The sigmoid is applied here, so callers get values in (0, 1), not logits.
		hidden = self.fc1(x)
		return self.sigmoid(self.fc2(hidden))

# Module-level tokenizer and frozen encoder; text_to_768 reads both as globals.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBERTClass_noFinetune().to(device)

# Label vocabularies. The position of a name in its list is the position of its
# flag in the multi-hot vectors built by encode_intents.
user_intents = ['initial_query', 'greeting', 'add_filter', 'remove_filter', 'continue', 'accept_response', 'reject_response', 'others']
system_intents = ['feedback_request', 'detail_attribute_request', 'passive_recommend', 'active_recommend', 'parroting_response', 'sympathetic_response', 'others']
music_attributes = ['track', 'artist', 'year', 'popularity', 'culture', 'similar_track', 'similar_artist', 'user', 'theme', 'mood', 'genre', 'instrument', 'vocal', 'tempo', 'none']
intents_dict = {'user': user_intents, 'system': system_intents, 'music': music_attributes}

# Load the annotated dialogs; literal_eval parses each cell of the two label
# columns into a Python literal (presumably lists stored as strings -- confirm).
df = pd.read_csv('./most_recent.csv', encoding='unicode_escape')
df['intent'] = df['intent'].apply(literal_eval)
df['music_attribute'] = df['music_attribute'].apply(literal_eval)

# df = df[df['dialog_id'].apply(lambda x: x not in error_dialog_id)]

# Intents with 20 or fewer examples are mapped to 'others' (question: 13, answer: 7)
df["intent"] = df["intent"].apply(lambda x: ["others" if item in ["item_attribute_answer", "item_attribute_question"] else item for item in x])

# Drop 'others' whenever it appears together with another intent.
def remove_others_if_not_alone(intents):
	"""Return a new intent list with the catch-all 'others' label normalised.

	* If any intent other than 'others' is present, every 'others' entry is
	  dropped (not just the first one).
	* If the list holds only 'others' entries (possibly repeated, e.g. after
	  several rare intents were mapped to 'others'), a single 'others' is kept.
	* A list without 'others' is returned as an unchanged copy.

	The input list is never mutated.
	"""
	specific = [intent for intent in intents if intent != 'others']
	if specific:
		return specific
	# Nothing but 'others' (or nothing at all) was present.
	return ['others'] if 'others' in intents else list(intents)
# Run the 'others' clean-up on every row's intent list.
df['intent'] = df['intent'].apply(remove_others_if_not_alone)

# When 'initial_query' is present, drop any of [remove_filter, continue, accept_response, reject_response, others]
def preprocess_initial(row):
	"""Strip follow-up intents from rows labelled 'initial_query'.

	If ``row['intent']`` contains 'initial_query', every occurrence of
	'remove_filter', 'continue', 'accept_response', 'reject_response' and
	'others' is removed from that list. Other rows are left untouched.

	The list is edited in place (as before) and ``row`` is returned so the
	function can be used with ``DataFrame.apply(axis=1)``.
	"""
	intents = row['intent']
	if 'initial_query' in intents:
		excluded = ('remove_filter', 'continue', 'accept_response', 'reject_response', 'others')
		# Slice assignment keeps the same list object while dropping *all*
		# occurrences; list.remove() would only drop the first duplicate.
		intents[:] = [intent for intent in intents if intent not in excluded]
	return row
# Row-wise pass (axis=1): each row is handed to preprocess_initial and the returned row is kept.
df = df.apply(preprocess_initial, axis=1)

#######################
def _concat_previous_rows(group, n_previous):
	"""Prefix each row's ``content`` with the contents of up to ``n_previous`` earlier rows.

	Pieces are joined with '. ' in chronological order. Rows near the start of
	the group simply get fewer predecessors. A group with fewer than
	``n_previous`` rows yields an empty DataFrame (so it disappears from a
	``groupby(...).apply(...)`` result). The input frame is not modified.
	"""
	if len(group) < n_previous:
		return pd.DataFrame()
	group = group.copy()
	separator = '. '
	current = group['content']
	merged = current
	for offset in range(1, n_previous + 1):
		merged = current.shift(offset).fillna('') + separator + merged

	values = merged.tolist()
	# Row i (< n_previous) has only i predecessors, so the blanks shifted in
	# above left exactly (n_previous - i) leading separators. Cut precisely
	# those, instead of lstrip('. '), which would also eat dots/spaces that
	# belong to the text itself.
	for position in range(min(n_previous, len(values))):
		if isinstance(values[position], str):
			values[position] = values[position][len(separator) * (n_previous - position):]
	# Assign the whole column at once; writing through group['content'].iloc[i]
	# is chained assignment and does not reach the frame under Copy-on-Write.
	group['content'] = values
	return group


def concat_previous_1_rows(group):
	"""Prefix every row's content with the previous row's content."""
	return _concat_previous_rows(group, 1)


def concat_previous_2_rows(group):
	"""Prefix every row's content with the previous 2 rows' contents."""
	return _concat_previous_rows(group, 2)


def concat_previous_4_rows(group):
	"""Prefix every row's content with the previous 4 rows' contents."""
	return _concat_previous_rows(group, 4)


def concat_previous_6_rows(group):
	"""Prefix every row's content with the previous 6 rows' contents."""
	return _concat_previous_rows(group, 6)


def concat_previous_8_rows(group):
	"""Prefix every row's content with the previous 8 rows' contents."""
	return _concat_previous_rows(group, 8)

# Group by 'dialog_id' and concatenate the previous n rows within each dialog
df_1 = df.groupby('dialog_id').apply(concat_previous_1_rows).reset_index(drop=True)
df_2 = df.groupby('dialog_id').apply(concat_previous_2_rows).reset_index(drop=True)
df_4 = df.groupby('dialog_id').apply(concat_previous_4_rows).reset_index(drop=True)
df_6 = df.groupby('dialog_id').apply(concat_previous_6_rows).reset_index(drop=True)
df_8 = df.groupby('dialog_id').apply(concat_previous_8_rows).reset_index(drop=True)

# Self-assignment, no effect as written.
# NOTE(review): presumably the place to swap in df_1 ... df_8 for an experiment -- confirm.
df = df
################################
# Define Dataset Class

class MultiLabelDataset(Dataset):
	"""Tokenises ``content`` rows on the fly for multi-label classification.

	``dataframe`` must expose a ``content`` column (text) and an ``intent``
	column (multi-hot label list). Each item is a dict with ``ids`` and
	``mask`` (both of length ``max_len``) and ``targets``.

	Texts longer than the budget keep their *last* tokens, so when previous
	turns are prepended to an utterance the most recent text survives.
	"""

	def __init__(self, dataframe, tokenizer, max_len):
		self.tokenizer = tokenizer
		self.data = dataframe
		self.text = dataframe.content
		self.targets = self.data.intent
		self.max_len = max_len

	def __len__(self):
		return len(self.text)

	def __getitem__(self, index):
		# Positional access, so frames with a non-default index still work.
		text = str(self.text.iloc[index])
		text = " ".join(text.split())

		tokens = self.tokenizer.tokenize(text)
		# Leave room for the special tokens encode_plus adds; otherwise its
		# right-side truncation would cut off the newest tokens we just kept.
		budget = max(self.max_len - self.tokenizer.num_special_tokens_to_add(pair=False), 0)
		if len(tokens) > budget:
			tokens = tokens[-budget:] if budget else []
		truncated_text = self.tokenizer.convert_tokens_to_string(tokens)

		inputs = self.tokenizer.encode_plus(
			truncated_text,
			None,
			add_special_tokens=True,
			max_length=self.max_len,
			# `pad_to_max_length` is deprecated; this is its replacement.
			padding='max_length',
			return_token_type_ids=False,
			truncation=True
		)
		ids = inputs['input_ids']
		mask = inputs['attention_mask']

		return {
			'ids': torch.tensor(ids, dtype=torch.int),
			'mask': torch.tensor(mask, dtype=torch.int),
			'targets': torch.tensor(self.targets.iloc[index], dtype=torch.int)
		}

# Define functions

def decode_intents(encoded_list, intent_names=None):
	"""Map a multi-hot vector back to the names whose flag equals 1.

	Args:
		encoded_list: iterable of 0/1 flags, aligned with ``intent_names``.
		intent_names: label names in encoding order. When omitted, the list is
			looked up as ``intents_dict[data_type]`` from module globals, which
			is how the function behaved before this parameter existed.

	Returns:
		List of names whose flag is 1, in vocabulary order.
	"""
	if intent_names is None:
		# Legacy path: relies on the `data_type` set by the currently running cell.
		intent_names = intents_dict[data_type]
	return [intent for intent, flag in zip(intent_names, encoded_list) if flag == 1]

def text_to_768(text):
    """Encode ``text`` with the module-level ``tokenizer`` and ``model``.

    The text is padded/truncated to 128 tokens, run through the model without
    gradient tracking, and the output is returned as a flattened NumPy array.
    """
    encoded = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        return_token_type_ids=False,
        truncation=True,
        return_tensors='pt',
    )
    input_ids, attention_mask = (
        encoded[key].to(device) for key in ('input_ids', 'attention_mask')
    )

    # Inference only: no autograd graph is recorded.
    with torch.no_grad():
        embedding = model(input_ids, attention_mask)

    return embedding.cpu().numpy().flatten()

# Embed every utterance once, up front.
df['vector'] = df['content'].apply(text_to_768)

##########################

# One frame per speaker role, without the columns the intent models do not use.
user_df = df[df['role'] == 'user'].drop(columns=['role', 'music_attribute'])
system_df = df[df['role'] == 'system'].drop(columns=['role', 'music_attribute'])

def encode_intents(intent_list, intents):
	"""Return a multi-hot list over ``intents``: 1 where the name occurs in ``intent_list``, else 0."""
	return [int(candidate in intent_list) for candidate in intents]

# Replace each intent-name list with its multi-hot encoding over the role's vocabulary.
user_df.loc[:, 'intent'] = user_df['intent'].apply(lambda x: encode_intents(x, user_intents))
user_df = user_df.reset_index(drop=True)

system_df.loc[:, 'intent'] = system_df['intent'].apply(lambda x: encode_intents(x, system_intents))
system_df = system_df.reset_index(drop=True)

# Music-attribute task: uses all rows (both roles); the encoded column is
# renamed to 'intent' so it has the same column name as the other two frames.
music_df = df[['index','dialog_id', 'role', 'content', 'music_attribute']]
music_df.loc[:, 'music_attribute'] = music_df['music_attribute'].apply(lambda x: encode_intents(x, music_attributes))
music_df.rename(columns={'music_attribute': 'intent'}, inplace=True)
music_df = music_df.reset_index(drop=True)

# Stack the per-row label lists into one tensor per task, one row per sample.
user_y = torch.stack([torch.tensor(item) for item in user_df['intent']])
system_y = torch.stack([torch.tensor(item) for item in system_df['intent']])
music_y = torch.stack([torch.tensor(item) for item in music_df['intent']])

# Train, Valid Split

from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit


def _split_80_10_10(frame, labels):
	"""Multilabel-stratified split of ``frame``/``labels`` into train, val and test.

	A first split holds out 20% of the rows; a second split cuts that hold-out
	in half. Both use random_state=42. Returns
	``((train_df, val_df, test_df), (train_y, val_y, test_y))`` with every
	frame re-indexed from 0.
	"""
	outer = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
	train_idx, holdout_idx = next(outer.split(frame['content'].values, labels))
	train_df, holdout_df = frame.iloc[train_idx], frame.iloc[holdout_idx]
	train_y, holdout_y = labels[train_idx], labels[holdout_idx]

	inner = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
	val_idx, test_idx = next(inner.split(holdout_df['content'].values, holdout_y))
	val_df, test_df = holdout_df.iloc[val_idx], holdout_df.iloc[test_idx]
	val_y, test_y = holdout_y[val_idx], holdout_y[test_idx]

	frames = tuple(part.reset_index(drop=True) for part in (train_df, val_df, test_df))
	return frames, (train_y, val_y, test_y)


(user_train_df, user_val_df, user_test_df), (user_train_y, user_val_y, user_test_y) = _split_80_10_10(user_df, user_y)
(system_train_df, system_val_df, system_test_df), (system_train_y, system_val_y, system_test_y) = _split_80_10_10(system_df, system_y)
(music_train_df, music_val_df, music_test_df), (music_train_y, music_val_y, music_test_y) = _split_80_10_10(music_df, music_y)

# Generate Data Dictionary

# data_dict[task][split] -> {'dataframe': <frame>, 'label': <label tensor>}
_split_names = ('train', 'val', 'test')
_frames_and_labels = {
	'user': (
		(user_train_df, user_train_y),
		(user_val_df, user_val_y),
		(user_test_df, user_test_y),
	),
	'system': (
		(system_train_df, system_train_y),
		(system_val_df, system_val_y),
		(system_test_df, system_test_y),
	),
	'music': (
		(music_train_df, music_train_y),
		(music_val_df, music_val_y),
		(music_test_df, music_test_y),
	),
}
data_dict = {
	task: {
		split: {'dataframe': frame, 'label': label}
		for split, (frame, label) in zip(_split_names, parts)
	}
	for task, parts in _frames_and_labels.items()
}

In [13]:
user_test_df

Unnamed: 0,index,dialog_id,unique_id,content,intent,vector
0,8,b8e368058df16085,b8e368058df16085:3:user,Thank you,"[0, 0, 0, 0, 0, 1, 0, 0]","[-0.077586114, 0.07368618, 0.06842183, -0.0907..."
1,32,dac004c6ae9787b2,dac004c6ae9787b2:0:user,"Hi, I'd love to create a playlist of Jazz songs","[1, 1, 1, 0, 0, 0, 0, 0]","[-0.047902066, -0.10909475, -0.010763254, -0.1..."
2,34,dac004c6ae9787b2,dac004c6ae9787b2:1:user,"Yes, from 2020 please","[0, 0, 1, 0, 0, 0, 0, 0]","[-0.15013573, -0.29752833, 0.27389917, -0.3062..."
3,56,b8a2eeb06bc2a045,b8a2eeb06bc2a045:2:user,"Great songs, could we get some more like this?","[0, 0, 0, 0, 1, 1, 0, 0]","[0.14025909, -0.26175424, 0.082671985, -0.0402..."
4,58,b8a2eeb06bc2a045,b8a2eeb06bc2a045:3:user,Great! could we get some more like Vampire Wee...,"[0, 0, 1, 0, 0, 1, 0, 0]","[0.23933716, -0.1676852, 0.1416594, -0.1552548..."
...,...,...,...,...,...,...
458,9646,41ed9c8c163f6a70,41ed9c8c163f6a70:2:user,These are nice! Got any songs from H.E.R?,"[0, 0, 1, 0, 0, 1, 0, 0]","[-0.020169634, -0.3400174, -0.033068836, -0.02..."
459,9666,1dee3e19595f524c,1dee3e19595f524c:1:user,I mostly listen to classical music to help me ...,"[0, 0, 1, 0, 0, 0, 0, 0]","[0.018026177, 0.0075391917, -0.15731312, -0.17..."
460,9692,d9134f35c8a3a787,d9134f35c8a3a787:2:user,Oh these are good. Can we add some songs from ...,"[0, 0, 1, 0, 0, 1, 0, 0]","[0.07373679, -0.07481862, -0.067963, -0.002229..."
461,9702,6206cf8dccd420fb,6206cf8dccd420fb:2:user,nice please more,"[0, 0, 0, 0, 1, 1, 0, 0]","[-0.08899486, -0.11149166, 0.13286069, -0.1453..."


### Distilbert로 768차원 Embedding 뽑아서 저장하기

In [27]:
import os

MAX_LEN = 128
BATCH_SIZE = 64
VECTOR_DIR = './distilbert_768vectors'

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
# shuffle=False keeps the saved vectors in the same row order as the label tensors in data_dict.
params = {'batch_size': BATCH_SIZE, 'shuffle': False, 'num_workers': 4}
num_intents_dict = {'user': 8, 'system': 7, 'music': 15}
loss_criterion = torch.nn.BCEWithLogitsLoss()


def _embed_loader(encoder, loader):
	"""Run ``encoder`` over every batch of ``loader`` and return the outputs concatenated on the CPU."""
	batches = []
	with torch.no_grad():
		for data in tqdm(loader, leave=False):
			ids = data['ids'].to(device, dtype=torch.int)
			mask = data['mask'].to(device, dtype=torch.int)
			# Move each batch off the GPU right away so memory does not accumulate there.
			batches.append(encoder(ids, mask).cpu())
	return torch.cat(batches, dim=0)


for data_type in ['music']:
	training_set = MultiLabelDataset(data_dict[data_type]['train']['dataframe'], tokenizer, MAX_LEN)
	valid_set = MultiLabelDataset(data_dict[data_type]['val']['dataframe'], tokenizer, MAX_LEN)
	test_set = MultiLabelDataset(data_dict[data_type]['test']['dataframe'], tokenizer, MAX_LEN)

	training_loader = DataLoader(training_set, **params)
	valid_loader = DataLoader(valid_set, **params)
	test_loader = DataLoader(test_set, **params)

	num_intents = num_intents_dict[data_type]

	# The frozen encoder takes no constructor arguments (passing num_intents raised TypeError).
	model = DistilBERTClass_noFinetune()
	model.to(device)
	model.eval()

	# Every split is embedded in full; the old `break` kept only the first test batch.
	train_vector = _embed_loader(model, training_loader)
	val_vector = _embed_loader(model, valid_loader)
	test_vector = _embed_loader(model, test_loader)
	print(train_vector.shape, val_vector.shape, test_vector.shape)

	# Persist the vectors; the MLP-only training cell loads exactly these files.
	os.makedirs(VECTOR_DIR, exist_ok=True)
	torch.save(train_vector, f'{VECTOR_DIR}/{data_type}_train.pt')
	torch.save(val_vector, f'{VECTOR_DIR}/{data_type}_val.pt')
	torch.save(test_vector, f'{VECTOR_DIR}/{data_type}_test.pt')

                  

hidden state torch.Size([64, 128, 768])




### Best threshold 얻어서 최종 결과 확인하기 (학습된 모델 불러와서)

In [None]:
MAX_LEN = 128
BATCH_SIZE = 64

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
# shuffle=False keeps predictions aligned with the label tensors in data_dict.
params = {'batch_size': BATCH_SIZE, 'shuffle': False, 'num_workers': 4}
num_intents_dict = {'user': 8, 'system': 7, 'music': 15}
loss_criterion = torch.nn.BCEWithLogitsLoss()


def _predict_probabilities(classifier, loader):
	"""Return ``(labels, probabilities)`` as NumPy arrays for every sample in ``loader``.

	``classifier`` is expected to return logits; the sigmoid is applied here.
	"""
	label_batches, prob_batches = [], []
	classifier.eval()
	with torch.no_grad():
		for data in tqdm(loader, leave=False):
			ids = data['ids'].to(device, dtype=torch.int)
			mask = data['mask'].to(device, dtype=torch.int)
			label_batches.append(data['targets'].to(dtype=torch.float).numpy())
			prob_batches.append(torch.sigmoid(classifier(ids, mask)).cpu().numpy())
	return np.concatenate(label_batches), np.concatenate(prob_batches)


for data_type in ['music']:
	training_set = MultiLabelDataset(data_dict[data_type]['train']['dataframe'], tokenizer, MAX_LEN)
	valid_set = MultiLabelDataset(data_dict[data_type]['val']['dataframe'], tokenizer, MAX_LEN)
	test_set = MultiLabelDataset(data_dict[data_type]['test']['dataframe'], tokenizer, MAX_LEN)

	training_loader = DataLoader(training_set, **params)
	valid_loader = DataLoader(valid_set, **params)
	test_loader = DataLoader(test_set, **params)

	num_intents = num_intents_dict[data_type]

	# Load Best Model (map_location lets a GPU-saved checkpoint load on a CPU-only machine)
	model = DistilBERTClass(num_intents)
	model.load_state_dict(torch.load('./models/' + data_type + '_model.pth', map_location=device))
	model.to(device)

	# Test-set inference is run once and reused for both reports below.
	labels, probability_outputs = _predict_probabilities(model, test_loader)

	# Test with threshold = 0.5
	binary_outputs = (probability_outputs >= 0.5)
	# Samples with no positive label fall back to the last label.
	binary_outputs[~binary_outputs.any(axis=1), -1] = True

	# Calculate Result
	report = classification_report(data_dict[data_type]['test']['label'], binary_outputs, output_dict=True)
	df_report = pd.DataFrame(report).transpose()
	df_report.reset_index(inplace=True)
	df_report.rename(columns={'support': 'tag_count', 'index': 'tag'}, inplace=True)
	df_report.loc[:num_intents-1, 'tag'] = intents_dict[data_type]
	df_report[['precision', 'recall', 'f1-score']] = df_report[['precision', 'recall', 'f1-score']].round(4)

	print(df_report['f1-score'].values)

	# Find best threshold with validation set
	val_labels, val_probability_outputs = _predict_probabilities(model, valid_loader)

	# Start from 0.5: a label that never reaches F1 > 0 on validation used to
	# keep threshold 0.0, which marks that label positive for every test sample.
	best_thresholds = np.full(num_intents, 0.5)
	thresholds = [(i+1)/100 for i in range(100)]
	for label_idx in range(num_intents):
		best_label_f1 = 0.0
		for threshold in thresholds:
			binary_preds = (val_probability_outputs[:, label_idx] >= threshold).astype(int)
			f1 = f1_score(val_labels[:, label_idx], binary_preds)
			if f1 > best_label_f1:
				best_label_f1 = f1
				best_thresholds[label_idx] = threshold

	print(best_thresholds)

	# Test with best thresholds
	binary_outputs = (probability_outputs >= best_thresholds).astype(int)
	binary_outputs[~binary_outputs.any(axis=1), -1] = 1

	# The last label column is left out of this report.
	report = classification_report(labels[:, :-1], binary_outputs[:, :-1], output_dict=True)
	df_report = pd.DataFrame(report).transpose()
	df_report.reset_index(inplace=True)
	df_report.rename(columns={'support': 'tag_count', 'index': 'tag'}, inplace=True)
	df_report.loc[:num_intents-2, 'tag'] = intents_dict[data_type][:-1]
	df_report[['precision', 'recall', 'f1-score']] = df_report[['precision', 'recall', 'f1-score']].round(2)
	df_report = df_report[df_report['tag'].apply(lambda x: x not in ['micro avg', 'weighted avg', 'samples avg'])]

df_report[['precision', 'recall', 'f1-score']]

### 학습코드

In [None]:
MAX_LEN = 128
BATCH_SIZE = 64
LEARNING_RATE = 2e-4
NUM_EPOCH = 15
# patience / count are only read by the (currently commented-out) early-stopping code.
patience = 2
count = 0

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
# NOTE(review): shuffle=False also applies to the training loader -- confirm this is intended.
params = {'batch_size': BATCH_SIZE, 'shuffle': False, 'num_workers': 4}
num_intents_dict = {'user': 8, 'system': 7, 'music': 15}

for data_type in ['music']:
	training_set = MultiLabelDataset(data_dict[data_type]['train']['dataframe'], tokenizer, MAX_LEN)
	valid_set = MultiLabelDataset(data_dict[data_type]['val']['dataframe'], tokenizer, MAX_LEN)
	test_set = MultiLabelDataset(data_dict[data_type]['test']['dataframe'], tokenizer, MAX_LEN)

	training_loader = DataLoader(training_set, **params)
	valid_loader = DataLoader(valid_set, **params)
	test_loader = DataLoader(test_set, **params)

	num_intents = num_intents_dict[data_type]
	model = DistilBERTClass(num_intents)
	model.to(device)

	optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
	# Learning rate is multiplied by 0.9 after every epoch.
	scheduler = StepLR(optimizer, step_size=1, gamma=0.9)
	loss_criterion = torch.nn.BCEWithLogitsLoss()

	train_loss_list = []
	valid_loss_list = []

	min_loss = 100.0
	# Train
	print("Training Start")
	for epoch in range(NUM_EPOCH):
		train_loss = 0.0
		model.train()
		for data in tqdm(training_loader, leave=False):
			ids = data['ids'].to(device, dtype = torch.int)
			mask = data['mask'].to(device, dtype = torch.int)
			targets = data['targets'].to(device, dtype = torch.float)
			outputs = model(ids, mask)
			optimizer.zero_grad()
			loss = loss_criterion(outputs, targets)
			loss.backward()
			optimizer.step()
			train_loss += loss.item()
		scheduler.step()
		# Mean over the number of batches. Dividing by the last enumerate index
		# was off by one and raised ZeroDivisionError with a single batch.
		train_loss /= max(len(training_loader), 1)

		# Validation
		model.eval()
		valid_loss = 0.0
		with torch.no_grad():
			for data in tqdm(valid_loader, leave=False):
				ids = data['ids'].to(device, dtype = torch.int)
				mask = data['mask'].to(device, dtype = torch.int)
				targets = data['targets'].to(device, dtype = torch.float)
				outputs = model(ids, mask)
				valid_loss += loss_criterion(outputs, targets).item()
		valid_loss /= max(len(valid_loader), 1)
		# The printed lr is the rate used during this epoch (scheduler.step() has already advanced it).
		print(f"Epoch:{epoch+1}, Train Loss: {round(train_loss,4)}, Valid Loss: {round(valid_loss,4)}, lr: {LEARNING_RATE*(0.9)**epoch}")
		train_loss_list.append(round(train_loss,4))
		valid_loss_list.append(round(valid_loss,4))
 
		# # Save model
		# if valid_loss < min_loss:
		# 	min_loss = valid_loss
		# 	count = 0
   
		# 	model_name = f"./models/{data_type}_model_concatone.pth"
		# 	torch.save(model.state_dict(), model_name)
		# 	print(f"{data_type} model saved with valid loss {round(min_loss,4)}")
		# else:
		# 	count += 1
		# 	if count==patience: break
 
	# # Load Best Model
	# model = DistilBERTClass(num_intents)
	# model.load_state_dict(torch.load(f"./models/{data_type}_model_concatone.pth"))
	# model.to(device)

	# # Test with best model
	# print("Test Start")
	# test_loss = 0.0
	# model.eval()
	# probability_outputs=[]
	# with torch.no_grad():
	# 	for i, data in tqdm(enumerate(test_loader, 0), leave=False):
	# 		ids = data['ids'].to(device, dtype = torch.int)
	# 		mask = data['mask'].to(device, dtype = torch.int)
	# 		targets = data['targets'].to(device, dtype = torch.float)
	# 		outputs = model(ids, mask)
	# 		test_loss += loss_criterion(outputs, targets).item()
	# 		probability_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
	# 	test_loss /= i
	# 	print(f"Test loss: {round(test_loss,4)}")
	# probability_outputs = np.array(probability_outputs)
	# #np.save('./probs/' + data_type + '_concat.npy', probability_outputs)
    
	# binary_outputs = (probability_outputs >= 0.5)
	# binary_outputs[np.all(binary_outputs == False, axis=1), -1] = True # 모두 False인 경우 Others로 Labeling

	# # Calculate Result
	# report = classification_report(data_dict[data_type]['test']['label'], binary_outputs, output_dict=True)
	# df_report = pd.DataFrame(report).transpose()
	# df_report.reset_index(inplace=True)
	# df_report.rename(columns={'support': 'tag_count', 'index': 'tag'}, inplace=True)
	# df_report.loc[:num_intents-1, 'tag'] = intents_dict[data_type]
	# df_report[['precision', 'recall', 'f1-score']] = df_report[['precision', 'recall', 'f1-score']].round(4)
	# df_report['tag_count'] = df_report['tag_count'].astype(int)
	# # Save classification report
	# df_report.to_csv(f"./results/{data_type}_concateight.csv", index=False)

	# Loss Plot: one point per completed epoch, train and validation on the same axes.
	epochs = range(1, len(train_loss_list)+1)

	plt.figure(figsize=(10, 6))
	plt.plot(epochs, train_loss_list, 'o-', label='Train Loss')
	plt.plot(epochs, valid_loss_list, 's-', label='Valid Loss')
	plt.xlabel('Epochs')
	plt.ylabel('Loss')
	plt.title('Train, Valid Loss over Epochs')
	plt.legend()
	#plt.savefig(f"./{data_type}.jpg")
	plt.show()

### 768 벡터로 MLP Layer만 학습하기 (for 부분 고쳐야됨)

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

BATCH_SIZE = 64
LEARNING_RATE = 1e-2
NUM_EPOCH = 15
patience = 2
count = 0

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
num_intents_dict = {'user': 8, 'system': 7, 'music': 15}


def _mlp_mean_loss(mlp, loader, criterion, optimizer=None):
	"""Return the mean batch loss of ``mlp`` over ``loader``.

	When ``optimizer`` is given, one optimisation step is taken per batch
	(training); otherwise the loss is only measured.
	"""
	total = 0.0
	for inputs, targets in loader:
		loss = criterion(mlp(inputs), targets)
		if optimizer is not None:
			optimizer.zero_grad()
			loss.backward()
			optimizer.step()
		total += loss.item()
	# Divide by the number of batches (not the last batch index).
	return total / max(len(loader), 1)


def _mlp_predict(mlp, loader):
	"""Return ``(labels, probabilities)`` as NumPy arrays; MLP_768 already applies the sigmoid."""
	label_batches, prob_batches = [], []
	with torch.no_grad():
		for inputs, targets in loader:
			label_batches.append(targets.cpu().numpy())
			prob_batches.append(mlp(inputs).cpu().numpy())
	return np.concatenate(label_batches), np.concatenate(prob_batches)


for data_type in ['music']:
	# Precomputed DistilBERT vectors written by the embedding-extraction cell.
	train_vector = torch.load(f'./distilbert_768vectors/{data_type}_train.pt', map_location=device)
	val_vector = torch.load(f'./distilbert_768vectors/{data_type}_val.pt', map_location=device)
	test_vector = torch.load(f'./distilbert_768vectors/{data_type}_test.pt', map_location=device)

	# as_tensor avoids re-wrapping label tensors that are already tensors.
	train_y = torch.as_tensor(data_dict[data_type]['train']['label'], dtype=torch.float32).to(device)
	val_y = torch.as_tensor(data_dict[data_type]['val']['label'], dtype=torch.float32).to(device)
	test_y = torch.as_tensor(data_dict[data_type]['test']['label'], dtype=torch.float32).to(device)

	train_dataset = TensorDataset(train_vector, train_y)
	val_dataset = TensorDataset(val_vector, val_y)
	test_dataset = TensorDataset(test_vector, test_y)

	# Loaders iterate the (vector, label) datasets, not the token datasets of the other cells.
	train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
	val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
	test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

	num_intents = num_intents_dict[data_type]
	model = MLP_768(num_intents)
	model.to(device)

	optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
	scheduler = StepLR(optimizer, step_size=1, gamma=0.9)
	# MLP_768 ends in a sigmoid, so the loss must take probabilities;
	# BCEWithLogitsLoss would apply a second sigmoid.
	loss_criterion = torch.nn.BCELoss()

	train_loss_list = []
	valid_loss_list = []

	min_loss = float('inf')
	count = 0
	# Train
	print("Training Start")
	for epoch in range(NUM_EPOCH):
		model.train()
		train_loss = _mlp_mean_loss(model, train_dataloader, loss_criterion, optimizer)
		scheduler.step()

		# Validation
		model.eval()
		with torch.no_grad():
			valid_loss = _mlp_mean_loss(model, val_dataloader, loss_criterion)
		print(f"Epoch:{epoch+1}, Train Loss: {round(train_loss,4)}, Valid Loss: {round(valid_loss,4)}, lr: {LEARNING_RATE*(0.9)**epoch}")
		train_loss_list.append(round(train_loss,4))
		valid_loss_list.append(round(valid_loss,4))

		# Save model on improvement; stop after `patience` epochs without one.
		if valid_loss < min_loss:
			min_loss = valid_loss
			count = 0

			model_name = f"./models/{data_type}_model_768.pth"
			torch.save(model.state_dict(), model_name)
			print(f"{data_type} model saved with valid loss {round(min_loss,4)}")
		else:
			count += 1
			if count==patience: break

	# Loss Plot
	epochs = range(1, len(train_loss_list)+1)

	plt.figure(figsize=(10, 6))
	plt.plot(epochs, train_loss_list, 'o-', label='Train Loss')
	plt.plot(epochs, valid_loss_list, 's-', label='Valid Loss')
	plt.xlabel('Epochs')
	plt.ylabel('Loss')
	plt.title(f'{data_type} Loss')
	plt.legend()
	#plt.savefig(f"./{data_type}.jpg")
	plt.show()

	# Load Best Model
	model = MLP_768(num_intents)
	model.load_state_dict(torch.load('./models/' + data_type + '_model_768.pth', map_location=device))
	model.to(device)
	model.eval()

	# Find best threshold (per label, on the validation split)
	labels, probability_outputs = _mlp_predict(model, val_dataloader)

	# Start from 0.5: a label that never reaches F1 > 0 on validation would
	# otherwise keep threshold 0.0 and be predicted positive for every sample.
	best_thresholds = np.full(num_intents, 0.5)
	thresholds = [(i+1)/100 for i in range(100)]
	for label_idx in range(num_intents):
		best_label_f1 = 0.0
		for threshold in thresholds:
			binary_preds = (probability_outputs[:, label_idx] >= threshold).astype(int)
			f1 = f1_score(labels[:, label_idx], binary_preds)
			if f1 > best_label_f1:
				best_label_f1 = f1
				best_thresholds[label_idx] = threshold

	print(best_thresholds)

	# Test with best thresholds
	labels, probability_outputs = _mlp_predict(model, test_dataloader)

	binary_outputs = (probability_outputs >= best_thresholds).astype(int)
	# Samples with no positive label fall back to the last label.
	binary_outputs[~binary_outputs.any(axis=1), -1] = 1

	# The last label column is left out of this report.
	report = classification_report(labels[:, :-1], binary_outputs[:, :-1], output_dict=True)
	df_report = pd.DataFrame(report).transpose()
	df_report.reset_index(inplace=True)
	df_report.rename(columns={'support': 'tag_count', 'index': 'tag'}, inplace=True)
	df_report.loc[:num_intents-2, 'tag'] = intents_dict[data_type][:-1]
	df_report[['precision', 'recall', 'f1-score']] = df_report[['precision', 'recall', 'f1-score']].round(2)
	df_report = df_report[df_report['tag'].apply(lambda x: x not in ['micro avg', 'weighted avg', 'samples avg'])]

	print(df_report)