In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
import pandas as pd

import random
import os
from tqdm import tqdm, trange
from pprint import pprint



In [2]:
## Seed를 설정합니다.
## https://m.blog.naver.com/vail131/222306329719 참조

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

In [3]:
## genre에 저장되어있는 숫자와 실제 장르 명의 대응관계입니다.

## 모델을 어떻게 학습시켰는가에 따라, test datset에 존재하는 장르만 사용하도록
## id2label, label2 id, num_classes를 변경해도 괜찮습니다.

## e.g.
id2label = {'발라드': 0, '록/메탈': 1, '댄스': 2, 'POP': 4,'R&B/Soul': 5, '랩/힙합': 7, '성인가요/트로트': 16}
label2id = {value: key for key, value in id2label.items()}
num_classes = 7


# id2label = {
#     '발라드': 0, '록/메탈': 1, '댄스': 2, '포크/블루스': 3,'POP': 4,'R&B/Soul': 5, '-': 6,
#     '랩/힙합': 7, '국내영화': 8, 'CCM': 9, '국내CCM': 10, '클래식': 11,'오페라/성악': 12, '국외영화': 13,
#     '국내드라마': 14, '인디음악': 15, '성인가요/트로트': 16, '일렉트로니카': 17, '뉴에이지': 18,
#     '키즈': 19, '창작동요': 20, '크로스오버': 21, '재즈': 22, '보컬재즈': 23, '애시드/퓨전/팝': 24,
#     '월드뮤직': 25, '애니메이션/웹툰': 26, '만화': 27, '게임': 28, 'J-POP': 29, '중국음악': 30,
#     '샹송/프렌치팝': 31, '컨트리': 32, '국내뮤지컬': 33, '블루스': 34, '국외CCM': 35,
#     '포크': 36, '국외뮤지컬': 37, '가톨릭': 38, '국외드라마': 39, '브라질': 40, '뮤직테라피': 41,
#     '보사노바': 42, '라틴': 43, '국악': 44, '국악가요': 45, '찬송가': 46, '영어동요': 47, '워십': 48,
#     '불교': 49,'자장가': 50, '민요': 51}
# label2id = {value: key for key, value in id2label.items()}
# num_classes = len(id2label)

print(f"num_classes: {num_classes}")

num_classes: 7


# 2. DataLoader & Path 설정

In [4]:
## test df를 불러옵니다.
test_path = '/kaggle/input/dataset/test.csv'
test_ = pd.read_csv(test_path)

train_path = '/kaggle/input/dataset/train.csv'
train_ = pd.read_csv(train_path)

In [5]:
## 첫 열의 이름을 'id'로 바꾸어줍니다.
test_df = test_.rename(columns=({'Unnamed: 0':'id'}))
print(test_df.shape)
test_df.head()

(10500, 7)


Unnamed: 0,id,song_name,adults,artist,album_id,date,lyrics
0,33093058,TYPE (feat. 기린),0,슬리피,'10524862',2020.11.27,괜히 생각나 네 말이\n너와 가고 싶어 저 멀리\n너도 그래\n아주 그래 BABE\...
1,2388517,Paparazzi,0,Lady Gaga,'741056',2009.12.09,We are the crowd\nWe're cuh coming out\nGot my...
2,31013021,위하여,0,조항조,'10157289',2018.04.12,쓰디쓴 술잔을 비우고 또 비워봐도 \n내 가슴속 너만은 비울 수가 없구나 \n안녕이...
3,34801938,붉어지기도 전에 떨어진…,0,박완규,'10899614',2022.02.25,갈망하는 노을처럼\n한 줌의 재가 되어 고운\n홍조 오르기도 전에 떨어져\n그 자리...
4,4061054,Strawberry Bubblegum,0,Justin Timberlake,'2178525',2013.03.15,Hey pretty lady\nThis goes out to you\nI know ...


In [6]:
train_df = train_.rename(columns=({'Unnamed: 0':'id'}))

print(train_df.shape)
train_df.head()

(135792, 8)


Unnamed: 0,id,song_name,adults,artist,album_id,date,genre,lyrics
0,2395,긴 잠,0,INO,'1869',2001.03.16,0,잠을 깨는 것이 싫었어\n눈 뜨면 또 하루\n니 곁을 살테니\n오직 내꿈 속엔 넌 ...
1,2396,Alcatraz,0,INO,'1869',2001.03.16,1,이젠 널 가둬놓겠어\n나의 품에\n조금은 낯설겠지만\n편해질꺼야\n두려운 내 맘 때...
2,2397,해요,0,INO,'1869',2001.03.16,0,그녀와 나는요 그땐 참 어렸어요\n많이 사랑했고 때론 많이 다퉜었죠\n지금 생각하면...
3,2398,투비(鬪悲),0,INO,'1869',2001.03.16,0,예전처럼 다시 처음으로\n서로 몰랐던 때로 돌아가\n쉽진 않지만\n부탁이야 잊어줘\...
4,2399,Dying Love,0,INO,'1869',2001.03.16,"0, 1",천번을 하늘 앞에 다짐해왔어\n무참히 날 버렸던\n너를 지우겠다고\n하지만 그리움이...


In [7]:
import re

def has_korean(text):
    # 정규식 패턴을 사용하여 한글 문자가 있는지 판별
    korean_pattern = re.compile('[ㄱ-ㅎㅏ-ㅣ가-힣]')
    return bool(korean_pattern.search(text))

In [8]:
def find_kor(data):
  has_kor = []
  for song_name, artist, lyrics in zip(train_df['song_name'],train_df['artist'],train_df['lyrics']):
    has_kor.append(has_korean(lyrics+song_name+artist))

  return has_kor

In [9]:
def genre_fix(data):
  extracted = data[(data['genre'] == '0') | (data['genre'] == '1') | (data['genre'] == '2') | (data['genre'] == '4') \
          | (data['genre'] == '5') | (data['genre'] == '7') | (data['genre'] == '16')]

  return extracted

In [10]:
train_df['has_kor'] = pd.DataFrame(find_kor(train_df))
test_df['has_kor'] = pd.DataFrame(find_kor(test_df))

In [11]:
train_df = genre_fix(train_df)

In [12]:
train_df = train_df.astype({'genre':'int'})

In [13]:
train_df.loc[train_df['genre'] == 4, 'genre'] = 3
train_df.loc[train_df['genre'] == 5, 'genre'] = 4
train_df.loc[train_df['genre'] == 7, 'genre'] = 5
train_df.loc[train_df['genre'] == 16, 'genre'] = 6

In [17]:
train_df[train_df['has_kor']==False]

Unnamed: 0,id,song_name,adults,artist,album_id,date,genre,lyrics,has_kor
57,3347,My Private Movie,0,Westlife,'80',2001.02.13,3,\n\n[Spoken]:\n(yeah)(check it out)\n[Shane] :...,False
186,6894,Cinderella,0,Sweetbox,'2024',2001.04.01,3,Cinderella are you really that happy\nCinderel...,False
187,6895,For The Lonely,0,Sweetbox,'2024',2001.04.01,3,This is for the lonely\nThis is for the lonely...,False
188,6896,Boyfriend,0,Sweetbox,'2024',2001.04.01,3,Always hatin' on me\nwhen you talk talk talk\n...,False
189,6897,How Does It Feel,0,Sweetbox,'2024',2001.04.01,3,Look at the me that silly girl\nGave all I had...,False
...,...,...,...,...,...,...,...,...,...
135480,36102766,Before I Met You,0,J.Fla,'11164457',2023.02.03,0,Before I fell in love with you\nI didn’t know ...,False
135592,36105794,Drown In You(Feat. Heezy),0,Andy. J,'11165231',2023.02.06,4,Maybe I had a drink too much\nBut I remember w...,False
135594,36106320,Wanderland (Feat. Shelhiel),0,Blish,'11165332',2023.02.03,4,"So what now \nFace it, like it \nI want to giv...",False
135649,36108763,Neverland,0,eiji,'11165886',2023.02.04,4,How can I confess \nyou look better in the sun...,False


In [18]:
train_df['genre']

0         0
1         1
2         0
3         0
5         0
         ..
135784    0
135785    1
135787    2
135788    2
135789    4
Name: genre, Length: 94743, dtype: int64

## dataset class 정의

In [19]:
## dataset class를 정의합니다.
class SongDatasetTest(Dataset):
    def __init__(self, df):
        df = df.reset_index(drop=True)
        self.song_names = df["song_name"]
        self.artists = df["artist"]
        self.lyrics = df["lyrics"]
        self.id = df["id"]
        self.has_kor = df["has_kor"]

    def __getitem__(self, idx):
        song_name = self.song_names[idx]
        artist = self.artists[idx]
        lyrics = self.lyrics[idx]
        id = self.id[idx]

        return {'song_name': song_name,
                'artist': artist,
                'lyrics': lyrics,
                'id': id}

    def __len__(self):
        return len(self.song_names)

In [20]:
## test_dataset과 dataloader를 정의합니다.
test_datasets = SongDatasetTest(test_df)

## Inference 시에는 shuffle=False로 설정해야 출력 순서가 바뀌지 않아 제대로 된 채점이 가능합니다.
## 단, training 시에는 shuffle=True로 설정해주는 것이 좋습니다.
BATCH_SIZE_TEST = 256
test_dataloader = DataLoader(test_datasets, batch_size = BATCH_SIZE_TEST, shuffle = False)
test_datasets[5515]

{'song_name': 'Pray',
 'artist': '더지타 (The GITA)',
 'lyrics': '잠들 수도 없었던 아침에\n머리 속은 기억 안에 있네\n돌아가서 하나씩 떠올려\n돌이킬 수 없다는 걸 알면서도\nI pray\nI pray\nI pray\nI pray\n널 위해서 I pray\n날 위해서 I pray\n언제든지 I pray\n잊지 말고 I pray\n한 때는 나만을 생각했고\n언제나 너만을 원망했어\n그럴 때 넌 내 옆을 지켜줬고\n그 순간 난 나만을 지켰었어\n왜 그랬을까 하는 후회도\n넌 어땠을까 하는 고민도\n소용없겠지만\nI pray\nI pray\nI pray\nI pray\n널 위해서 I pray\n날 위해서 I pray\n언제든지 I pray\n잊지 말고 I pray\n',
 'id': 32595991}

# Model

In [21]:
!pip install transformers



In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('using device: ', device)

using device:  cuda


In [23]:
from transformers import BertForSequenceClassification, DistilBertTokenizerFast

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [24]:
train_df

Unnamed: 0,id,song_name,adults,artist,album_id,date,genre,lyrics,has_kor
0,2395,긴 잠,0,INO,'1869',2001.03.16,0,잠을 깨는 것이 싫었어\n눈 뜨면 또 하루\n니 곁을 살테니\n오직 내꿈 속엔 넌 ...,True
1,2396,Alcatraz,0,INO,'1869',2001.03.16,1,이젠 널 가둬놓겠어\n나의 품에\n조금은 낯설겠지만\n편해질꺼야\n두려운 내 맘 때...,True
2,2397,해요,0,INO,'1869',2001.03.16,0,그녀와 나는요 그땐 참 어렸어요\n많이 사랑했고 때론 많이 다퉜었죠\n지금 생각하면...,True
3,2398,투비(鬪悲),0,INO,'1869',2001.03.16,0,예전처럼 다시 처음으로\n서로 몰랐던 때로 돌아가\n쉽진 않지만\n부탁이야 잊어줘\...,True
5,2401,니가 가르쳐준 것들,0,INO,'1869',2001.03.16,0,가끔 힘든 걸음으로 나\n너를 찾아가\n너의 품에 기대 울곤 했지\n그때 너는 네게...,True
...,...,...,...,...,...,...,...,...,...
135784,36110233,바래 (feat. 신현우),0,신지현,'11166283',2023.02.06,0,나를 만난 날들은 진심이었길 바래\n누굴 만나 뭘 하든지 행복하길 바래\n내 사진 ...,True
135785,36110235,파도,0,백찬열,'11166284',2023.02.06,1,환하던 세상엔 어두운 밤이 내려오고\n반짝이던 해는 먹구름에 가려지고\n푸르른 하늘...,True
135787,36110996,파이팅 해야지 (Feat. 이영지),0,부석순 (SEVENTEEN),'11166539',2023.02.06,2,파이팅 해야지\n파이팅\n아뿔싸 일어나야지 아침인데\n눈 감았다 뜨니 해가 중천인데...,True
135788,36110997,LUNCH,0,부석순 (SEVENTEEN),'11166539',2023.02.06,2,나는 지구 넌 어느 별 다른 우주 상관없어\n같은 타임라인 속에 우리 평행하게 있다...,True


In [25]:
from tqdm import tqdm
import time


# ver 2

In [34]:
## dataset class를 정의합니다.
class SongDatasetTest(Dataset):
    def __init__(self, df):
        df = df.reset_index(drop=True)
        self.song_names = df["song_name"]
        self.artists = df["artist"]
        self.lyrics = df["lyrics"]
        self.id = df["id"]
        self.genre = df["genre"]
        self.korean = df['has_kor']

    def __getitem__(self, idx):
        song_name = self.song_names[idx]
        artist = self.artists[idx]
        lyrics = self.lyrics[idx]
        id = self.id[idx]
        label = self.genre[idx]
        has_kor = 'Korean' if self.korean[idx] else 'English'
        

        return {'song_name': song_name,
                'artist': artist,
                'lyrics': lyrics,
                'id': id,
               'has_kor' : has_kor}, label

    def __len__(self):
        return len(self.song_names)

In [35]:
train_df['genre'].unique

<bound method Series.unique of 0         0
1         1
2         0
3         0
5         0
         ..
135784    0
135785    1
135787    2
135788    2
135789    4
Name: genre, Length: 94743, dtype: int64>

In [36]:
## test_dataset과 dataloader를 정의합니다.
train_datasets = SongDatasetTest(train_df)

## Inference 시에는 shuffle=False로 설정해야 출력 순서가 바뀌지 않아 제대로 된 채점이 가능합니다.
## 단, training 시에는 shuffle=True로 설정해주는 것이 좋습니다.
BATCH_SIZE_TEST = 64
train_dataloader = DataLoader(train_datasets, batch_size = BATCH_SIZE_TEST, shuffle = False)
train_datasets[5515]

({'song_name': 'Overlap',
  'artist': '컨츄리 꼬꼬',
  'lyrics': '오 그렇게 울지는 마\n더는 미안하지 않아도 돼\n돌아가고 싶던 널 그에게로\n이제는 보낼께\n둘이서라면 모든 걸\n포기했던 너잖아\n이대로는 너를 잡아둘 수 없는걸\n늘 너의 눈빛은\n나 아닌 그의 모습을\n그리워만 하면서도\n웃고 있는거야\n더 오랜 시간을 참고 기다린대도\n달라지지 않은 너를\n이젠 떠날꺼야\n나의 눈물\n눈물 닦아 주려 하지 말아줘\n오랜 시간\n혼자 너의 손길 그릴테니\n끝내 너를\n너를 잊을수는 없을지 몰라\n이젠 그에게 돌아가\n남겨진 내 걱정은 하지마\n\n오 거짓말하지는 마\n이젠 고민하지 않아도 돼\n사랑하고 싶던 널\n기억에서 이제는 지우려 해\n널 위해서라면\n죽음도 두렵지는 않지만\n이대로는 너를 사랑할순 없는 걸\n단 한 번이라도\n너 아닌 다른 사람을\n사랑한적 없는 난 혼자 남는거야\n널 버리려 했던\n그가 난 정말 싫지만\n니가 행복하다면 그걸로 끝인 거야\n내게 다시\n다시 돌아오려 하지 말아줘\n그와 함께\n이별없는 사랑하게 될테니\n이제 하루\n하루 아파해야 할지도 몰라\n제발 뒤돌아 보지마\n달려가 널 잡을지도 몰라\n나의 눈물\n눈물 닦아 주려 하지 말아줘\n오랜 시간\n혼자 너의 손길 그릴테니\n끝내 너를\n너를 잊을 수는 없을지 몰라\n이제 그에게 돌아가\n남겨진 내 걱정은 하지마\n달려가 널 잡을지도 몰라\n \n',
  'id': 90067,
  'has_kor': 'Korean'},
 2)

In [29]:
from transformers import BertForSequenceClassification, DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('bert-base-multilingual-cased', do_lower_case=False) ## model used in training phase
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=num_classes)

Downloading tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.


Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual

In [31]:
import torch.optim as optim

In [32]:
result = []

optimizer = optim.Adam(model.parameters(), lr=1e-5)

itr = 1
p_itr = 50
epochs = 3
total_loss = 0
total_len = 0
total_correct = 0

model.train()
for epoch in range(epochs):

#  test_correct_predictions, test_incorrect_predictions, count = 0, 0, 1
  for data, label in tqdm(train_dataloader):
        optimizer.zero_grad()

        lyrics = data['lyrics']
        song_name = data['song_name']
        artist = data['artist']
        has_kor = data['has_kor']
        
        inputs = song_name + '[]' + has_kor
        encoded_inputs = tokenizer(lyrics, return_tensors='pt', padding='max_length', truncation=True, max_length=100).to(device)

        label = label.to(device)
        model = model.to(device)

        outputs = model(**encoded_inputs, labels=label)
        loss = outputs.loss
        logits = outputs.logits

        pred = torch.argmax(F.softmax(logits), dim=1)
        correct = pred.eq(label)
        total_correct += correct.sum().item()
        total_len += len(label)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, total_correct/total_len))
            total_loss = 0
            total_len = 0
            total_correct = 0

        itr+=1

      # for i, output in enumerate(outputs):
      #     song = data['song_name'][i]
      #     artist = data['artist'][i]
      #     predicted = torch.topk(output, 1).indices.item()
      #     id = data['id'][i].item()


      #     output_for_print = {
      #       "Song": song,
      #       "Artist": artist,
      #       "Predicted": label2id[predicted],
      #       "Id": str(id)
      #     }

      #     output_for_submission = {
      #       "id": id,
      #       "genre": predicted
      #     }

      #     result.append(output_for_submission)

      #     print(f"{count} / {len(test_datasets)}")
      #     count += 1


  pred = torch.argmax(F.softmax(logits), dim=1)
  3%|▎         | 50/1481 [00:50<24:38,  1.03s/it]

[Epoch 1/3] Iteration 50 -> Train Loss: 1.6046, Accuracy: 0.423


  7%|▋         | 100/1481 [01:44<26:07,  1.14s/it]

[Epoch 1/3] Iteration 100 -> Train Loss: 1.4367, Accuracy: 0.453


 10%|█         | 150/1481 [02:42<25:02,  1.13s/it]

[Epoch 1/3] Iteration 150 -> Train Loss: 1.6454, Accuracy: 0.375


 14%|█▎        | 200/1481 [03:38<24:30,  1.15s/it]

[Epoch 1/3] Iteration 200 -> Train Loss: 1.5352, Accuracy: 0.463


 17%|█▋        | 250/1481 [04:35<23:19,  1.14s/it]

[Epoch 1/3] Iteration 250 -> Train Loss: 1.4584, Accuracy: 0.479


 20%|██        | 300/1481 [05:32<22:23,  1.14s/it]

[Epoch 1/3] Iteration 300 -> Train Loss: 1.4466, Accuracy: 0.462


 24%|██▎       | 350/1481 [06:29<21:29,  1.14s/it]

[Epoch 1/3] Iteration 350 -> Train Loss: 1.4949, Accuracy: 0.417


 27%|██▋       | 400/1481 [07:26<20:33,  1.14s/it]

[Epoch 1/3] Iteration 400 -> Train Loss: 1.3318, Accuracy: 0.519


 30%|███       | 450/1481 [08:23<19:38,  1.14s/it]

[Epoch 1/3] Iteration 450 -> Train Loss: 1.2640, Accuracy: 0.546


 34%|███▍      | 500/1481 [09:20<18:37,  1.14s/it]

[Epoch 1/3] Iteration 500 -> Train Loss: 1.1810, Accuracy: 0.587


 37%|███▋      | 550/1481 [10:17<17:38,  1.14s/it]

[Epoch 1/3] Iteration 550 -> Train Loss: 1.2891, Accuracy: 0.529


 40%|███▉      | 587/1481 [11:00<16:45,  1.13s/it]


In [53]:
torch.save(model.state_dict(), '/kaggle/working/model_base')

In [55]:
class SongDatasetTest_2(Dataset):
    def __init__(self, df):
        df = df.reset_index(drop=True)
        self.song_names = df["song_name"]
        self.artists = df["artist"]
        self.lyrics = df["lyrics"]
        self.id = df["id"]

    def __getitem__(self, idx):
        song_name = self.song_names[idx]
        artist = self.artists[idx]
        lyrics = self.lyrics[idx]
        id = self.id[idx]

        return {'song_name': song_name,
                'artist': artist,
                'lyrics': lyrics,
                'id': id}

    def __len__(self):
        return len(self.song_names)

In [56]:
from transformers import BertForSequenceClassification, DistilBertTokenizerFast
MODEL_PATH = '/kaggle/working/model_base' ## model weight path
MODEL_NAME = 'bert-base-multilingual-cased' ## model name
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME, do_lower_case=False) ## model used in training phase
test_model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_classes).to(device)  ## tokenizer used in training phase

### 저장된 모델의 state_dict를 불러옵니다.
device = "cuda"
test_model.load_state_dict(torch.load(MODEL_PATH))
test_model.to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClas

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [57]:
label_convert = {0 : 0, 1 : 1, 2 : 2, 3 : 4, 4 : 5, 5 : 7, 6 : 16}

In [58]:
result = []
with torch.no_grad():
    test_correct_predictions, test_incorrect_predictions, count = 0, 0, 1
    for data in tqdm(test_dataloader):
        inputs = data['lyrics']
        encoded_inputs = tokenizer(inputs, return_tensors='pt', padding='max_length', truncation=True, max_length=100).to(device)
        test_model = test_model.to(device)

        test_model.eval()
        outputs = test_model(**encoded_inputs).logits
        _, predicted = torch.max(outputs, 1)

        for i, output in enumerate(outputs):
            song = data['song_name'][i]
            artist = data['artist'][i]
            predicted = torch.topk(output, 1).indices.item()
            
            predicted = label_convert[predicted]
            
            id = data['id'][i].item()
            
            
            output_for_print = {
              "Song": song,
              "Artist": artist,
              "Predicted": label2id[predicted],
              "Id": str(id)
            }

            output_for_submission = {
              "id": id,
              "genre": predicted
            }

            result.append(output_for_submission)

            #print(f"{count} / {len(test_datasets)}")
            count += 1
            
            ### 단순 결과 출력용입니다. 제거해도 괜찮습니다.
            #pprint(output_for_print, sort_dicts=False)


100%|██████████| 42/42 [01:01<00:00,  1.47s/it]


In [59]:
df_test = pd.DataFrame(result)
RESULT_PATH = f'/kaggle/working/prediction_1.csv'
df_test.to_csv(RESULT_PATH, index=False)
df_test

Unnamed: 0,id,genre
0,33093058,7
1,2388517,4
2,31013021,16
3,34801938,0
4,4061054,4
...,...,...
10495,897937,0
10496,32362102,7
10497,354645,4
10498,33367298,5
