In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc
from tqdm import tqdm

# Transformers
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Fix the random seeds

def seed_everything(seed: int=42):
    """Seed every random number generator this notebook can touch.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and, when present, all CUDA devices), and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to every generator.

    Note:
        ``PYTHONHASHSEED`` is only read at interpreter start-up, so setting it
        here affects subprocesses spawned later, not the running kernel.
    """
    random.seed(seed)
    np.random.seed(seed)
    # The variable Python actually reads is PYTHONHASHSEED.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # torch keeps its own generators, independent of random / numpy.
    torch.manual_seed(seed)
    # Documented as silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)

# Path of the input CSV that holds the lyrics to embed
review_path = "balanced_lyrics.csv"

# Seed the random number generators once, before any other work
my_seed = 42
seed_everything(seed=my_seed)

In [None]:
# Load the pretrained BERT tokenizer and model; both come from the same
# checkpoint name so they stay in sync.
bert_checkpoint = "bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
bert_model = AutoModel.from_pretrained(bert_checkpoint)

In [None]:
def Tokenize(text_list, tokenizer, model, batch_size: int):
    """Embed texts in batches and return one vector per text.

    Each batch is tokenized (padded to the longest item in the batch and
    truncated to 512 tokens), run through ``model`` without gradient tracking,
    and the hidden state at sequence position 0 is kept (the ``[CLS]`` position
    for BERT-style tokenizers).

    Args:
        text_list: Sequence of strings to embed. It is copied into a list so
            that list-like inputs are sliced into plain lists for the tokenizer.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``.
            It is moved to CUDA when available (otherwise CPU) and put in
            eval mode.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        A list with one embedding (a list of floats) per input text, in input
        order. An empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(text_list)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device).eval()

    # torch.cat() rejects an empty list, so short-circuit when there is no work.
    if not texts:
        return []

    embeddings = []

    # Keep batch_size identical across all team members.
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]

        inputs = tokenizer(
            batch_texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            # Move results to the CPU right away so only one batch lives on the device.
            embeddings.append(cls_embeddings.cpu())

    all_embeddings = torch.cat(embeddings, dim=0).numpy().tolist()

    # Release cached device memory once, after the whole run, rather than after
    # every batch.
    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()

    return all_embeddings

In [6]:
def bert(df, column_name='text', batch_size=1):
    """Attach BERT embeddings of one text column to ``df``.

    The values of ``df[column_name]`` are passed to ``Tokenize`` together with
    the notebook-level ``bert_tokenizer`` and ``bert_model``; the result is
    stored in a ``bert_emb`` column.

    Args:
        df: DataFrame holding the texts. It is modified in place.
        column_name: Name of the column whose values are embedded.
        batch_size: Batch size forwarded to ``Tokenize``.

    Returns:
        The same ``df`` object, now carrying the ``bert_emb`` column.
    """
    print("BERT Embedding Start")
    texts = df[column_name].tolist()
    df['bert_emb'] = Tokenize(texts, bert_tokenizer, bert_model, batch_size)
    return df

In [17]:
# Read the lyrics table, using the first CSV column as the index, then show
# its dimensions and the first few rows.
balance_df = pd.read_csv(review_path, index_col=0)
n_rows, n_cols = balance_df.shape
print((n_rows, n_cols))
balance_df.head()

(104098, 18)


Unnamed: 0,track_uri,track_name,artist,user_id,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,lyrics,like
0,0qCQg5TkfBfkTsQP3IhAmC,Southside Of Heaven,Ryan Bingham,95797a27c7ec61a04f0c9437448eee7233e8a456,379160,0.387,0.499,7,-10.207,1,0.0259,0.0029,0.00373,0.0695,0.333,111.032,"When I die, Lord, oh, won't you put my soul up...",0
1,5B3UjDMiCwWFgnXqt0isd5,Martin Sheen Or JFK,Yellowcard,2e91726a1f802c680a8499544edc3561ce50b965,226746,0.256,0.756,2,-5.395,1,0.039,0.000128,0.0,0.116,0.268,197.512,Here it is\n One more glass for these broken h...,0
2,23qnota5Iyg4f9LNdNIC1D,Televators,The Mars Volta,aafd6caf4b6dad692dd93623cc5c4e5b5a919600,378786,0.306,0.541,11,-8.195,0,0.0383,0.457,6e-06,0.122,0.183,122.041,Just as he hit\n The ground\n They lowered a t...,0
3,2sy0icOIskeP2lCqgZiTyE,Talk Show Host,Radiohead,28b78e314a00f86e24ae7ab1e40392861fbdc5a1,281000,0.535,0.479,5,-14.112,0,0.0311,0.277,0.0285,0.139,0.504,88.841,I want to\n I want to be someone else or I'll ...,0
4,19olraJp56OeGTQSZwLZc1,Dying Inside,Saint Vitus,8c5d4fe93b5eebbbbe5b80fdac548920c08c395c,445126,0.313,0.406,1,-14.754,1,0.0347,0.0278,0.000437,0.125,0.218,112.856,I have got to change my ways\n 'Cause I'm losi...,0


In [10]:
# Convert the lyrics column to strings.
# Missing values are replaced with "" first: a plain string cast would turn
# them into the literal text "nan" (or leave them missing, depending on the
# pandas version), and that value would then be embedded as if it were lyrics.
balance_df['lyrics'] = balance_df['lyrics'].fillna("").astype("str")

In [12]:
# Adjust the range to your hardware and to how the work is split between people,
# e.g. 0:5000, 5000:10000 (rows 0-4999 are 5,000 rows, rows 5000-9999 are 5,000 rows, ...).
start = 0
end = len(balance_df)  # taken from the data instead of a hard-coded row count
chunk_size = 5000
num = 1

# DataFrame.to_csv does not create missing directories, so create it up front.
output_dir = "./lyrics_embeds"
os.makedirs(output_dir, exist_ok=True)

while start < end:
    chunk_end = min(start + chunk_size, end)
    name = f"df{num}"

    print(f"\n==== {name} 임베딩 시작 ({start} ~ {chunk_end}) ====")
    # Positional slice; .copy() so bert() adds its column to this chunk only.
    sub_df = balance_df.iloc[start:chunk_end].copy()

    sub_df = bert(sub_df, column_name='lyrics', batch_size=1)

    # Each chunk is written to its own file so a long run is saved piece by piece.
    save_path = f"{output_dir}/lyrics_emb_{name}.csv"  # output path for this chunk
    sub_df.to_csv(save_path)

    print(f"{save_path} 저장 완료")

    start += chunk_size
    num += 1


==== df1 임베딩 시작 (0 ~ 5000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [12:21<00:00,  6.75it/s]


./lyrics_embeds/lyrics_emb_df1.csv 저장 완료

==== df2 임베딩 시작 (5000 ~ 10000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [12:44<00:00,  6.54it/s]


./lyrics_embeds/lyrics_emb_df2.csv 저장 완료

==== df3 임베딩 시작 (10000 ~ 15000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [12:55<00:00,  6.45it/s]


./lyrics_embeds/lyrics_emb_df3.csv 저장 완료

==== df4 임베딩 시작 (15000 ~ 20000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [12:45<00:00,  6.53it/s]


./lyrics_embeds/lyrics_emb_df4.csv 저장 완료

==== df5 임베딩 시작 (20000 ~ 25000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [12:53<00:00,  6.47it/s]


./lyrics_embeds/lyrics_emb_df5.csv 저장 완료

==== df6 임베딩 시작 (25000 ~ 30000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [12:45<00:00,  6.53it/s]


./lyrics_embeds/lyrics_emb_df6.csv 저장 완료

==== df7 임베딩 시작 (30000 ~ 35000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [12:46<00:00,  6.52it/s]


./lyrics_embeds/lyrics_emb_df7.csv 저장 완료

==== df8 임베딩 시작 (35000 ~ 40000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [10:42<00:00,  7.79it/s]


./lyrics_embeds/lyrics_emb_df8.csv 저장 완료

==== df9 임베딩 시작 (40000 ~ 45000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [10:15<00:00,  8.12it/s]


./lyrics_embeds/lyrics_emb_df9.csv 저장 완료

==== df10 임베딩 시작 (45000 ~ 50000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [10:15<00:00,  8.13it/s]


./lyrics_embeds/lyrics_emb_df10.csv 저장 완료

==== df11 임베딩 시작 (50000 ~ 55000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [12:10<00:00,  6.84it/s]


./lyrics_embeds/lyrics_emb_df11.csv 저장 완료

==== df12 임베딩 시작 (55000 ~ 60000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [11:02<00:00,  7.54it/s]


./lyrics_embeds/lyrics_emb_df12.csv 저장 완료

==== df13 임베딩 시작 (60000 ~ 65000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [10:22<00:00,  8.03it/s]


./lyrics_embeds/lyrics_emb_df13.csv 저장 완료

==== df14 임베딩 시작 (65000 ~ 70000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [12:30<00:00,  6.66it/s]


./lyrics_embeds/lyrics_emb_df14.csv 저장 완료

==== df15 임베딩 시작 (70000 ~ 75000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [10:14<00:00,  8.14it/s]


./lyrics_embeds/lyrics_emb_df15.csv 저장 완료

==== df16 임베딩 시작 (75000 ~ 80000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [09:55<00:00,  8.39it/s]


./lyrics_embeds/lyrics_emb_df16.csv 저장 완료

==== df17 임베딩 시작 (80000 ~ 85000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [11:51<00:00,  7.03it/s]


./lyrics_embeds/lyrics_emb_df17.csv 저장 완료

==== df18 임베딩 시작 (85000 ~ 90000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [11:15<00:00,  7.40it/s]


./lyrics_embeds/lyrics_emb_df18.csv 저장 완료

==== df19 임베딩 시작 (90000 ~ 95000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [10:21<00:00,  8.05it/s]


./lyrics_embeds/lyrics_emb_df19.csv 저장 완료

==== df20 임베딩 시작 (95000 ~ 100000) ====
BERT Embedding Start


100%|██████████| 5000/5000 [10:15<00:00,  8.12it/s]


./lyrics_embeds/lyrics_emb_df20.csv 저장 완료

==== df21 임베딩 시작 (100000 ~ 104098) ====
BERT Embedding Start


100%|██████████| 4098/4098 [08:03<00:00,  8.47it/s]


./lyrics_embeds/lyrics_emb_df21.csv 저장 완료


In [7]:
# Merge the per-chunk CSV files back into a single DataFrame.
# The list is deliberately not called `all`: that would shadow the builtin
# all() for the rest of the kernel session.
# num_parts must match the number of chunk files written by the embedding loop.
# Note: the bert_emb column comes back from CSV as text, not as Python lists.
num_parts = 21
df_parts = [
    pd.read_csv(f"./lyrics_embeds/lyrics_emb_df{i}.csv", index_col=0)
    for i in range(1, num_parts + 1)
]

df_full = pd.concat(df_parts).reset_index(drop=True)

In [None]:
# Export the merged DataFrame to a single CSV file.
# NOTE(review): the recorded cell output shows "emb_all.csv" while this code
# writes "emb_all_.csv" — confirm which filename is intended.
save_path = "./lyrics_embeds/emb_all_.csv"
df_full.to_csv(save_path)
print(f"{save_path} 저장 완료")

./lyrics_embeds/emb_all.csv 저장 완료
