In [3]:
import numpy as np
import pandas as pd
import json

In [4]:
# Read the popular-movies dump (UTF-8 JSON) into a plain Python object.
with open("./result/popular.json", mode="r", encoding="utf-8") as json_file:
    data = json.loads(json_file.read())


In [16]:
# Build the working frame from the payload's "movies" entry, then list its columns.
df = pd.DataFrame(data=data["movies"])
df.columns

Index(['adult', 'backdrop_path', 'genre_ids', 'id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'release_date', 'title', 'video', 'vote_average', 'vote_count'],
      dtype='object')

In [17]:
# Peek at the first record to see what each column holds.
df.head(n=1)

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/8J6UlIFcU7eZfq9iCLbgc8Auklg.jpg,"[14, 10751, 28]",1087192,en,How to Train Your Dragon,수백년간 지속되어온 바이킹과 드래곤의 전쟁. 드래곤을 없애는 것이 삶의 모든 목적인...,682.6797,/8vywrRg1wrY4fo7EqgrFmUJgchG.jpg,2025-06-06,드래곤 길들이기,False,8.1,1348


In [20]:
# Embedding input: inspect the per-movie genre-id lists.
df.loc[:, "genre_ids"]

0            [14, 10751, 28]
1           [16, 28, 14, 53]
2       [10751, 878, 35, 12]
3              [28, 878, 53]
4                       [35]
                ...         
9992                    [99]
9993         [35, 18, 10749]
9994                [35, 18]
9995            [53, 80, 27]
9996                    [37]
Name: genre_ids, Length: 9997, dtype: object

In [21]:
import torch
# Environment sanity check: print the installed PyTorch version and whether
# torch reports a usable CUDA device.
print(torch.__version__)
print("CUDA 사용 가능:", torch.cuda.is_available())

2.1.0
CUDA 사용 가능: True


In [30]:
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from collections import defaultdict

class _GenreIndex(dict):
    """Genre-id -> embedding-row lookup that resolves unknown ids to row 0.

    Unlike ``defaultdict(lambda: 0)``, a miss does NOT insert the key, so the
    vocabulary never grows as a side effect of a lookup, and the mapping
    contains no lambda (which would make the owning module unpicklable).
    """

    def __missing__(self, key):
        return 0


class GenreEmbeddingModule(nn.Module):
    """Embed variable-length lists of genre ids into one fixed-size vector each.

    Every known genre id gets its own embedding row; a movie's vector is the
    mean of the rows of its known genres. Row 0 is reserved for both padding
    and unknown ids, is pinned to zeros via ``padding_idx`` and is excluded
    from the mean.

    Args:
        genre_id_set: Iterable of sortable genre ids that make up the vocabulary.
        emb_dim: Size of each embedding vector.
    """

    def __init__(self, genre_id_set, emb_dim=32):
        super().__init__()

        # Known genres occupy rows 1..N (sorted for a deterministic layout);
        # row 0 is shared by padding and unknown ids.
        genre2idx = {g: idx + 1 for idx, g in enumerate(sorted(genre_id_set))}
        genre2idx['UNK'] = 0
        self.genre2idx = _GenreIndex(genre2idx)  # misses resolve to 0 without inserting

        self.embedding = nn.Embedding(num_embeddings=len(genre2idx), embedding_dim=emb_dim, padding_idx=0)

    def forward(self, genre_ids_batch):
        """
        Args:
            genre_ids_batch: List[List[int]] - one list of genre ids per item.
                Inner lists may be empty and may contain ids outside the
                vocabulary.

        Returns:
            Tensor [batch_size, emb_dim] on the module's device. Items with no
            known genre yield an all-zero vector; an empty batch yields a
            tensor of shape [0, emb_dim].
        """
        weight = self.embedding.weight

        # Map raw genre ids to embedding rows (unknown -> 0).
        mapped_tensors = [
            torch.tensor([self.genre2idx[g] for g in row], dtype=torch.long)
            for row in genre_ids_batch
        ]

        # pad_sequence rejects an empty list, so short-circuit an empty batch.
        if not mapped_tensors:
            return weight.new_zeros((0, self.embedding.embedding_dim))

        # Right-pad with 0 (the padding row) to a rectangular [batch, max_len].
        padded = rnn_utils.pad_sequence(mapped_tensors, batch_first=True).to(weight.device)

        emb = self.embedding(padded)  # [batch, max_len, emb_dim]

        # Masked mean over the sequence axis: only non-padding positions count.
        mask = (padded != 0).unsqueeze(-1).to(emb.dtype)  # [batch, max_len, 1]
        summed = (emb * mask).sum(dim=1)                  # [batch, emb_dim]
        count = mask.sum(dim=1).clamp(min=1)              # [batch, 1]; avoids 0/0 for empty rows
        return summed / count


In [31]:
# Collect every distinct genre id that occurs in the dataset.
genre_set = {g for row in df['genre_ids'] for g in row}

# Fix the RNG seed so the randomly initialised embedding table (and every
# vector derived from it) is identical across kernel restarts.
torch.manual_seed(42)

# Build the module and move it to the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GenreEmbeddingModule(genre_set, emb_dim=32).to(device)

# Embed each movie's genre list -> [batch, emb_dim]
genre_vecs = model(df['genre_ids'].tolist())
print("Genre Embedding Shape:", genre_vecs.shape)
print("예시:\n", genre_vecs[:2])

Genre Embedding Shape: torch.Size([9997, 32])
예시:
 tensor([[-0.0675,  0.0464, -0.0231, -0.1961,  0.9570, -1.4395, -0.3187,  0.1094,
          0.3804,  0.2982, -0.3163, -0.0185,  0.2231,  0.3647,  0.0642, -0.5144,
         -0.4472,  0.3527,  0.6247, -1.7262,  0.4082,  1.2975,  0.2890,  0.5945,
         -0.1189, -0.3098, -0.1879, -0.0262,  0.8080, -0.6914, -0.6762, -0.1737],
        [-0.7604, -0.0712,  0.1615, -0.4516,  0.3063, -0.9676, -0.4987, -0.0731,
          1.3108, -0.0363, -0.1835, -0.8307, -0.0855, -0.0871, -0.9562, -0.6055,
         -0.2573,  0.0162, -0.2572, -0.7812, -0.1089,  1.2441,  0.1315,  0.0052,
         -0.3043, -0.7218,  0.1731,  0.0220, -0.0273,  0.2439, -0.2335, -0.6410]],
       device='cuda:0', grad_fn=<SliceBackward0>)


In [46]:
# Copy the embeddings to host memory as a NumPy array and wrap them in a
# DataFrame with one emb_<i> column per embedding dimension.
genre_emb_df = pd.DataFrame(
    genre_vecs.detach().cpu().numpy(),
    columns=[f"emb_{i}" for i in range(genre_vecs.shape[1])],
)

In [47]:
# Display the embedding frame; the recorded output below is the truncated
# repr (rows and columns elided with "...").
genre_emb_df

Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_22,emb_23,emb_24,emb_25,emb_26,emb_27,emb_28,emb_29,emb_30,emb_31
0,-0.067541,0.046363,-0.023060,-0.196135,0.957031,-1.439456,-0.318664,0.109393,0.380443,0.298166,...,0.288983,0.594515,-0.118905,-0.309777,-0.187888,-0.026216,0.808040,-0.691377,-0.676238,-0.173697
1,-0.760369,-0.071173,0.161451,-0.451623,0.306316,-0.967556,-0.498682,-0.073078,1.310758,-0.036324,...,0.131476,0.005180,-0.304304,-0.721784,0.173137,0.021988,-0.027327,0.243930,-0.233470,-0.640953
2,1.247500,-0.342967,0.002581,-0.064039,-0.595275,-0.554840,0.190544,-0.622862,-0.538322,0.527653,...,0.700922,-0.546264,0.017262,-0.782922,-0.105282,0.118202,0.396109,0.296547,-0.516222,-0.266299
3,0.725913,0.490650,-0.238426,-0.245266,-0.355309,-1.118051,-0.985181,-0.448578,0.616038,0.137767,...,0.626577,-0.440025,-0.997606,-1.353975,0.095489,0.107502,-0.318818,0.890287,-0.757823,0.220186
4,1.088601,-2.512655,-0.985147,0.872821,-0.597138,-0.209508,1.303088,-1.079794,-0.061780,1.584877,...,-0.373509,-0.492546,-1.451239,-1.558661,1.543270,-0.432466,1.001483,0.105110,-0.353407,-0.470252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9992,-0.111941,0.188631,-0.641837,-0.202625,-0.818375,-0.288439,1.075934,-0.120698,1.264510,0.873380,...,0.003899,-0.425396,0.965118,0.412041,-0.689365,1.356174,0.597785,-1.158859,-0.242969,-0.392818
9993,-0.027647,-0.761730,0.016899,0.454269,-0.318463,-0.200570,0.985084,-0.836277,0.255424,0.702736,...,0.122011,-0.350132,-1.111038,-0.761130,0.767510,-0.015463,0.641642,0.308385,0.142201,-0.227694
9994,0.131468,-1.303262,-0.495489,0.439748,-0.024751,-0.202926,0.881126,-0.442212,0.197668,0.448099,...,-0.562248,-0.197272,-0.562486,-1.064052,1.203154,0.311104,0.661379,1.079791,0.166068,-0.206998
9995,-0.141282,0.023924,0.407277,-0.687252,0.958752,-1.433454,-0.778722,0.157742,0.812371,-0.032331,...,-0.143657,0.021474,-0.051579,0.217323,0.232293,-0.017911,-0.048681,0.903769,0.388418,-0.843421


In [33]:
# Columns to discard before feature engineering.
drop_features = ["backdrop_path", "poster_path", "original_title", "id"]
# Rebind instead of mutating in place, and ignore labels that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=drop_features, errors="ignore")

In [34]:
# Confirm which columns remain after the drop.
df.head(n=5)

Unnamed: 0,adult,genre_ids,original_language,overview,popularity,release_date,title,video,vote_average,vote_count
0,False,"[14, 10751, 28]",en,수백년간 지속되어온 바이킹과 드래곤의 전쟁. 드래곤을 없애는 것이 삶의 모든 목적인...,682.6797,2025-06-06,드래곤 길들이기,False,8.1,1348
1,False,"[16, 28, 14, 53]",ja,혈귀로 변해버린 여동생 네즈코를 인간으로 되돌리기 위해 혈귀를 사냥하는 조직인 《귀...,618.9138,2025-08-22,극장판 귀멸의 칼날: 무한성편,False,6.897,39
2,False,"[10751, 878, 35, 12]",en,"보송보송한 파란 솜털, 호기심 가득한 큰 눈, 장난기 가득한 웃음을 가졌지만 가장 ...",491.2977,2025-05-21,릴로 & 스티치,False,7.343,1172
3,False,"[28, 878, 53]",en,강제 종료 후 다시 돌아온 돌AI ‘메간’. 쌍돌AI ‘아멜리아’의 위협으로 ‘메간...,429.1812,2025-07-16,메간 2.0,False,7.6,549
4,False,[35],en,해피 길모어가 골프를 그만뒀을까? 천만의 말씀. 다혈질의 골프 레전드로 분한 애덤 ...,422.7913,2025-07-25,해피 길모어 2,False,6.749,315


In [49]:
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re

# 1. Normalisation helper
def clean_korean_text(text):
    """Keep only Hangul syllables and whitespace from ``text``.

    The argument is coerced with ``str()`` first, every character outside the
    range 가-힣 that is not whitespace is deleted, and the result is returned
    with leading and trailing whitespace stripped.
    """
    hangul_only = re.compile(r'[^가-힣\s]').sub('', str(text))
    return hangul_only.strip()

# 2. Initialise the morphological analyser.
# NOTE(review): the recorded run of this cell ended with JVMNotFoundException
# (libjvm.so not found). Presumably it is raised here because Okt needs a
# JVM - confirm a JDK is installed and JAVA_HOME is set before running.
okt = Okt()

# 3. Tokenizer function
def okt_tokenizer(text):
    """Return the result of ``okt.nouns(text)``, i.e. noun tokens only."""
    nouns = okt.nouns(text)
    return nouns

# 4. Preprocessing: treat missing overviews as empty strings, then run each
#    overview through clean_korean_text.
df['overview_clean'] = (
    df['overview']
    .fillna("")
    .map(clean_korean_text)
)

JVMNotFoundException: No JVM shared library file (libjvm.so) found. Try setting up the JAVA_HOME environment variable properly.