# Meta data preprocessing
- Meta 데이터를 LLM for Embeddings 및 LLM as RS에 활용할 수 있도록 전처리합니다.
- 각 콘텐츠의 최적화된 번호와, 전처리된 meta 데이터 설명이 매칭되는 dictionary를 만듭니다.

- input: 최적화된 시퀀스 데이터, meta 데이터, 고유 ID - 최적화된 ID를 매칭하는 dictionary
- output: 최적화된 ID - 고유 meta description을 매칭하는 dictionary

In [4]:
import pandas as pd
from tqdm import tqdm
from collections import Counter
import numpy as np
import pickle

# Augmentation된 SASRec 시퀀스와 메타데이터 호출
df = pd.read_csv("/data/log-data-2024/Final_20241127/input_search_augmented_final_20241127.txt", header = None)
meta = pd.read_csv("/data/log-data-2024/1.meta_rot_preprocessing_ksc/data/Meta_241127.csv")
with open(file = '/data/log-data-2024/Final_20241127/match.pickle', mode='rb') as f:
    match = pickle.load(f)
    
meta = meta.dropna(subset = ["new_content_id"]).reset_index(drop = True)
meta["new_content_id"] = meta["new_content_id"].apply(lambda x : int(float(x)))

# SASRec 시퀀스에 포함된 콘텐츠의 meta만을 필터링
useful = []
check = [int(float(i)) for i in list(match.keys())]
for i in tqdm(list(meta["new_content_id"])):
    if i in check:
        useful.append(1)
    else:
        useful.append(0)
meta["useful"] = useful
meta = meta[meta["useful"] == 1].reset_index(drop = True)
match = dict(zip(check, list(match.values())))
meta["optimized_new_contents_id"] = meta["new_content_id"].apply(lambda x : int(match[x]))

In [6]:
import re

# genre -> 장르가 여러개고, 여러 언어로 기록되어 있어서 한국어만 남김
korean = re.compile("[가-힣]+")
meta["genre_info"] = [korean.findall(str(i)) for i in meta["genre_info"]]

genre_dict = {"서부" : "western", "뮤직" : "music", "로맨스" : "romance", "토크쇼예능" : "talk show",
              "어드벤처" : "adventure", "판타지" : "fantasy", "예능" : "entertainment", "리얼리티" : "reality show",
              "취미" : "hobby", "애니메이션" : "animation", "스릴러" : "thriller", "어린이" : "kids", "코미디" : "comedy",
              "에로" : "erotic", "드라마" : "drama", "액션" : "action", "시사" : "current affairs", "범죄" : "crime",
              "시사다큐" : "documentary", "뉴스" : "news", "멜로로맨스" : "melodrama", "스포츠" : "sports", "전쟁" : "war",
              "레저" : "leisure", "다큐" : "documentary", "미스터리" : "mystery", "취미레저" : "hobby", "멜로" : "melodrama",
              "호러" : "horror", "전기" : "history", "토크쇼" : "talk show", "뮤지컬" : "musical"}

temp = []
for i in meta["genre_info"]:
    try:
        temp.append(list(set([genre_dict[w] for w in i])))
    except:
        temp.append("etc")
meta["genre_info"] = temp

In [7]:
# actor -> 배우 이름 / 극중 이름으로 기재되어 있어서 배우 이름만 발라냄
meta["actor_info"] = meta["actor_info"].apply(lambda x : str(x).replace("\x0b", "").replace("\x0c", ""))

actor = re.compile("[0-9].+")
non_num = re.compile("[0-9]+")

res = []
for i in tqdm(meta["actor_info"]):
    temp = i.split("Actor")
    actors = []
    for w in temp[:3]:
        try:
            actors.append(re.sub(non_num, "", actor.findall(w)[0]))
        except:
            pass
    res.append(actors)
meta["actor_info"] = res

100%|█████████████████████████████████| 30090/30090 [00:00<00:00, 184137.31it/s]


In [8]:
# director -> 개행문자 제거
res = []
for i in meta["director_info"]:
    try:
        res.append(i.split("\x0b")[1])
    except:
        res.append(i)
meta["director_info"] = res

In [9]:
temp = meta[["title_name", "genre_info", "actor_info", "director_info", "cntry_code", "description", "optimized_new_contents_id"]]
col = list(temp.columns)[1:-1]

def transform(series):
    result = ""
    for a, i in enumerate(list(series.iloc[0])[1:-1]):
        result = result + col[a] + ": "
        try:
            if i == []:
                result = result + "None" + ", "
            elif type(i) == list:
                result = result + "["
                for w in i:
                    result = result + w + ", "
                result = result[:-2] + "]" + ", "
            else:
                result = result + i + ", "
        except:
            result = result + "None" + ", "
    return result[:-2]

In [10]:
result = {}
for i in tqdm(meta["optimized_new_contents_id"]):
    result[i] = transform(temp.loc[temp[temp["optimized_new_contents_id"] == i].index])

result

100%|███████████████████████████████████| 30090/30090 [00:26<00:00, 1153.85it/s]


{1979: 'genre_info: [drama], actor_info: None, director_info: Courtney Glaude, cntry_code: US, description: The psychology of domestic abuse from the perspectives of the abused, the abuser, and the witnesses.',
 3747: 'genre_info: [mystery, thriller], actor_info: None, director_info: Peter Medak, cntry_code: US, description: Rodney Alcala commits a series of grisly murders in the 1970s. While the mother of one of his victims desperately searches for answers, Rodney maintains his intelligent, smooth-talking persona, even appearing as a bachelor on "The Dating Game."',
 9871: "genre_info: [adventure], actor_info: None, director_info: Tom Clegg, cntry_code: US, description: A British officer (Sean Bean) tries to win his company's respect and foil Napoleon's forces in Spain.",
 21010: 'genre_info: [drama], actor_info: None, director_info: Tinto Brass, cntry_code: US, description: Old passions are re-ignited when a married couple returns to 1947 Capri in search of their past lovers.',
 1744

In [None]:
# with open(file='/data/log-data-2024/Final_20241127/content_meta_dict_20241127.pickle', mode='wb') as f:
#     pickle.dump(result, f) # 매칭된 딕셔너리 저장
    
# meta.to_csv("/data/log-data-2024/Final_20241127/Meta_20241127_optimized.csv") # SASRec에 쓰인 데이터만 담고 있는 meta 데이터 저장