# **Request_LLM**

In [None]:
!pip install openai==0.28

In [None]:
import openai
import os

# OpenAI API 키 설정
openai.api_key = 'Your_own_key'

def Request_LLM(model="gpt-3.5-turbo", messages=None, temperature=0.5, max_tokens=4096):
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens
        )
        return response  # LLM의 응답 반환

    except Exception as e:
        print(f"Error: {str(e)}")
        return f"Error: {str(e)}"

    return response  # LLM의 응답 반환

# **class Reranking**

In [None]:
# Request_LLM() = Request for access to your LLM
# Due to privacy factors, this code is only used as a functional reference and cannot be run directly


class Reranking():
    def __init__(self, dataset_name, nodes=["diversity", "accuracy", "fairness", "stop", "backward"],
                 user_fea=None, his_item_fea=None, item_fea=None, top_k=10, re_history=[[], []],
                 data=None, focus="Overall Performance", history_max=5, max_count=3):
        self.nodes = nodes
        self.user_fea = user_fea
        self.his_item_fea = his_item_fea
        self.item_fea = item_fea
        self.top_k = top_k
        self.re_history = re_history
        self.data = data
        self.candidate_id = data["user_info"]["recommend_list"]
        self.focus = focus
        self.history_max = history_max
        self.max_count = max_count
        self.dataset_name = dataset_name

    def update_data(self, data):
        self.data = data
        self.candidate_id = self.data["user_info"]["recommend_list"]

    def request(self, prompt, current_name):
        print("Requesting...")
        response_ori = Request_LLM(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": prompt}
            ]
        ).choices[0].message.content
        while True:
            try:
                if current_name == 'backward':
                    print("Backwarding and redirecting...")
                    return getattr(self, response_ori + '_rerank')()

                flag = 0
                while True:
                    # LLM 응답에서 추천 결과, 이유, 다음 노드 추출
                    print("Raw Response from LLM:", response_ori)
                    response_parts = response_ori.split(";")
                    response_items = [int(i.strip()) for i in response_parts[0].split("[")[-1].split("]")[0].split(",")]
                    reasoning_sentence = response_parts[1].strip() if len(response_parts) > 1 else ""
                    operation_name = response_parts[2].lower().strip("_,. ").replace('"', '').replace("'", "") if len(response_parts) > 2 else "stop"



                    # check id
                    for re in response_items:
                        if int(re) not in self.candidate_id:
                            flag += 1
                            response_ori = Request_LLM(
                                model="gpt-3.5-turbo",
                                messages=[
                                    {"role": "user", "content": prompt},
                                    {"role": "system", "content": response_ori},
                                    {"role": "user",
                                     "content": "Reranking id '{}' in your reply does not appear in the candidates provided. "
                                                "You should make sure that your reranking list is a reorder of the original "
                                                "candidate ids, including each candidate id once and only once. "
                                                "Please check and answer again.".format(re)},
                                ]
                            ).choices[0].message.content
                            print("Response incorrect, answer id not in candidates: ",  response_items)
                            break
                    for re in self.candidate_id:
                        if int(re) not in response_items:
                            flag += 1
                            response_ori = Request_LLM(
                                model="gpt-3.5-turbo",
                                messages=[
                                    {"role": "user", "content": prompt},
                                    {"role": "system", "content": response_ori},
                                    {"role": "user",
                                     "content": "Candidate item id '{}' does not appear in the reranking list of your reply\item"
                                                "You should make sure that your reranking list is a reorder of the original "
                                                "candidate ids, including each candidate id once and only once. "
                                                "Please check and answer again.".format(str(re))},
                                ]
                            ).choices[0].message.content
                            # 디버깅용: 데이터 타입 출력
                            print("Candidate ID List:", self.candidate_id)  # 후보 영화 ID 리스트
                            print("Candidate ID Type:", type(self.candidate_id[0]))  # 리스트의 첫 번째 원소 타입 확인
                            print("Response Items:", response_items)  # LLM의 응답에서 추출한 추천 리스트
                            print("Response Items Type:", type(response_items[0]))  # 리스트의 첫 번째 원소 타입 확인

                            print("Response incorrect, candidate id not in answer: ",  response_items)
                            break
                    if flag > 0:
                        flag = 0
                        continue
                    # check id num
                    if len(response_items) != len(self.candidate_id):
                        response_ori = Request_LLM(
                            model="gpt-3.5-turbo",
                            messages=[
                                {"role": "user", "content": prompt},
                                {"role": "system", "content": response_ori},
                                {"role": "user",
                                 "content": "The number of ids in the reply does not match the required number ({}). "
                                            "You should make sure that your reranking list is a reorder of the original "
                                            "candidate ids, including each candidate id once and only once. "
                                            "Please check and answer again.".format(str(len(self.candidate_id)))},
                            ]
                        ).choices[0].message.content
                        print("Response incorrect, reranking length error: ",  response_items)
                        continue
                    # check node name
                    if operation_name not in self.nodes:
                        response_ori = Request_LLM(
                            model="gpt-3.5-turbo",
                            messages=[
                                {"role": "user", "content": prompt},
                                {"role": "system", "content": response_ori},
                                {"role": "user",
                                 "content": "The node name doesn't exist in ({}). "
                                            "You should make sure that a correct node name is followed by your reranking list. "
                                            "Please check and answer again.".format(','.join(self.nodes))},
                            ]
                        ).choices[0].message.content
                        print("Response incorrect, node name error: ", operation_name)
                        continue
                    # response correct
                    print("Response correct: ", operation_name)
                    print(f"Reasoning: {reasoning_sentence}")
                    break
            except:
                response_ori = Request_LLM(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "user", "content": prompt},
                        {"role": "system", "content": response_ori},
                        {"role": "user",
                         "content": "Answer format or is incorrect. Please check and answer again."},
                    ]
                ).choices[0].message.content
                print("Response incorrect, reranking format error: ")
                continue
            else:
                self.re_history[0].append(current_name)
                self.re_history[1].append(response_items)
                if len(self.re_history[0]) >= self.max_count:
                    operation_name = 'stop'
                return getattr(self,operation_name + '_rerank')()

    def backward_rerank(self):
        current_name = 'backward'
        self.re_history[0] = self.re_history[0][:-1]
        self.re_history[1] = self.re_history[1][:-1]

        prompt = "Considering a user, his/her basic infomation is: \n{"
        for fea in self.user_fea:
            prompt += "{}:{},".format(fea, str(self.data["user_info"][fea]))
        prompt += "}\nHis/Her history of browsing items and related features are: \n{"
        his_count = 1
        for _, item in self.data["history_items"].iterrows():
            prompt += "["
            for fea in self.his_item_fea:
                prompt += "{}:{},".format(fea, str(item[fea]))
            prompt += "]\n"
            his_count += 1
            if his_count > self.history_max:
                break
        prompt += "}\nHere's a list of the candidate items (with related features) he/she might see next: \n{"
        for i, item in self.data["candidate_items"].iterrows():
            prompt += "["
            for fea in self.item_fea:
                prompt += "{}:{},".format(fea, str(item[fea]))
            prompt += "score:{}]\n".format(i + 1)
        prompt += ("}\nNote that the 'score' feature represents the ranking position"
                  "assigned by an existing recommendation model. "
                  "Lower scores indicate higher priority in the original ranking. ")

        if len(self.re_history[0]) > 0:
            prompt += "Your historical operations and reranking results represented by item_id are as follows: {\n"
            for i in range(len(self.re_history[0])):
                #prompt += "Focus: {}. Reranking Result: [{}]\n".format(self.re_history[0][i],
                #                                                       ", ".join(self.re_history[1][i]))
                prompt += "Focus: {}. Reranking Result: [{}]\n".format(
                    self.re_history[0][i], ", ".join(map(str, self.re_history[1][i]))
                    )

            prompt += "}\n"
        else:
            prompt += ("Your historical operations and reranking results represented by item_id are as follows: \n"
                       "{No historical operations yet.}\n")
        if len(self.re_history[0]) > 0:
            prompt += (
                    "Now, you need to give suggestions about the next step of reranking from the following reranking "
                    "focus: \n{" + ', '.join(
                self.nodes) + "}\n"
                              "Specially, 'stop' means to stop reranking in the next step "
                              "and output the result of the current step as the final reranking output. And 'backward' means to "
                              "delete the latest reranking operation and result so that they are not taken into consideration by"
                              " subsequent operations.\n"
                              "Your decision should be based on your final goal of the reranking:\n{" + self.focus + "}.\n")
        else:
            nodes = self.nodes.copy()
            nodes.remove('stop')
            prompt += (
                    "Now, you need to give suggestions about the next step of reranking from the following reranking "
                    "focus: \n{" + ', '.join(
                nodes) + "}\nSpecially, 'backward' means to "
                         "delete the latest reranking operation and result so that they are not taken into consideration by"
                         " subsequent operations.\n"
                         "Your decision should be based on your final goal of the reranking:\n{" + self.focus + "}.\n")

        prompt += (
            "\nFor your response format, please only give me a word of operation name you suggest to do next from "
            "the list of reranking focus as your answer without any punctuation,"
            " and omit anything else such as your thinking and decision-making process.")

        return self.request(prompt, current_name)

    def stop_rerank(self):
        current_name = 'stop'
        re_history = self.re_history
        self.re_history = [[], []]
        response = [int(i) for i in re_history[1][-1]]
        score = [response.index(i) for i in self.candidate_id]
        return response, re_history, score

    def accuracy_rerank(self):
        current_name = 'accuracy'
        prompt = "Considering a user, his/her basic infomation is: \n{"
        for fea in self.user_fea:
            prompt += "{}:{},".format(fea, str(self.data["user_info"][fea]))
        prompt += "}\nHis/Her history of browsing items and related features are: \n{"
        his_count = 1
        for _, item in self.data["history_items"].iterrows():
            prompt += "["
            for fea in self.his_item_fea:
                prompt += "{}:{},".format(fea, str(item[fea]))
            prompt += "]\n"
            his_count += 1
            if his_count > self.history_max:
                break
        prompt += "}\nHere's a list of the candidate items (with related features) he/she might see next: \n{"
        for i, item in self.data["candidate_items"].iterrows():
            prompt += "["
            for fea in self.item_fea:
                prompt += "{}:{},".format(fea, str(item[fea]))
            prompt += "score:{}]\n".format(i + 1)
        prompt += ("}\nNote that the 'score' feature represents the ranking position"
                " assigned by an existing recommendation model. "
                "Lower scores indicate higher priority in the original ranking. \n")

        prompt += ("Considering similar users, these users have similar movie preferences to the target user:\n")
        prompt += "Similar User 1: {"
        for fea in self.user_fea:
            prompt += "{}:{},".format(fea, str(self.data["similar_user1"][fea]))
        prompt += "}\n"
        prompt += "Similar User 2: {"
        for fea in self.user_fea:
            prompt += "{}:{},".format(fea, str(self.data["similar_user2"][fea]))
        prompt += "}\n"
        prompt += "Similar User 3: {"
        for fea in self.user_fea:
            prompt += "{}:{},".format(fea, str(self.data["similar_user3"][fea]))
        prompt += "}\n"

        if len(self.re_history[0]) > 0:
            prompt += "Your historical operations and reranking results represented by item_id are as follows: {\n"
            for i in range(len(self.re_history[0])):
                #prompt += "Focus: {}. Reranking Result: [{}]\n".format(self.re_history[0][i],
                #                                                      ", ".join(self.re_history[1][i]))
                prompt += "Focus: {}. Reranking Result: [{}]\n".format(
                    self.re_history[0][i], ", ".join(map(str, self.re_history[1][i])))

            prompt += "}\n"
        else:
            prompt += ("Your historical operations and reranking results represented by item_id are as follows: \n"
                       "{No historical operations yet.}\n")

        prompt += ("Now, you need to focus on the " + current_name + " aspect (the match between the user and items) "
                                                                     "and rerank the candidates based on the given "
                                                                     "information, and then give suggestions about the next step of reranking from the following reranking "
                                                                     "focus: \n{" + ', '.join(
            self.nodes) + "}\nSpecially, 'stop' means to stop reranking in the next step "
                          "and output the result of the current step as the final reranking output. And 'backward' means to "
                          "delete the latest reranking operation and result so that they are not taken into consideration by"
                          " subsequent operations.")

        prompt += (
                "\nThe order of reranking result should represent how likely the user is to watch it."
                "For suggestions about the next step, You should choose one of "
                "the functions representing the next reranking focus or 'stop' based on the final goal of the reranking:\n{" + self.focus + "}.\n"

                                                                                                                                            "For your response format, please only give me a list of item_id (containing all item_id values in the "
                                                                                                                                            "candidates list provided above and not containing any other item_id not mentioned) in order of recommendation priority,"
                                                                                                                                            "followed by a brief explanation (one sentence) of the reasoning behind your decision, "
                                                                                                                                            "and an operation name you suggest to do next from the list of reranking focus as your answer. "
                                                                                                                                            "Your response should follow the format strictly and include no additional details. "
                                                                                                                                            "\nExample answer format for 10 candidates: [16, 1246, 536, 15, 748, 478, 899, 151, 1032, 165]; Due to the user's preference for action and drama, higher-ranked items match these genres better; stop"

                                                                                                                                            )

        return self.request(prompt, current_name)

    def diversity_rerank(self):
        current_name = 'diversity'
        prompt = "Considering a user, his/her basic infomation is: \n{"
        for fea in self.user_fea:
            prompt += "{}:{},".format(fea, str(self.data["user_info"][fea]))
        prompt += "}\nHis/Her history of browsing items and related features are: \n{"
        his_count = 1
        for _, item in self.data["history_items"].iterrows():
            prompt += "["
            for fea in self.his_item_fea:
                prompt += "{}:{},".format(fea, str(item[fea]))
            prompt += "]\n"
            his_count += 1
            if his_count > self.history_max:
                break
        prompt += "}\nHere's a list of the candidate items (with related features) he/she might see next: \n{"
        for i, item in self.data["candidate_items"].iterrows():
            prompt += "["
            for fea in self.item_fea:
                prompt += "{}:{},".format(fea, str(item[fea]))
            prompt += "score:{}]\n".format(i + 1)
        prompt += ("}\nNote that the 'score' feature represents the ranking position"
                  " assigned by an existing recommendation model. "
                  "Lower scores indicate higher priority in the original ranking. \n")

        prompt += ("Considering similar users, these users have similar movie preferences to the target user:\n")
        prompt += "Similar User 1: {"
        for fea in self.user_fea:
            prompt += "{}:{},".format(fea, str(self.data["similar_user1"][fea]))
        prompt += "}\n"
        prompt += "Similar User 2: {"
        for fea in self.user_fea:
            prompt += "{}:{},".format(fea, str(self.data["similar_user2"][fea]))
        prompt += "}\n"
        prompt += "Similar User 3: {"
        for fea in self.user_fea:
            prompt += "{}:{},".format(fea, str(self.data["similar_user3"][fea]))
        prompt += "}\n"

        if len(self.re_history[0]) > 0:
            prompt += "Your historical operations and reranking results represented by item_id are as follows: {\n"
            for i in range(len(self.re_history[0])):
                #prompt += "Focus: {}. Reranking Result: [{}]\n".format(self.re_history[0][i],
                #                                                      ", ".join(self.re_history[1][i]))
                prompt += "Focus: {}. Reranking Result: [{}]\n".format(self.re_history[0][i], ", ".join(map(str, self.re_history[1][i])))
            prompt += "}\n"
        else:
            prompt += ("Your historical operations and reranking results represented by item_id are as follows: \n"
                       "{No historical operations yet.}\n")
        if self.dataset_name == 'ml-1m':
            prompt += (
                    "Now, you need to focus on the " + current_name + " aspect. Specifically, rerank the items to ensure that "
        "items from different 'genre' categories are prioritized and placed at the top of the list.")
        prompt += (" and rerank the candidates based on the given "
                   "information, and then give suggestions about the next step of reranking from the following reranking "
                   "focus: \n{" + ', '.join(
            self.nodes) + "}\nSpecially, 'stop' means to stop reranking in the next step "
                          "and output the result of the current step as the final reranking output. And 'backward' means to "
                          "delete the latest reranking operation and result so that they are not taken into consideration by"
                          " subsequent operations.")

        prompt += (
                "\nThe order of reranking result should represent how likely the user is to watch it."
                "For suggestions about the next step, You should choose one of "
                "the functions representing the next reranking focus or 'stop' based on the final goal of the reranking:\n{" + self.focus + "}.\n"

                                                                                                                                             "For your response format, please only give me a list of item_id (containing all item_id values in the "
                                                                                                                                            "candidates list provided above and not containing any other item_id not mentioned) in order of recommendation priority, "
                                                                                                                                            "an operation name you suggest to do next from the list of reranking focus as your answer,"
                                                                                                                                            "followed by a brief explanation (one sentence) of the reasoning behind your decision, "
                                                                                                                                            "and an operation name you suggest to do next from the list of reranking focus as your answer. "
                                                                                                                                            "Your response should follow the format strictly and include no additional details. "
                                                                                                                                            "\nExample answer format for 10 candidates: [16, 1246, 536, 15, 748, 478, 899, 151, 1032, 165]; Due to the user's preference for action and drama, higher-ranked items match these genres better; stop"
                                                                                                                                            )

        return self.request(prompt, current_name)

    def fairness_rerank(self):
        current_name = 'fairness'
        prompt = "Considering a user, his/her basic infomation is: \n{"
        for fea in self.user_fea:
            prompt += "{}:{},".format(fea, str(self.data["user_info"][fea]))
        prompt += "}\nHis/Her history of browsing items and related features are: \n{"
        his_count = 1
        for _, item in self.data["history_items"].iterrows():
            prompt += "["
            for fea in self.his_item_fea:
                prompt += "{}:{},".format(fea, str(item[fea]))
            prompt += "]\n"
            his_count += 1
            if his_count > self.history_max:
                break
        prompt += "}\nHere's a list of the candidate items (with related features) he/she might see next: \n{"
        for i, item in self.data["candidate_items"].iterrows():
            prompt += "["
            for fea in self.item_fea:
                prompt += "{}:{},".format(fea, str(item[fea]))
            prompt += "score:{}]\n".format(i + 1)
        prompt += ("}\nNote that the 'score' feature represents the ranking position"
                  " assigned by an existing recommendation model. "
                  "Lower scores indicate higher priority in the original ranking. ")

        prompt += ("Considering similar users, these users have similar movie preferences to the target user:\n")
        prompt += "Similar User 1: {"
        for fea in self.user_fea:
            prompt += "{}:{},".format(fea, str(self.data["similar_user1"][fea]))
        prompt += "}\n"
        prompt += "Similar User 2: {"
        for fea in self.user_fea:
            prompt += "{}:{},".format(fea, str(self.data["similar_user2"][fea]))
        prompt += "}\n"
        prompt += "Similar User 3: {"
        for fea in self.user_fea:
            prompt += "{}:{},".format(fea, str(self.data["similar_user3"][fea]))
        prompt += "}\n"

        if len(self.re_history[0]) > 0:
            prompt += "Your historical operations and reranking results represented by item_id are as follows: {\n"
            for i in range(len(self.re_history[0])):
               # prompt += "Focus: {}. Reranking Result: [{}]\n".format(self.re_history[0][i],
               #                                                       ", ".join(self.re_history[1][i]))
                prompt += "Focus: {}. Reranking Result: [{}]\n".format(self.re_history[0][i], ", ".join(map(str, self.re_history[1][i])))
            prompt += "}\n"
        else:
            prompt += ("Your historical operations and reranking results represented by item_id are as follows: \n"
                       "{No historical operations yet.}\n")

        prompt += ( "Now, you need to focus on the " + current_name + " aspect in terms of'release_year'."
                  "(For items with 'release_year' feature before 1985 items with 'release_year' feature between 1986 and 1995, and items with 'release_year' feature after 1996, "
                   "You should keep the average ranking of the three categories in the candidate items similar) ")

        prompt += ("and rerank the candidates based on the given "
                   "information, and then give suggestions about the next step of reranking from the following reranking "
                   "focus: \n{" + ', '.join(
                       self.nodes) + "}\nSpecially, 'stop' means to stop reranking in the next step "
                       "and output the result of the current step as the final reranking output. And 'backward' means to "
                       "delete the latest reranking operation and result so that they are not taken into consideration by"
                       " subsequent operations.")

        prompt += (
                "\nThe order of reranking result should represent how likely the user is to watch it."
                "For suggestions about the next step, You should choose one of "
                "the functions representing the next reranking focus or 'stop' based on the final goal of the reranking:\n{" + self.focus + "}.\n"

                                                                                                                                             "For your response format, please only give me a list of item_id (containing all item_id values in the "
                                                                                                                                            "candidates list provided above and not containing any other item_id not mentioned) in order of recommendation priority, "
                                                                                                                                            "followed by a brief explanation (one sentence) of the reasoning behind your decision, "
                                                                                                                                            "and an operation name you suggest to do next from the list of reranking focus as your answer. "
                                                                                                                                            "Your response should follow the format strictly and include no additional details. "
                                                                                                                                            "\nExample answer format for 10 candidates: [16, 1246, 536, 15, 748, 478, 899, 151, 1032, 165]; Due to the user's preference for action and drama, higher-ranked items match these genres better; stop"

                                                                                                                                            )


        return self.request(prompt, current_name)


# **Cluster 0 데이터 가져오기**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **formatted data 생성**

In [None]:
import json
import ast
import pandas as pd

# JSON 파일 로드
with open("/content/drive/MyDrive/BOAZ_미니프로젝트1/Code정리/Phase3/NewCluster별_llm input모음/cluster0/cluster0.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# user_info
user_dict = ast.literal_eval(data['user_info'])
user_df = pd.DataFrame([user_dict])
data["user_info"] = user_df

# candidate_items, history_items
candidate_items_df = pd.read_csv("/content/drive/MyDrive/BOAZ_미니프로젝트1/Code정리/Phase3/NewCluster별_llm input모음/cluster0/cluster0_targetuser_rec.csv")
history_items_df = pd.read_csv("/content/drive/MyDrive/BOAZ_미니프로젝트1/Code정리/Phase3/NewCluster별_llm input모음/cluster0/newcluster0_targetuser_seen.csv")
data["candidate_items"] = candidate_items_df.iloc[:, 1:]
data["history_items"] = history_items_df.iloc[:, 1:]

# similar_user1
similar_user1_dict = ast.literal_eval(data['similar_user1'])
similar_user1_df = pd.DataFrame([similar_user1_dict])
data["similar_user1"] = similar_user1_df

# similar_user2
similar_user2_dict = ast.literal_eval(data['similar_user2'])
similar_user2_df = pd.DataFrame([similar_user2_dict])
data["similar_user2"] = similar_user2_df

# similar_user3
similar_user3_dict = ast.literal_eval(data['similar_user3'])
similar_user3_df = pd.DataFrame([similar_user3_dict])
data["similar_user3"] = similar_user3_df

In [None]:
import pandas as pd
import ast  # 문자열을 리스트로 변환하는 데 사용

# 데이터 변환 함수
def parse_recommend_list(recommend_str):
    """recommend_list를 문자열에서 리스트로 변환"""
    return ast.literal_eval(recommend_str) if isinstance(recommend_str, str) else recommend_str

In [None]:
formatted_data = {
    "user_info": {
        "user_id": data["user_info"]["user_id"].iloc[0],  # Series -> 값만 가져옴
        "age": data["user_info"]["age"].iloc[0],
        "gender": data["user_info"]["gender"].iloc[0],
        "job": data["user_info"]["job"].iloc[0],
        "prefer_genre1": data["user_info"]["prefer_genre1"].iloc[0],
        "prefer_genre2": data["user_info"]["prefer_genre2"].iloc[0],
        "recommend_list": parse_recommend_list(data["user_info"]["recommend_list"].iloc[0])
    },
    "history_items": pd.DataFrame(data["history_items"]).rename(columns={"movie_id": "item_id"}).reset_index(drop=True),
    "candidate_items": pd.DataFrame(data["candidate_items"]).rename(columns={"movie_id": "item_id"}).reset_index(drop=True),
    "similar_user1": {
        "user_id": data["similar_user1"]["user_id"].iloc[0],
        "age": data["similar_user1"]["age"].iloc[0],
        "gender": data["similar_user1"]["gender"].iloc[0],
        "job": data["similar_user1"]["job"].iloc[0],
        "prefer_genre1": data["similar_user1"]["prefer_genre1"].iloc[0],
        "prefer_genre2": data["similar_user1"]["prefer_genre2"].iloc[0],
        "recommend_list": parse_recommend_list(data["similar_user1"]["recommend_list"].iloc[0])
    },
    "similar_user2": {
        "user_id": data["similar_user2"]["user_id"].iloc[0],
        "age": data["similar_user2"]["age"].iloc[0],
        "gender": data["similar_user2"]["gender"].iloc[0],
        "job": data["similar_user2"]["job"].iloc[0],
        "prefer_genre1": data["similar_user2"]["prefer_genre1"].iloc[0],
        "prefer_genre2": data["similar_user2"]["prefer_genre2"].iloc[0],
        "recommend_list": parse_recommend_list(data["similar_user2"]["recommend_list"].iloc[0])
    },
    "similar_user3": {
        "user_id": data["similar_user3"]["user_id"].iloc[0],
        "age": data["similar_user3"]["age"].iloc[0],
        "gender": data["similar_user3"]["gender"].iloc[0],
        "job": data["similar_user3"]["job"].iloc[0],
        "prefer_genre1": data["similar_user3"]["prefer_genre1"].iloc[0],
        "prefer_genre2": data["similar_user3"]["prefer_genre2"].iloc[0],
        "recommend_list": parse_recommend_list(data["similar_user3"]["recommend_list"].iloc[0])
    }
}

In [None]:
formatted_data

{'user_info': {'user_id': 5613,
  'age': 35,
  'gender': 'F',
  'job': 'other or not specified',
  'prefer_genre1': 'Comedy',
  'prefer_genre2': 'Drama',
  'recommend_list': [1265, 1, 1307, 608, 527, 356, 2028, 296, 2797, 1198]},
 'history_items':    item_id  release_year language  \
 0     2791          1980  english   
 1     2000          1987  english   
 2      223          1994  english   
 3     1948          1963  english   
 4     2997          1999  english   
 5     2174          1988  english   
 6      778          1996  english   
 7     2396          1998  english   
 8     1394          1987  english   
 9     1235          1971  english   
 
                                             keywords  \
 0  chicago, alcohol, cataclysm, guitar, medicine,...   
 1  self-destruction, los angeles, police detectiv...   
 2             salesclerk, loser, aftercreditsstinger   
 3                 from rags to riches, tutor, squire   
 4  individual, transvestism, sexual identity, w

In [None]:
formatted_data['user_info']

{'user_id': 5613,
 'age': 35,
 'gender': 'F',
 'job': 'other or not specified',
 'prefer_genre1': 'Comedy',
 'prefer_genre2': 'Drama',
 'recommend_list': [1265, 1, 1307, 608, 527, 356, 2028, 296, 2797, 1198]}

## **Reranking start with Accuracy**

### **Reranking 객체 생성**

In [None]:
reranker = Reranking(
    dataset_name="ml-1m",
    user_fea=["user_id", "age", "gender", "job", "prefer_genre1", "prefer_genre2", "recommend_list"],  # 사용자 특성 리스트
    his_item_fea=["item_id", "release_year", "language", "keywords","genre"],  # 사용자의 히스토리 아이템의 특성 리스트
    item_fea=["item_id", "release_year", "language", "keywords","genre"],  # 추천 후보 아이템들의 특성 리스트
    data=formatted_data
)

In [None]:
reranker.accuracy_rerank()

Requesting...
Raw Response from LLM: [1265, 1307, 1198, 2797, 356, 296, 608, 527, 2028, 1]; The reranked items align more closely with the user's preferred genres of Comedy and Drama; stop
Response correct:  stop
Reasoning: The reranked items align more closely with the user's preferred genres of Comedy and Drama


([1265, 1307, 1198, 2797, 356, 296, 608, 527, 2028, 1],
 [['accuracy'], [[1265, 1307, 1198, 2797, 356, 296, 608, 527, 2028, 1]]],
 [0, 9, 1, 6, 7, 4, 8, 5, 3, 2])

In [None]:
formatted_data['user_info']['recommend_list']

[1265, 1, 1307, 608, 527, 356, 2028, 296, 2797, 1198]

**[기존의 ranking]**
- [1265, 1, 1307, 608, 527, 356, 2028, 296, 2797, 1198]


**[ReArrange된 ranking]**
- [1265, 1307, 1198, 2797, 356, 296, 608, 527, 2028, 1]

### **alpha-NDCG 함수 생성**

In [None]:
movies=pd.read_csv('/content/drive/MyDrive/BOAZ_미니프로젝트1/Code정리/Phase3/NewCluster별_llm input모음/cluster0/cluster0_targetuser_rec.csv',index_col=False)
movies.drop('Unnamed: 0',axis=1,inplace=True)

In [None]:
movies

Unnamed: 0,movie_id,release_year,language,keywords,genre
0,2797,1988,english,"baseball, co-worker, bronx, pinball machine, t...","Comedy, Fantasy"
1,527,1993,english,"factory, concentration camp, hero, holocaust, ...","Drama, War"
2,1,1995,english,"jealousy, toy, boy, friendship, friends, rival...","Animation, Children's, Comedy"
3,2028,1998,english,"war crimes, self sacrifice, war veteran, world...","Action, Drama, War"
4,608,1996,english,"cheating, ransom, salesclerk, winter, kidnappi...","Crime, Drama, Thriller"
5,1265,1993,english,"deja vu, groundhog, weather forecast, telecast...","Comedy, Romance"
6,1198,1981,english,"saving the world, riddle, nepal, himalaya, cai...","Action, Adventure"
7,356,1994,english,"vietnam veteran, hippie, mentally disabled, ru...","Comedy, Romance, War"
8,296,1994,english,"transporter, brothel, drug dealer, boxer, mass...","Crime, Drama"
9,1307,1989,english,"new york, wife husband relationship, restauran...","Comedy, Romance"


In [None]:
movies['release_year']=movies['release_year'].astype(int)

In [None]:
# movie id x release year matrix 만들기
# fairness에서 수정된 부분은 (1985 이전, 1986-1995, 1996 이후) 3그룹으로 수정
unique_years=['Before 1985','Btw 1986-1995','After 1996']
relevance_matrix_year=pd.DataFrame(0,index=movies['movie_id'],columns=unique_years)
# 각 영화 ID에 대해 해당 연도에 1 할당
for idx, row in movies.iterrows():
    if row["release_year"]<=1986:
      relevance_matrix_year.loc[row["movie_id"], 'Before 1985'] = 1
    if row['release_year']>1986 and row['release_year']<=1995:
      relevance_matrix_year.loc[row["movie_id"], 'Btw 1986-1995'] = 1
    if row['release_year']>=1996:
      relevance_matrix_year.loc[row["movie_id"], 'After 1996'] = 1

In [None]:
relevance_matrix_year

Unnamed: 0_level_0,Before 1985,Btw 1986-1995,After 1996
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2797,0,1,0
527,0,1,0
1,0,1,0
2028,0,0,1
608,0,0,1
1265,0,1,0
1198,1,0,0
356,0,1,0
296,0,1,0
1307,0,1,0


In [None]:
# 모든 고유 장르 추출
unique_genres = sorted(set(g for genres in movies["genre"] for g in genres.split(", ")))

# 장르 관련성 매트릭스 생성
relevance_matrix_genre = pd.DataFrame(0, index=movies["movie_id"], columns=unique_genres)

# 각 영화 ID에 대해 해당 장르에 1 할당
for idx, row in movies.iterrows():
    genres = row["genre"].split(", ")
    relevance_matrix_genre.loc[row["movie_id"], genres] = 1

In [None]:
# 🎯 alpha-nDCG 계산 함수
import numpy as np
def alpha_ndcg(ranked_items, relevance_matrix, alpha=0.7, k=10):
    num_genres = relevance_matrix.shape[1]  # 고유 장르 개수
    gain = np.zeros(len(ranked_items))  # DG 값 초기화
    ideal_gain = np.zeros(len(ranked_items))  # IDG 값 초기화
    accumulated_relevance = np.zeros(num_genres)  # 누적된 관련성 저장용

    ranked_items = ranked_items[:k]  # 상위 k개 항목만 고려

    # 🎯 실제 DG (Discounted Gain) 계산
    for i, movie in enumerate(ranked_items):
        if movie in relevance_matrix.index:
            rel_vector = relevance_matrix.loc[movie].values  # 영화의 장르 벡터
            gain[i] = np.sum(rel_vector * (1 - alpha) ** accumulated_relevance) / np.log2(i + 2)
            accumulated_relevance += rel_vector  # 누적 관련성 업데이트
    # 🎯 이상적인 IDG (Ideal Discounted Gain) 계산
    sorted_movies = relevance_matrix.sum(axis=1).sort_values(ascending=False).index.tolist()  # 이상적인 랭킹
    accumulated_relevance = np.zeros(num_genres)  # 초기화

    for i, movie in enumerate(sorted_movies[:k]):  # 상위 k개만 고려
        if movie in relevance_matrix.index:
            rel_vector = relevance_matrix.loc[movie].values
            ideal_gain[i] = np.sum(rel_vector * (1 - alpha) ** accumulated_relevance) / np.log2(i + 2)
            accumulated_relevance += rel_vector  # 누적 관련성 업데이트
    # 🎯 alpha-nDCG 계산
    dcg = np.sum(gain)
    idcg = np.sum(ideal_gain)

    alpha_ndcg_score = dcg / idcg if idcg > 0 else 0  # 정규화하여 1을 초과하지 않도록 함
    alpha_ndcg_score = min(alpha_ndcg_score, 1.0)  # 만약 1을 초과하면 1로 클리핑
    print("alpha-nDCG:")
    return round(alpha_ndcg_score,5)

### **MAD 함수 생성**

In [None]:
import numpy as np
import pandas as pd
from itertools import combinations

def calculate_mad(recommended_list, genre_matrix):
    # 추천 리스트에서 영화의 장르 벡터 가져오기
    genre_vectors = genre_matrix.loc[recommended_list].values  # (num_movies, num_genres) 행렬

    # 영화 조합(pair) 생성
    pairs = list(combinations(range(len(recommended_list)), 2))  # 영화 ID의 모든 쌍 조합
    # 각 영화 쌍의 장르 벡터 차이 계산 (L1 distance) 및 정규화
    diff_sum = 0
    num_genres = genre_matrix.shape[1]  # 장르 개수 (정규화에 사용)

    for i, j in pairs:
        l1_distance = np.sum(np.abs(genre_vectors[i] - genre_vectors[j]))  # L1 norm
        diff_sum += l1_distance / num_genres  # 🔥 장르 개수로 정규화

    # 평균 차이 계산 (조합 개수로 나눔)
    num_pairs = len(pairs)
    mad_score = diff_sum / num_pairs if num_pairs > 0 else 0

    print("MAD:")
    return round(mad_score,5)

### **alpha-NDCG, MAD 계산**

- [1265, 1, 1307, 608, 527, 356, 2028, 296, 2797, 1198]


**[ReArrange된 ranking]**
- [1265, 1307, 2797, 356, 296, 608, 2028, 527, 1198, 1]

In [None]:
# 기존의 재정렬되지 않은 rank
rank1=[1265, 1, 1307, 608, 527, 356, 2028, 296, 2797, 1198]
print('About Genre')
print(alpha_ndcg(rank1, relevance_matrix_genre, alpha=0.7))
print(calculate_mad(rank1, relevance_matrix_genre))
print('\nAbout Release Year')
print(alpha_ndcg(rank1, relevance_matrix_year, alpha=0.7))
print(calculate_mad(rank1, relevance_matrix_year))

About Genre
alpha-nDCG:
0.86034
MAD:
0.33939

About Release Year
alpha-nDCG:
0.97221
MAD:
0.34074


In [None]:
# 재정렬된 rank
rank2=[1265, 1307, 1198, 2797, 356, 296, 608, 527, 2028, 1]
print('About Genre')
print(alpha_ndcg(rank2, relevance_matrix_genre, alpha=0.7))
print(calculate_mad(rank2, relevance_matrix_genre))
print('\nAbout Release Year')
print(alpha_ndcg(rank2, relevance_matrix_year, alpha=0.7))
print(calculate_mad(rank2, relevance_matrix_year))

About Genre
alpha-nDCG:
0.80814
MAD:
0.33939

About Release Year
alpha-nDCG:
1.0
MAD:
0.34074


## **Reranking start with Fairness**

### **Reranking 객체 생성**

In [None]:
reranker = Reranking(
    dataset_name="ml-1m",
    user_fea=["user_id", "age", "gender", "job", "prefer_genre1", "prefer_genre2", "recommend_list"],  # 사용자 특성 리스트
    his_item_fea=["item_id", "release_year", "language", "keywords","genre"],  # 사용자의 히스토리 아이템의 특성 리스트
    item_fea=["item_id", "release_year", "language", "keywords","genre"],  # 추천 후보 아이템들의 특성 리스트
    data=formatted_data
)

In [None]:
reranker.fairness_rerank()

Requesting...
Raw Response from LLM: [2797, 1, 2028, 608, 356, 296, 1307, 1198, 527, 1265]; Reranked based on the fairness aspect of 'release_year' as requested; stop
Response correct:  stop
Reasoning: Reranked based on the fairness aspect of 'release_year' as requested


([2797, 1, 2028, 608, 356, 296, 1307, 1198, 527, 1265],
 [['accuracy', 'fairness'],
  [[1265, 1307, 1198, 2797, 356, 296, 608, 527, 2028, 1],
   [2797, 1, 2028, 608, 356, 296, 1307, 1198, 527, 1265]]],
 [9, 1, 6, 3, 8, 4, 2, 5, 0, 7])

### **alpha-NDCG, MAD 계산**

In [None]:
rank3=[2797, 1, 2028, 608, 356, 296, 1307, 1198, 527, 1265]
print('About Genre')
print(alpha_ndcg(rank3, relevance_matrix_genre, alpha=0.7))
print(calculate_mad(rank3, relevance_matrix_genre))
print('\nAbout Release Year')
print(alpha_ndcg(rank3, relevance_matrix_year, alpha=0.7))
print(calculate_mad(rank3, relevance_matrix_year))

About Genre
alpha-nDCG:
0.91575
MAD:
0.33939

About Release Year
alpha-nDCG:
1.0
MAD:
0.34074


## **Reranking start with Diversity**

### **Reranking 객체 생성**

In [None]:
reranker = Reranking(
    dataset_name="ml-1m",
    user_fea=["user_id", "age", "gender", "job", "prefer_genre1", "prefer_genre2", "recommend_list"],  # 사용자 특성 리스트
    his_item_fea=["item_id", "release_year", "language", "keywords","genre"],  # 사용자의 히스토리 아이템의 특성 리스트
    item_fea=["item_id", "release_year", "language", "keywords","genre"],  # 추천 후보 아이템들의 특성 리스트
    data=formatted_data
)

In [None]:
reranker.diversity_rerank()

Requesting...
Raw Response from LLM: [527, 608, 356, 296, 1307, 2028, 1, 2797, 1265, 1198]; Prioritizing diversity by ensuring items from different genre categories are placed at the top of the list; stop
Response correct:  stop
Reasoning: Prioritizing diversity by ensuring items from different genre categories are placed at the top of the list


([527, 608, 356, 296, 1307, 2028, 1, 2797, 1265, 1198],
 [['accuracy', 'fairness', 'diversity'],
  [[1265, 1307, 1198, 2797, 356, 296, 608, 527, 2028, 1],
   [2797, 1, 2028, 608, 356, 296, 1307, 1198, 527, 1265],
   [527, 608, 356, 296, 1307, 2028, 1, 2797, 1265, 1198]]],
 [8, 6, 4, 1, 0, 2, 5, 3, 7, 9])

### **alpha-NDCG, MAD 계산**

In [None]:
rank4= [527, 608, 356, 296, 1307, 2028, 1, 2797, 1265, 1198]
print('About Genre')
print(alpha_ndcg(rank4, relevance_matrix_genre, alpha=0.7))
print(calculate_mad(rank4, relevance_matrix_genre))
print('\nAbout Release Year')
print(alpha_ndcg(rank4, relevance_matrix_year, alpha=0.7))
print(calculate_mad(rank4, relevance_matrix_year))

About Genre
alpha-nDCG:
0.86794
MAD:
0.33939

About Release Year
alpha-nDCG:
1.0
MAD:
0.34074
