In [41]:
import time
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import math

import csv
import pandas as pd
from tqdm import tqdm  # tqdm 라이브러리 임포트
from tqdm.auto import trange
import random
from multiprocessing import Pool
import pickle
from scipy.spatial.distance import pdist, squareform
from torch_geometric.nn import LGConv

In [42]:
def get_data(path = "/content/drive/MyDrive/학교/졸업작품/"):

    business_info_file = path + 'business_info.csv' # 필라델피아 가게 정보 business_id, latitude, longitude, city, idx

    business_location = []
    with open(business_info_file, 'r', newline='') as business_file:
        csv_reader = csv.reader(business_file)
        next(csv_reader)  # 헤더 행 건너뛰기
        for row in csv_reader:
            _, latitude, longitude, _, _ = row[0], row[1], row[2], row[3].lower(), row[4] #city : 소문자로 받음

            business_location.append([latitude, longitude])
    business_location = np.array(business_location, dtype=float)

    input_file = path + "reviews.txt"
    data = []
    with open(input_file, 'r', newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file)
        for row in csv_reader:
            data.append(row)

    if not data:  # 데이터가 비어있는 경우 처리
        return

    user_history_list = []
    user_reviews_list = []
    user_ratings_list = []
    user_review_emb_list = []

    tmp_reviews = []
    tmp_ratings = []
    tmp_business_id = []
    tmp_review_emb_list = []

    before_user_id = data[0][0]  # 첫 번째 사용자 ID로 초기화
    for idx, i in enumerate(data):
        user_id, business_id, rating, review = i
        check_rating = float(rating) > 3.0
        if user_id == before_user_id:
            tmp_business_id.append(int(business_id))
            tmp_ratings.append(float(rating))
            tmp_reviews.append(review)
            tmp_review_emb_list.append((idx,check_rating))
        else:
            if len(tmp_business_id) >= 10:  # 방문 횟수가 10회가 넘는 유저만 append
                user_history_list.append(tmp_business_id)
                user_ratings_list.append(tmp_ratings)
                user_reviews_list.append(tmp_reviews)
                user_review_emb_list.append(tmp_review_emb_list)
            tmp_business_id = [int(business_id)]
            tmp_ratings = [float(rating)]
            tmp_reviews = [review]
            tmp_review_emb_list = [(idx, check_rating)]

            before_user_id = user_id  # 현재 사용자 ID로 업데이트

    # 마지막 사용자 처리
    if len(tmp_business_id) >= 10:
        user_history_list.append(tmp_business_id)
        user_ratings_list.append(tmp_ratings)
        user_reviews_list.append(tmp_reviews)
        user_review_emb_list.append(tmp_review_emb_list)
    print(len(user_history_list), len(user_ratings_list), len(user_reviews_list), len(user_review_emb_list))


    # POI가 가진 리뷰 임베딩을 획득하기 위해
    # history_list를 기준으로 POI에 방문한 사람들 list 생성
    poi_visited_list = []
    for user,history in enumerate(user_history_list):
        for idx, poi in enumerate(history):
            poi_visited_list.append([int(user), int(poi), float(user_ratings_list[user][idx]), user_reviews_list[user][idx], user_review_emb_list[user][idx]])

    poi_visited_list.sort(key = lambda x:x[1]) # poi 번호 순으로 정렬

    item_history_list = []
    item_reviews_list = []
    item_ratings_list = []
    item_review_emb_list = []


    tmp_reviews = []
    tmp_ratings = []
    tmp_user_id = []
    tmp_review_emb = []
    before_poi_id = poi_visited_list[0][1]  # 첫 번째 POI ID로 초기화

    for idx, i in enumerate(poi_visited_list):
        user_id, business_id, rating, review, review_emb = i[0], i[1], i[2], i[3], i[4]
        if business_id == before_poi_id: # 이전 POI Id와 동일하다면
            tmp_user_id.append(user_id)
            tmp_ratings.append(rating)
            tmp_reviews.append(review)
            tmp_review_emb.append(review_emb)
        else: # 이전 POI ID와 다른 POI라면
            #print(business_id)
            # 이전 POI 정보 안에 있던거 다 추가하고
            item_history_list.append(tmp_user_id)
            item_ratings_list.append(tmp_ratings)
            item_reviews_list.append(tmp_reviews)
            item_review_emb_list.append(tmp_review_emb)

            if int(business_id) - int(before_poi_id) > 1:
                for _ in range(int(business_id) - int(before_poi_id) - 1):
                    #print(f"방문 기록이 없는 POI는 PASS")
                    item_history_list.append([])
                    item_ratings_list.append([])
                    item_reviews_list.append([])
                    item_review_emb_list.append([])

            tmp_user_id = [user_id]
            tmp_ratings = [rating]
            tmp_reviews = [review]
            tmp_review_emb = [review_emb]

            before_poi_id = business_id  # 현재 사용자 ID로 업데이트

    item_history_list.append(tmp_business_id)
    item_ratings_list.append(tmp_ratings)
    item_reviews_list.append(tmp_reviews)
    item_review_emb_list.append(tmp_review_emb)

    print(len(item_history_list), len(item_ratings_list), len(item_reviews_list), len(item_review_emb_list))


    embedding_file = path + 'embeddings.npy'
    embeddings = np.load(embedding_file, mmap_mode='r')

    user_review_embs = []
    for poi, embeds in enumerate(user_review_emb_list):
        if len(embeds)>0: # 비어있지 않으면
            # 기존
            new_array = np.array([embeddings[idx] for idx, check_rating in embeds])
            new_array = np.mean(new_array, axis = 0)

            # # 변경
            # temp_list = []
            # for idx, check_rating in embeds:
            #     if check_rating:
            #         temp_list.append(embeddings[idx])
            # if len(temp_list):
            #     new_array = np.array(temp_list)
            #     new_array = np.mean(new_array, axis = 0)
            # else:
            #     new_array = np.zeros(768, dtype=np.float32)
        else:
            new_array = np.zeros(768, dtype=np.float32)
        user_review_embs.append(new_array)

    item_review_embs = []
    for poi, embeds in enumerate(item_review_emb_list):
        if len(embeds)>0: # 비어있지 않으면
            new_array = np.array([embeddings[idx] for idx, check_rating in embeds])
            new_array = np.mean(new_array, axis = 0)
        else:
            new_array = np.zeros(768, dtype=np.float32)

        item_review_embs.append(new_array.tolist())

    return user_history_list, user_ratings_list, user_reviews_list, user_review_embs, item_history_list, item_ratings_list, item_reviews_list, item_review_embs, business_location

In [43]:
class Yelp(Dataset):
    def __init__(self):
        """
        Yelp 데이터셋을 로드하고 학습 데이터와 테스트 데이터를 생성합니다.

        Args:
            dir (str): 데이터 파일이 있는 디렉토리 경로.
            splitter (str): 파일에서 열을 구분하는 구분자.
            K (int): K 값, 즉 각 사용자마다 테스트에 사용되는 상호작용의 수.
        """
        self.user_history, _, _, user_review_embeds ,_,_,_,poi_review_embeds, business_location = get_data()
        self.norm_distances = self.normalize_distances(self.calculate_distances(business_location))
        print(f"Number of users: {len(self.user_history)}")
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.train = []
        self.test = []
        self.poi_review_embeds = torch.tensor(poi_review_embeds).to(self.device)
        self.user_review_embeds = torch.tensor(user_review_embeds).to(self.device)
        #self.num_user = len(user_history_list)
        self.num_item = len(poi_review_embeds) # 14585

        self.num_users = len(self.user_history)
        self.num_items = max(max(hist) for hist in self.user_history) + 1

        self.train_data, self.test_data = self.split_data()
        self.train_edge_index = self.get_edge_index(self.train_data)
        self.test_edge_index = self.get_edge_index(self.test_data)
        print(f"Number of users in train_data: {len(self.train_data)}")
        print(f"Number of users: {self.num_users}")
        print(f"Number of items: {self.num_items}")
        print(f"Edge index shape: {self.train_edge_index.shape}")
        items = [i for i in range(self.num_item)]
        self.neg = dict()

        random.seed(30)
        for u, hist in enumerate(self.user_history):
            random.shuffle(hist)
            self.train.append(hist[:int(len(hist) * 0.7)])
            self.test.append(hist[int(len(hist) * 0.7) :])

            u_negs = set(items) - set(hist)
            self.neg[u] = list(u_negs) # ng dataset 생성

        self.index_map = []
        for u, user_items in enumerate(self.train):
            for i in user_items:
                self.index_map.append((u, i))

    def split_data(self):
        train_data = []
        test_data = []
        for user_hist in self.user_history:
            np.random.shuffle(user_hist)
            split = int(len(user_hist) * 0.8)
            train_data.append(user_hist[:split])
            test_data.append(user_hist[split:])
        return train_data, test_data

    def get_edge_index(self, data):
        user_ids = []
        item_ids = []
        for user, items in enumerate(data):
            user_ids.extend([user] * len(items))
            item_ids.extend(items)
        edge_index = torch.tensor([user_ids, item_ids], dtype=torch.long)
        return edge_index.to(self.device)  # device를 지정해주세요
    
    def __len__(self):
        """
        데이터셋의 사용자 수를 반환합니다.
        """
        #return self.num_user
        return len(self.train_data)

    def __getitem__(self, idx):
        # if idx >= len(self.train_data):
        #     raise IndexError(f"Index {idx} out of range for train_data with length {len(self.train_data)}")
        # pos_items = self.train_data[idx]
        # if not pos_items:
        #     pos_items = [random.randint(0, self.num_items - 1)]  # 빈 리스트인 경우 랜덤 아이템 선택
        # neg_items = list(set(range(self.num_items)) - set(self.user_history[idx]))
        # neg_item = random.choice(neg_items)
        # return idx, random.choice(pos_items), neg_item
        u, i = self.index_map[idx]
        # 부정적인 아이템 무작위 선택
        j = random.choice(self.neg[u])
        return (u, i, j)
    
    def haversine(self, lat1, lon1, lat2, lon2):
        R = 6371
        dlat = np.radians(lat2 - lat1)
        dlon = np.radians(lon1 - lon2)  # Note the change here
        a = np.sin(dlat / 2) ** 2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon / 2) ** 2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        d = R * c
        return d

    def calculate_distances(self, poi_data):
        t = time.time()
        distances = squareform(pdist(poi_data, lambda u, v: self.haversine(u[0], u[1], v[0], v[1])))
        print(f"calculate_distance time : {int(time.time()-t)}")
        return distances

    def normalize_distances(self, distances):
        min_d = np.min(distances)
        max_d = np.max(distances)
        norm_distances = 0.5 * (distances - min_d) / (max_d - min_d) + 0.5
        return norm_distances

In [44]:
def bpr_loss(user_emb, pos_item_emb, neg_item_emb, distance_ij, lambda_reg=1e-6):
    pos_scores = (user_emb * pos_item_emb).sum(dim=1)
    neg_scores = (user_emb * neg_item_emb).sum(dim=1)

    loss = -torch.log(torch.sigmoid(distance_ij*(pos_scores - neg_scores))).sum()
    reg_loss = lambda_reg * (user_emb.norm(2).pow(2) +
                             pos_item_emb.norm(2).pow(2) +
                             neg_item_emb.norm(2).pow(2))

    return loss + reg_loss

In [45]:
def evaluate(model, dataset, k_values=[5, 10, 20], text=False):
    model.eval()

    device = next(model.parameters()).device
    user_review_embeds = dataset.user_review_embeds.to(device)
    poi_review_embeds = dataset.poi_review_embeds.to(device)

    with torch.no_grad():
        if text:
            user_emb, item_emb = model(dataset.test_edge_index, user_review_embeds, poi_review_embeds)
        else:
            user_emb, item_emb = model(dataset.test_edge_index)

    scores = torch.matmul(user_emb, item_emb.t())

    results = {k: {'hit_ratio': 0, 'recall': 0, 'precision': 0} for k in k_values}
    num_users = 0

    for user in range(dataset.num_users):
        test_items = dataset.test_data[user]
        if not test_items:
            continue

        num_users += 1
        user_scores = scores[user].cpu().detach()

        # train_user_history = dataset.train_data[user]
        # user_scores[train_user_history] = -np.inf

        _, top_k_items = torch.topk(user_scores, max(k_values))
        
        for k in k_values:
            hit = len(set(top_k_items[:k].numpy()) & set(test_items))
            results[k]['hit_ratio'] += (hit > 0)
            results[k]['recall'] += hit / len(test_items)
            results[k]['precision'] += hit / k

    for k in k_values:
        results[k]['hit_ratio'] /= num_users
        results[k]['recall'] /= num_users
        results[k]['precision'] /= num_users

    return results

In [46]:
def train(model, dataset, num_epochs=100, batch_size=1024, lr=0.001, text=False, patience=20):
    device = next(model.parameters()).device
    optimizer = optim.Adam(model.parameters(), lr=lr)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    user_review_embeds = dataset.user_review_embeds.to(device)
    poi_review_embeds = dataset.poi_review_embeds.to(device)

    norm_distances = dataset.norm_distances
    
    # Lists to store metrics
    losses = []
    hit_ratios = {k: [] for k in [5, 10, 20]}
    recalls = {k: [] for k in [5, 10, 20]}
    precisions = {k: [] for k in [5, 10, 20]}

    # Dictionary to store best metrics
    best_metrics = {k: {'recall': 0, 'precision': 0, 'hit_ratio': 0, 'epoch': 0} for k in [5, 10, 20]}

    # Early stopping variables
    best_recall_10 = 0
    epochs_no_improve = 0
    early_stop = False

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for users, pos_items, neg_items in tqdm(data_loader):
            optimizer.zero_grad()

            users = users.to(device)
            pos_items = pos_items.to(device)
            neg_items = neg_items.to(device)
            if text:
                user_emb, item_emb = model(dataset.train_edge_index, user_review_embeds, poi_review_embeds)
            else:
                user_emb, item_emb = model(dataset.train_edge_index)

            user_emb = user_emb[users]
            pos_item_emb = item_emb[pos_items]
            neg_item_emb = item_emb[neg_items]

            # distance
            distance_ij = torch.tensor(norm_distances[pos_items, neg_items]).to(device)
            
            loss = bpr_loss(user_emb, pos_item_emb, neg_item_emb, distance_ij)

            loss.backward()
            total_loss += loss.item()
            optimizer.step()

        avg_loss = total_loss / len(data_loader)
        losses.append(avg_loss)
        
        # 검증
        results = evaluate(model, dataset, text=text)

        # print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.6f}")
        for k in results.keys():
            hit_ratios[k].append(results[k]['hit_ratio'])
            recalls[k].append(results[k]['recall'])
            precisions[k].append(results[k]['precision'])

            # Update best metrics if current recall is better
            if results[k]['recall'] > best_metrics[k]['recall']:
                best_metrics[k]['recall'] = results[k]['recall']
                best_metrics[k]['precision'] = results[k]['precision']
                best_metrics[k]['hit_ratio'] = results[k]['hit_ratio']
                best_metrics[k]['epoch'] = epoch + 1

            # print(f"HR@{k}: {results[k]['hit_ratio']:.6f}, Recall@{k}: {results[k]['recall']:.6f}, Precision@{k}: {results[k]['precision']:.6f}")

        # Early stopping check
        # racall@10기준 더이상 안오르면 중단
        if results[10]['recall'] > best_recall_10:
            best_recall_10 = results[10]['recall']
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epochs_no_improve == patience:
            # print(f"Early stopping triggered. No improvement in Recall@10 for {patience} epochs.")
            early_stop = True
            break

    # Print best metrics
    print("\nBest performance:")
    for k, metrics in best_metrics.items():
        print(f"k={k} (Epoch {metrics['epoch']}):")
        print(f"  Best Recall@{k}: {metrics['recall']:.6f}")
        print(f"  Corresponding HR@{k}: {metrics['hit_ratio']:.6f}")
        print(f"  Corresponding Precision@{k}: {metrics['precision']:.6f}")

    if early_stop:
        print(f"Training stopped early at epoch {epoch+1}")
    else:
        print("Training completed for all epochs")

    return best_metrics

In [47]:
class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, num_layers=1, embedding_dim=128):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.num_layers = num_layers

        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        self.convs = nn.ModuleList([LGConv() for _ in range(num_layers)])

        self.init_parameters()

    def init_parameters(self):
        nn.init.xavier_normal_(self.user_embedding.weight, gain=0.1)
        nn.init.xavier_normal_(self.item_embedding.weight, gain=0.1)

    def forward(self, edge_index):
        user_emb = self.user_embedding.weight
        item_emb = self.item_embedding.weight

        # 양방향 엣지 생성
        row, col = edge_index
        edge_index = torch.cat([edge_index, torch.stack([col, row])], dim=1)

        x = torch.cat([user_emb, item_emb], dim=0)
        emb_list = [x]
        for conv in self.convs:
            x = conv(x, edge_index)
            emb_list.append(x)

        emb = torch.stack(emb_list, dim=1).mean(dim=1)
        user_emb, item_emb = torch.split(emb, [self.num_users, self.num_items], dim=0)
        return user_emb, item_emb

In [48]:
class TextLightGCN(nn.Module):
    def __init__(self, num_users, num_items, num_layers=1, embedding_dim=128, text_embedding_dim=768, alpha=0.1):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.num_layers = num_layers
        self.alpha = alpha

        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        self.latent_convs = nn.ModuleList([LGConv() for _ in range(num_layers)])
        # self.text_convs = nn.ModuleList([LGConv() for _ in range(num_layers)])

        self.init_parameters()

    def init_parameters(self):
        nn.init.xavier_normal_(self.user_embedding.weight, gain=0.1)
        nn.init.xavier_normal_(self.item_embedding.weight, gain=0.1)

    def forward(self, edge_index, user_text_emb, item_text_emb):
        user_emb = self.user_embedding.weight
        item_emb = self.item_embedding.weight

        user_text_emb = user_text_emb / math.sqrt(768)
        item_text_emb = item_text_emb / math.sqrt(768)

        # Create bidirectional edges
        row, col = edge_index
        edge_index = torch.cat([edge_index, torch.stack([col, row])], dim=1)

        # Process latent embeddings through their own LightGCN
        latent_x = torch.cat([user_emb, item_emb], dim=0)
        latent_emb_list = [latent_x]
        for conv in self.latent_convs:
            latent_x = conv(latent_x, edge_index)
            latent_emb_list.append(latent_x)
        latent_emb = torch.stack(latent_emb_list, dim=1).mean(dim=1)
        latent_user_emb, latent_item_emb = torch.split(latent_emb, [self.num_users, self.num_items], dim=0)

        # # Process text projections through their own LightGCN
        # text_x = torch.cat([user_text_proj, item_text_proj], dim=0)
        # text_emb_list = [text_x]
        # for conv in self.text_convs:
        #     text_x = conv(text_x, edge_index)
        #     text_emb_list.append(text_x)
        # text_emb = torch.stack(text_emb_list, dim=1).mean(dim=1)
        # text_user_emb, text_item_emb = torch.split(text_emb, [self.num_users, self.num_items], dim=0)

        #print(latent_user_emb.shape, user_text_emb.shape)
        #print(latent_item_emb.shape, item_text_emb.shape)


        # Concatenate final user and item embeddings
        final_user_emb = torch.cat([self.alpha * latent_user_emb , (1-self.alpha) * user_text_emb], dim=1)
        final_item_emb = torch.cat([latent_item_emb, item_text_emb], dim=1)

        return final_user_emb, final_item_emb

In [49]:
np.random.seed(30)
yelp = Yelp()

37685 37685 37685 37685
14585 14585 14585 14585
Number of users: 37685
Number of users in train_data: 37685
Number of users: 37685
Number of items: 14585
Edge index shape: torch.Size([2, 470029])


In [50]:
# 기존 한 유저당 1개의 긍정, 부정 데이터셋 #406548
len(yelp)

37685

In [51]:
def hyperparameter_tuning(model_class, dataset, lr_values, num_epochs, batch_size, text):
    results = []
    best_lr = None
    best_recall_10 = 0
    
    for lr in lr_values:
        print(f"\nTraining with learning rate: {lr}")
        model = model_class(dataset.num_users, dataset.num_items).to('cuda' if torch.cuda.is_available() else 'cpu')
        best_metrics = train(model, dataset, num_epochs=num_epochs, batch_size=batch_size, lr=lr, text=text)
        
        for k, metrics in best_metrics.items():
            results.append({
                'model': model_class.__name__,
                'lr': lr,
                'k': k,
                'recall': metrics['recall'],
                'precision': metrics['precision'],
                'hit_ratio': metrics['hit_ratio'],
                'epoch': metrics['epoch']
            })
            
            # Recall@10을 기준으로 최고의 lr 저장
            if k == 10 and metrics['recall'] > best_recall_10:
                best_recall_10 = metrics['recall']
                best_lr = lr
    
    # Convert results to DataFrame
    df = pd.DataFrame(results)
    
    # Save results to CSV
    filename = f"{model_class.__name__}_results.csv"
    df.to_csv(filename, index=False)
    print(f"Results saved to {filename}")
    
    return best_lr

# 학습률 값들 정의
lr_values = [0.01, 0.005, 0.001, 0.0005, 0.0001]

# 기본 LightGCN
best_lr_lightgcn = hyperparameter_tuning(LightGCN, yelp, lr_values, num_epochs=100, batch_size=32, text=False)

# TextLightGCN
best_lr_textlightgcn = hyperparameter_tuning(TextLightGCN, yelp, lr_values, num_epochs=300, batch_size=32, text=True)

print(f"Best learning rate for LightGCN: {best_lr_lightgcn}")
print(f"Best learning rate for TextLightGCN: {best_lr_textlightgcn}")


Training with learning rate: 0.01


100%|██████████| 1177/1177 [00:25<00:00, 46.03it/s]
100%|██████████| 1177/1177 [00:24<00:00, 48.28it/s]
100%|██████████| 1177/1177 [00:23<00:00, 49.53it/s]
100%|██████████| 1177/1177 [00:23<00:00, 50.46it/s]
100%|██████████| 1177/1177 [00:23<00:00, 50.73it/s]
100%|██████████| 1177/1177 [00:23<00:00, 50.64it/s]
100%|██████████| 1177/1177 [00:23<00:00, 50.75it/s]
100%|██████████| 1177/1177 [00:23<00:00, 49.10it/s]
100%|██████████| 1177/1177 [00:23<00:00, 49.10it/s]
100%|██████████| 1177/1177 [00:23<00:00, 49.35it/s]
100%|██████████| 1177/1177 [00:23<00:00, 49.20it/s]
100%|██████████| 1177/1177 [00:23<00:00, 49.26it/s]
100%|██████████| 1177/1177 [00:23<00:00, 49.76it/s]
100%|██████████| 1177/1177 [00:23<00:00, 49.52it/s]
100%|██████████| 1177/1177 [00:23<00:00, 49.27it/s]
100%|██████████| 1177/1177 [00:23<00:00, 49.31it/s]
100%|██████████| 1177/1177 [00:24<00:00, 48.89it/s]
100%|██████████| 1177/1177 [00:23<00:00, 49.15it/s]
100%|██████████| 1177/1177 [00:23<00:00, 49.24it/s]
100%|███████

KeyboardInterrupt: 

In [None]:
def hyperparameter_tuning_alpha(dataset, alpha_values, num_epochs=300, batch_size=32, lr=0.01):
    results = []
    
    for alpha in alpha_values:
        print(f"\nTraining with alpha: {alpha}")
        model = TextLightGCN(dataset.num_users, dataset.num_items, alpha=alpha).to('cuda' if torch.cuda.is_available() else 'cpu')
        
        # train 함수를 수정하여 alpha 값을 전달할 수 있도록 합니다.
        best_metrics = train(model, dataset, num_epochs=num_epochs, batch_size=batch_size, lr=lr, text=True)
        
        for k, metrics in best_metrics.items():
            results.append({
                'model': 'TextLightGCN',
                'alpha': alpha,
                'k': k,
                'recall': metrics['recall'],
                'precision': metrics['precision'],
                'hit_ratio': metrics['hit_ratio'],
                'epoch': metrics['epoch'],
                'lr': lr
            })
    
    # Convert results to DataFrame
    df = pd.DataFrame(results)
    
    # Save results to CSV
    filename = "TextLightGCN_alpha_tuning_results.csv"
    df.to_csv(filename, index=False)
    print(f"Results saved to {filename}")

# alpha 값들 정의
alpha_values = [0.1, 0.5, 0.9]

# TextLightGCN에 대한 alpha 튜닝 실행
hyperparameter_tuning_alpha(yelp, alpha_values, num_epochs=300, batch_size=32, lr=best_lr_textlightgcn)

In [None]:
def hyperparameter_tuning_embedding_dim(model_class, dataset, embedding_dims, num_epochs=300, batch_size=32, lr=0.01, text=True):
    results = []
    
    for embedding_dim in embedding_dims:
        print(f"\nTraining with embedding dimension: {embedding_dim}")
        model = model_class(dataset.num_users, dataset.num_items, embedding_dim=embedding_dim).to('cuda' if torch.cuda.is_available() else 'cpu')
        
        # train 함수를 수정하여 embedding_dim 값을 전달할 수 있도록 합니다.
        best_metrics = train(model, dataset, num_epochs=num_epochs, batch_size=batch_size, lr=lr, text=text)
        
        for k, metrics in best_metrics.items():
            results.append({
                'model': model_class.__name__,
                'embedding_dim': embedding_dim,
                'k': k,
                'recall': metrics['recall'],
                'precision': metrics['precision'],
                'hit_ratio': metrics['hit_ratio'],
                'epoch': metrics['epoch'],
                'lr': lr
            })
    
    # Convert results to DataFrame
    df = pd.DataFrame(results)
    
    # Save results to CSV
    filename = f"{model_class.__name__}_embedding_dim_tuning_results.csv"
    df.to_csv(filename, index=False)
    print(f"Results saved to {filename}")

# 임베딩 차원 값들 정의
embedding_dims = [64, 128, 256, 896]
embedding_dims_text = [64, 128, 256]

# 기본 LightGCN에 대한 임베딩 차원 튜닝 실행
hyperparameter_tuning_embedding_dim(LightGCN, yelp, embedding_dims, num_epochs=100, batch_size=32, lr=best_lr_lightgcn, text=False)

# TextLightGCN에 대한 임베딩 차원 튜닝 실행
hyperparameter_tuning_embedding_dim(TextLightGCN, yelp, embedding_dims_text, num_epochs=300, batch_size=32, lr=best_lr_textlightgcn, text=True)