유사 아이템 수 조정

In [4]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error

from google.colab import drive

# Mount Google Drive so the dataset stored under MyDrive is reachable.
drive.mount('/content/drive', force_remount=True)

# Load the review data from Drive.
file_path = '/content/drive/MyDrive/movie.csv'
df = pd.read_csv(file_path)

# Build the critic x movie score matrix. Duplicate (critic, movie) pairs are
# aggregated by pivot_table's default aggregation, and missing scores become 0.
user_item_matrix = pd.pivot_table(
    df,
    index='critic_name',
    columns='movie_id',
    values='review_score_cleaned',
)
# float32 halves the memory footprint of the dense matrix.
user_item_matrix = user_item_matrix.fillna(0).astype(np.float32)
# Sparse copy used only as input to the similarity computation.
user_item_sparse = csr_matrix(user_item_matrix)

# Item-item cosine similarity: transpose so that each row is one movie.
item_similarity = cosine_similarity(user_item_sparse.transpose())

# Rating prediction function
def predict_ratings(user_id, user_item_matrix, item_similarity, top_k_items=1500, similarity_threshold=0.2):
    """Predict a score for every item for one user with item-based filtering.

    For each item, similarities below ``similarity_threshold`` are zeroed, the
    ``top_k_items`` most similar items are selected, and the user's scores for
    those items are averaged with squared-similarity weights. Ten percent of
    the item's column mean is then added as a bias term.

    Args:
        user_id: Row label of the user in ``user_item_matrix``.
        user_item_matrix: DataFrame of scores (rows: users, columns: items).
            Every stored value, including 0, takes part in the weighted
            average as a score.
        item_similarity: Square array indexed by item position, in the same
            order as ``user_item_matrix.columns``. It is NOT modified.
        top_k_items: Number of most-similar items used per prediction.
        similarity_threshold: Similarities below this value are ignored.

    Returns:
        pd.Series of predicted scores indexed by ``user_item_matrix.columns``.

    Raises:
        KeyError: If ``user_id`` is not in ``user_item_matrix.index``.
        ValueError: If ``top_k_items`` is not positive.
    """
    # A value of 0 would make the ``[-top_k_items:]`` slice below select
    # *every* item, and a negative value would drop the best matches.
    if top_k_items <= 0:
        raise ValueError("top_k_items must be a positive integer.")

    user_ratings = user_item_matrix.loc[user_id].to_numpy()
    # Column means do not depend on the loop variable; compute them once.
    item_means = user_item_matrix.mean(axis=0).to_numpy()
    predicted_ratings = []

    for item_id in range(user_item_matrix.shape[1]):
        # Work on a copy: indexing a NumPy matrix row yields a view, and
        # thresholding it in place would permanently zero entries of the
        # caller's similarity matrix and corrupt later calls.
        item_similarities = np.array(item_similarity[item_id], copy=True)

        # Threshold filtering
        item_similarities[item_similarities < similarity_threshold] = 0

        # Select the top-K most similar items
        top_k_indices = np.argsort(item_similarities)[-top_k_items:]
        squared_weights = item_similarities[top_k_indices] ** 2

        # Weighted average with squared-similarity weights
        weighted_sum = np.dot(squared_weights, user_ratings[top_k_indices])
        norm_factor = np.sum(squared_weights)
        predicted_rating = weighted_sum / norm_factor if norm_factor > 0 else 0

        # Item-mean bias correction
        predicted_rating += item_means[item_id] * 0.1

        predicted_ratings.append(predicted_rating)
    return pd.Series(predicted_ratings, index=user_item_matrix.columns)


# Recommendation function
def recommend_items(user_id, user_item_matrix, item_similarity, top_k_items=1500, top_n=20, similarity_threshold=0.2, rating_threshold=0.5):
    """Return the best-scoring items the user has not scored yet.

    Scores come from ``predict_ratings``. Predictions below
    ``rating_threshold`` and items whose stored score for the user is
    positive are discarded. The remainder is sorted in descending order
    and cut to ``top_n`` entries.

    Returns:
        pd.Series of predicted scores indexed by item label; it may be empty.
    """
    scores = predict_ratings(user_id, user_item_matrix, item_similarity, top_k_items, similarity_threshold)

    # Keep only predictions that reach the rating threshold.
    confident = scores[scores >= rating_threshold]

    # Items with a positive stored score count as already rated by the user.
    user_row = user_item_matrix.loc[user_id]
    already_rated = user_row[user_row > 0].index
    is_new = ~confident.index.isin(already_rated)

    # Highest predictions first, limited to top_n.
    ranked = confident[is_new].sort_values(ascending=False)
    return ranked.head(top_n)



# Evaluation metrics function
def calculate_metrics(user_id, user_item_matrix, predicted_ratings, top_n=20):
    """Compute error and ranking metrics for one user's predictions.

    RMSE and MAE compare the stored and predicted scores over the items the
    user has scored (stored score > 0). Precision, recall and F1 compare the
    ``top_n`` highest predictions against that same set of scored items.

    Args:
        user_id: Row label of the user in ``user_item_matrix``.
        user_item_matrix: DataFrame of scores (rows: users, columns: items).
        predicted_ratings: pd.Series of predictions indexed by item label;
            it must contain every item the user has scored.
        top_n: Number of top predictions treated as recommendations.

    Returns:
        Tuple ``(rmse, mae, precision, recall, f1_score)``. ``rmse`` and
        ``mae`` are NaN when the user has no scored items.
    """
    actual_ratings = user_item_matrix.loc[user_id]
    watched_items = actual_ratings[actual_ratings > 0].index
    predicted_top_n = predicted_ratings.sort_values(ascending=False).head(top_n)

    # RMSE and MAE over the scored items. An empty set has no defined error,
    # so report NaN instead of failing on empty arrays.
    if len(watched_items) > 0:
        actual_values = actual_ratings.loc[watched_items].to_numpy(dtype=float)
        predicted_values = predicted_ratings.loc[watched_items].to_numpy(dtype=float)
        errors = actual_values - predicted_values
        rmse = float(np.sqrt(np.mean(errors ** 2)))
        mae = float(np.mean(np.abs(errors)))
    else:
        rmse = mae = float("nan")

    # Precision, recall and F1: a "hit" is a top-N item the user has scored.
    relevant_items = set(watched_items)
    recommended_items = set(predicted_top_n.index)
    hits = len(relevant_items & recommended_items)
    precision = hits / len(recommended_items) if recommended_items else 0
    recall = hits / len(relevant_items) if relevant_items else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return rmse, mae, precision, recall, f1_score


Mounted at /content/drive


In [8]:
# Choose the critic (a critic_name value) to build recommendations for.
user_id = 'Ben McEachen'  # name of the target critic
if not (user_id in user_item_matrix.index):
    raise ValueError(f"User '{user_id}' not found in user_item_matrix.")

print(f"'{user_id}'의 데이터가 발견되었습니다. 추천 영화를 생성합니다.")

# Generate the movie recommendations for this critic.
recommended_movies = recommend_items(
    user_id=user_id,
    user_item_matrix=user_item_matrix,
    item_similarity=item_similarity,
    top_k_items=1500,
    top_n=10,  # number of movies to recommend
    similarity_threshold=0.01,
    rating_threshold=0.1,
)

# Report the outcome: the ranked list, or a notice when nothing qualified.
if not recommended_movies.empty:
    print(f"'{user_id}'에게 추천된 영화 목록:")
    print(recommended_movies)
else:
    print(f"'{user_id}'에게 추천할 영화가 없습니다.")


'Ben McEachen'의 데이터가 발견되었습니다. 추천 영화를 생성합니다.
'Ben McEachen'에게 추천할 영화가 없습니다.


In [None]:
# Build the recommendation list for the selected critic.
recommendations = recommend_items(
    user_id=user_id,
    user_item_matrix=user_item_matrix,
    item_similarity=item_similarity,
    top_k_items=1500,  # keep the number of similar items unchanged
    top_n=450,  # slightly larger recommendation list
    similarity_threshold=0.15,  # stricter similarity filtering
    rating_threshold=0.3,  # relaxed rating threshold
)

# Score the full prediction vector with the same neighbourhood settings.
predicted_ratings = predict_ratings(
    user_id,
    user_item_matrix,
    item_similarity,
    top_k_items=1500,
    similarity_threshold=0.15,
)
metrics = calculate_metrics(user_id, user_item_matrix, predicted_ratings, top_n=450)
rmse, mae, precision, recall, f1_score = metrics

# Print the recommendations followed by each metric.
print("추천 항목:")
print(recommendations)
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")


추천 항목:
Series([], dtype: float64)
RMSE: 0.4421
MAE: 0.4066
Precision: 0.6178
Recall: 0.4303
F1 Score: 0.5073
