In [1]:
# Mount Google Drive at /content/drive (Colab-only) so the CSV files read
# later in this notebook are reachable through that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# === STEP 1: Data preparation ===
# Load the poster table (carries a `cluster` column per movie title) and the
# review table (one row per critic review).
poster = pd.read_csv('/content/drive/MyDrive/기학팀플/포스터/poster_with_cluster.csv')
data = pd.read_csv('/content/drive/MyDrive/기학팀플/movie.csv')

# Attach the poster cluster to every review via movie_title.
# The lookup is reduced to one row per title first: a repeated title in
# `poster` would otherwise multiply review rows in the left merge, and the
# duplicated reviews could end up on both sides of the train/test split.
# When a title has several poster rows, the first one is kept.
poster_clusters = poster[['movie_title', 'cluster']].drop_duplicates(subset='movie_title')
data = data.merge(poster_clusters, on='movie_title', how='left')

# Train/test split (fixed seed for reproducibility).
train, test = train_test_split(data, test_size=0.2, random_state=42)

# User-item matrix: rows are critics, columns are movies, values are the mean
# cleaned review score; (critic, movie) pairs without a review become 0.
user_movie_matrix = train.pivot_table(index='critic_name', columns='movie_id', values='review_score_cleaned').fillna(0)

# Item-item cosine similarity. The item-by-critic matrix is simply the
# transpose of the user-item matrix, so reuse it instead of pivoting again;
# this also guarantees the movie_id labels match the user-item columns, which
# the recommender's `.dot` relies on.
item_movie_matrix = user_movie_matrix.T
item_similarity = cosine_similarity(item_movie_matrix)
item_similarity_df = pd.DataFrame(item_similarity, index=item_movie_matrix.index, columns=item_movie_matrix.index)

# === STEP 2: 아이템 기반 협업 필터링 추천 함수 ===
def item_based_recommend(user_name, user_movie_matrix, item_similarity_df, n_recommendations=5):
    """Recommend unseen movies for one user with item-based collaborative filtering.

    Each movie is scored as the similarity-weighted sum of the user's ratings
    divided by that movie's total similarity to all movies. Scores are then
    min-max normalised to [0, 1] and squared to emphasise the top of the
    ranking. Normalisation is computed over all movies, including the ones
    the user has already rated, so the best unseen movie may score below 1.

    Args:
        user_name: Row label of the user in ``user_movie_matrix``.
        user_movie_matrix: DataFrame with users as rows and movie ids as
            columns; a value of 0 (or less) is treated as "not rated".
        item_similarity_df: Square DataFrame of item-item similarities whose
            index and columns carry the same movie ids as the columns of
            ``user_movie_matrix``.
        n_recommendations: Maximum number of rows to return.

    Returns:
        DataFrame with columns ``movie_id`` and ``item_score``, sorted by
        ``item_score`` in descending order. It may contain fewer than
        ``n_recommendations`` rows when not enough scorable unseen movies
        exist.

    Raises:
        ValueError: If ``user_name`` is not in ``user_movie_matrix.index``.
    """
    if user_name not in user_movie_matrix.index:
        raise ValueError(f"User '{user_name}' not found in user_movie_matrix.")

    # Ratings the user has given (0 where the user has no rating).
    user_ratings = user_movie_matrix.loc[user_name]

    # Movies whose similarity row sums to zero cannot be scored; mask the
    # denominator with NaN so they stay NaN instead of producing 0/0 or inf.
    similarity_totals = item_similarity_df.sum(axis=1)
    similarity_totals = similarity_totals.where(similarity_totals != 0)
    item_based_scores = item_similarity_df.dot(user_ratings).div(similarity_totals)

    # Min-max normalisation (min/max skip NaN). When every score is identical
    # the range is zero; shift only, so the scores become 0 rather than NaN.
    lowest = item_based_scores.min()
    score_range = item_based_scores.max() - lowest
    if score_range > 0:
        item_based_scores = (item_based_scores - lowest) / score_range
    else:
        item_based_scores = item_based_scores - lowest

    # Squaring widens the gap between strong and weak candidates.
    item_based_scores = item_based_scores ** 2

    # Exclude movies the user has already rated.
    seen_movies = user_ratings[user_ratings > 0].index
    item_based_scores = item_based_scores[~item_based_scores.index.isin(seen_movies)]

    # Never recommend a movie that could not be scored.
    item_based_scores = item_based_scores.dropna()

    # Return the highest-scoring candidates.
    recommendations = item_based_scores.reset_index()
    recommendations.columns = ['movie_id', 'item_score']
    recommendations = recommendations.sort_values(by='item_score', ascending=False).head(n_recommendations)
    return recommendations

# === STEP 3: Try the recommender for one user ===
user_name = "Ben McEachen"  # critic to generate recommendations for
n_recommendations = 5  # number of titles to show

# Request every candidate (already sorted by score, best first). Trimming to
# N happens only after duplicate titles are removed, so the final list is not
# left short when two movie ids share a title.
recommendations = item_based_recommend(
    user_name,
    user_movie_matrix,
    item_similarity_df,
    n_recommendations=len(item_similarity_df),
)

# Attach movie titles. `data` has one row per review, so merging against it
# directly would repeat each recommendation once per review; use a lookup
# with a single row per movie_id instead.
movie_titles = data[['movie_id', 'movie_title']].drop_duplicates(subset='movie_id')
recommendations = recommendations.merge(movie_titles, on='movie_id', how='left')

# Drop repeated titles, keeping the highest-scoring entry for each.
recommendations = recommendations.drop_duplicates(subset=['movie_title'])

# Keep the top N; the left merge preserved the score ordering.
recommendations = recommendations.head(n_recommendations)

# Show the result
print(f"\nTop {n_recommendations} Item-Based Recommendations for {user_name}:")
print(recommendations[['movie_title', 'item_score']])



Top 5 Item-Based Recommendations for Ben McEachen:
                                 movie_title  item_score
0                          The Grace of Jake    0.604938
1                        Grown Up Movie Star    0.092281
3                           Accidents Happen    0.070635
10    San suk si gin (The Shinjuku Incident)    0.043407
12  A Complete History of My Sexual Failures    0.043178
