In [None]:
# Mount Google Drive at /content/drive; the CSV paths read later in this
# notebook live under that mount point. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

import warnings
from pathlib import Path

import numpy as np
import pandas as pd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# Load the input tables.
# DATA_DIR is the single place to edit when the shared Drive folder moves;
# the resolved paths are identical to the previously hard-coded ones.
DATA_DIR = Path('/content/drive/MyDrive/기학팀플')

# poster supplies the movie_title -> cluster mapping that is merged into data.
poster = pd.read_csv(DATA_DIR / '포스터' / 'poster_with_cluster.csv')
data = pd.read_csv(DATA_DIR / 'movie.csv')


In [None]:

# Attach the poster cluster label to data, keyed on movie_title.
# - Keep a single poster row per title so the left join cannot multiply the
#   rows of data when poster lists a title more than once.
# - Drop any 'cluster' column already on data so that re-running this cell
#   does not produce cluster_x / cluster_y instead of 'cluster'.
poster_clusters = poster[['movie_title', 'cluster']].drop_duplicates(subset='movie_title')
data = (
    data.drop(columns=['cluster'], errors='ignore')
        .merge(poster_clusters, on='movie_title', how='left')
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Columns kept for modelling, plus every column whose name starts with 'genre_'.
essential_cols = ['critic_name', 'movie_id', 'movie_title', 'review_score_cleaned', 'runtime', 'audience_rating',
                  'tomatometer_rating', 'predicted_emotion', 'movie_info_keyword', 'audience_status_Spilled',
                  'audience_status_Upright', 'cluster'] + [col for col in data.columns if col.startswith('genre_')]
# Explicit copy: the fills below would otherwise be written into a
# column-selected slice of the merged frame (SettingWithCopyWarning).
data = data[essential_cols].copy()

# Missing values: mean-impute these three columns, then drop every row that
# still has a missing value in any other column and renumber the index.
mean_imputed_cols = ['review_score_cleaned', 'runtime', 'audience_rating']
data = data.fillna(data[mean_imputed_cols].mean())
data = data.dropna().reset_index(drop=True)

# Label-encode predicted_emotion into integer codes.
le = LabelEncoder()
data['predicted_emotion'] = le.fit_transform(data['predicted_emotion'])

# TF-IDF over the keyword text, capped at 200 vocabulary terms.
tfidf = TfidfVectorizer(max_features=200)
tfidf_matrix = tfidf.fit_transform(data['movie_info_keyword'])

# A vocabulary term may coincide with an existing column name (for example
# 'runtime'); only such terms get a 'tfidf_' prefix so the concat below never
# produces duplicate column labels. All other term names are left unchanged.
existing_cols = set(data.columns)
tfidf_cols = [f'tfidf_{term}' if term in existing_cols else term
              for term in tfidf.get_feature_names_out()]
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_cols, index=data.index)

# Append the TF-IDF columns and drop the raw keyword text.
# data already has a fresh RangeIndex, so no further reset is needed.
data = pd.concat([data, tfidf_df], axis=1).drop(columns=['movie_info_keyword'])

# Content table: every numeric column plus the three identifier columns.
# NOTE(review): the numeric features are used unscaled; with cosine similarity
# the large-magnitude columns (e.g. runtime and the rating columns) may
# dominate the score. Consider scaling them - confirm with the team.
numeric_cols = data.select_dtypes(include=[np.number]).columns
content_data = data[numeric_cols].copy()
content_data['movie_id'] = data['movie_id']
content_data['critic_name'] = data['critic_name']
content_data['movie_title'] = data['movie_title']


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def content_based_recommend_by_movie(movie_title, content_data, n_recommendations=5, debug=True):
    """Recommend the movies whose feature rows are most cosine-similar to a title.

    NOTE: a later cell defines a function with the same name; once that cell
    runs, it replaces this definition.

    Args:
        movie_title: Title looked up in ``content_data['movie_title']``.
        content_data: DataFrame holding 'critic_name', 'movie_id' and
            'movie_title' plus feature columns. Every other column is passed
            to ``cosine_similarity`` and therefore has to be numeric.
        n_recommendations: Maximum number of rows returned.
        debug: Print the intermediate shapes. Defaults to True so existing
            calls keep producing the same output.

    Returns:
        DataFrame with columns 'movie_id', 'content_score' and 'movie_title',
        at most one row per movie_id, ordered by content_score descending.

    Raises:
        ValueError: If no row of content_data has the given title.
    """
    id_cols = ['critic_name', 'movie_id', 'movie_title']

    if debug:
        print(f"Initial Content Data Shape: {content_data.shape}")

    # Remove fully identical rows
    content_data = content_data.drop_duplicates().reset_index(drop=True)
    if debug:
        print(f"After Dropping Duplicates: {content_data.shape}")

    # The first row carrying the title is used as the query vector
    movie_rows = content_data[content_data['movie_title'] == movie_title]
    if movie_rows.empty:
        raise ValueError(f"Movie '{movie_title}' not found in content_data.")

    # iloc[[0]] keeps a 2-D (1, n_features) shape, as cosine_similarity expects
    movie_features = movie_rows.iloc[[0]].drop(columns=id_cols).values
    content_features = content_data.drop(columns=id_cols).values

    if debug:
        print(f"Content Features Shape: {content_features.shape}")
        print(f"Movie Features Shape: {movie_features.shape}")

    # Similarity of every row to the query row
    content_scores = cosine_similarity(content_features, movie_features).ravel()
    if debug:
        print(f"Content Scores Shape: {content_scores.shape}")

    content_recommendations = pd.DataFrame({
        'movie_id': content_data['movie_id'].values,
        'content_score': content_scores,
        'movie_title': content_data['movie_title'].values
    })

    # Exclude the queried movie itself
    content_recommendations = content_recommendations[content_recommendations['movie_title'] != movie_title]

    # Sort BEFORE de-duplicating: a movie_id can appear on several rows with
    # different scores, and dropping duplicates first would keep whichever row
    # happens to come first instead of the best-scoring one. The stable sort
    # keeps the original row order among equal scores.
    return (
        content_recommendations
        .sort_values(by='content_score', ascending=False, kind='mergesort')
        .drop_duplicates(subset=['movie_id'])
        .head(n_recommendations)
    )


# Run the recommender for a single movie title
movie_title = "Inception"
recommendations = content_based_recommend_by_movie(movie_title, content_data, n_recommendations=5)

# Print the recommended titles together with their similarity scores
print(f"\nMovies similar to '{movie_title}':")
print(recommendations[['movie_title', 'content_score']])


Initial Content Data Shape: (675698, 81)
After Dropping Duplicates: (675295, 81)
Content Features Shape: (675295, 78)
Movie Features Shape: (1, 78)
Content Scores Shape: (675295,)

Movies similar to 'Inception':
                    movie_title  content_score
637832           V for Vendetta       0.999943
653514  What Happened to Monday       0.999935
91794            Arabian Nights       0.999933
330753       Kong: Skull Island       0.999930
620859     The Towering Inferno       0.999925


In [None]:
def content_based_recommend_by_movie(movie_title, content_data, n_recommendations=5, debug=False):
    """Return the movies most cosine-similar to `movie_title`.

    Args:
        movie_title: Title to look up in ``content_data['movie_title']``.
        content_data: DataFrame with the identifier columns 'critic_name',
            'movie_id', 'movie_title' and feature columns; all remaining
            columns are handed to ``cosine_similarity`` and must be numeric.
        n_recommendations: Upper bound on the number of rows returned.
        debug: When True, print the shapes of the intermediate arrays.

    Returns:
        DataFrame with columns 'movie_id', 'content_score', 'movie_title';
        each movie_id appears at most once, highest content_score first.

    Raises:
        ValueError: If `movie_title` does not occur in content_data.
    """
    id_cols = ['critic_name', 'movie_id', 'movie_title']

    if debug:
        print(f"Initial Content Data Shape: {content_data.shape}")

    # Drop rows that are exact duplicates of each other
    content_data = content_data.drop_duplicates().reset_index(drop=True)
    if debug:
        print(f"After Dropping Duplicates: {content_data.shape}")

    # Query vector: the first row that carries the requested title
    title_rows = content_data[content_data['movie_title'] == movie_title]
    if title_rows.empty:
        raise ValueError(f"Movie '{movie_title}' not found in content_data.")

    # iloc[[0]] yields exactly one row and keeps the array 2-D
    movie_features = title_rows.iloc[[0]].drop(columns=id_cols).values
    content_features = content_data.drop(columns=id_cols).values

    if debug:
        print(f"Content Features Shape: {content_features.shape}")
        print(f"Movie Features Shape: {movie_features.shape}")

    # Cosine similarity of every row against the query row
    content_scores = cosine_similarity(content_features, movie_features).ravel()
    if debug:
        print(f"Content Scores Shape: {content_scores.shape}")

    content_recommendations = pd.DataFrame({
        'movie_id': content_data['movie_id'].values,
        'content_score': content_scores,
        'movie_title': content_data['movie_title'].values
    })

    # Leave out the queried movie
    content_recommendations = content_recommendations[content_recommendations['movie_title'] != movie_title]

    # Rank first, de-duplicate second: a movie_id may occur on many rows with
    # different scores, so de-duplicating before the sort would score a movie
    # by an arbitrary (first-seen) row rather than by its best row. A stable
    # sort keeps ties in their original row order.
    ranked = content_recommendations.sort_values(by='content_score', ascending=False, kind='mergesort')
    return ranked.drop_duplicates(subset=['movie_id']).head(n_recommendations)


In [None]:
def evaluate_content_based_by_movie(test_data, content_data, n_recommendations=5, threshold=0.05):
    """Score the content-based recommender against the review scores in test_data.

    For every movie title in test_data the recommender is queried, the returned
    content scores are looked up by the movie_id of that title's test rows
    (0 where no score is found), and the resulting values are compared with
    'review_score_cleaned'.

    NOTE(review): the recommender removes the queried title from its result,
    while the lookup below uses the movie_id values of that same title's rows.
    Unless a movie_id also occurs under a different title, every lookup misses
    and the prediction is 0 - confirm that this is the intended metric.

    Args:
        test_data: DataFrame with 'movie_title', 'movie_id' and
            'review_score_cleaned' columns.
        content_data: Feature table passed through to
            content_based_recommend_by_movie.
        n_recommendations: Number of recommendations requested per movie.
        threshold: Values strictly above it count as positive for
            precision / recall / F1.

    Returns:
        dict with the keys 'RMSE', 'MAE', 'Precision', 'Recall', 'F1 Score'.

    Raises:
        ValueError: If not a single movie could be evaluated.
    """
    all_true = []
    all_predicted = []
    skipped = 0

    # One pass over the frame instead of a full boolean-mask scan per title.
    # sort=False keeps first-appearance order; rows with a missing title are
    # left out, which matches the previous behaviour of skipping them.
    grouped = test_data.groupby('movie_title', sort=False)
    for movie_title, movie_rows in tqdm(grouped, total=grouped.ngroups, desc="Evaluating", leave=True):
        movie_actual = movie_rows[['movie_id', 'review_score_cleaned']].set_index('movie_id')
        if movie_actual.empty:
            continue

        try:
            recommendations = content_based_recommend_by_movie(movie_title, content_data, n_recommendations=n_recommendations)
        except ValueError:
            # Title not present in content_data: skip it, but keep count
            skipped += 1
            continue

        scores = recommendations.set_index('movie_id')['content_score']
        predicted_ratings = scores.reindex(movie_actual.index).fillna(0)
        all_true.extend(movie_actual['review_score_cleaned'])
        all_predicted.extend(predicted_ratings)

    if skipped:
        warnings.warn(f"{skipped} movie title(s) were skipped because the recommender raised ValueError.")
    if not all_true:
        raise ValueError("No movies could be evaluated: nothing to compute metrics on.")

    y_true = np.asarray(all_true)
    y_pred = np.asarray(all_predicted)

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    # Binarise once and reuse for all three classification metrics
    true_positive_mask = y_true > threshold
    pred_positive_mask = y_pred > threshold
    precision = precision_score(true_positive_mask, pred_positive_mask, average='macro', zero_division=1)
    recall = recall_score(true_positive_mask, pred_positive_mask, average='macro', zero_division=1)
    f1 = f1_score(true_positive_mask, pred_positive_mask, average='macro', zero_division=1)

    return {'RMSE': rmse, 'MAE': mae, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}


In [None]:

# Run the evaluation.
# `test` is not created by any of the cells above, so on a fresh kernel this
# cell used to fail with NameError. Build a reproducible hold-out split only
# when `test` does not exist yet; an already defined `test` is left untouched.
# NOTE(review): splitting `data` 80/20 with seed 42 is an assumption - replace
# it with the original split if that is defined elsewhere.
try:
    test
except NameError:
    _unused_train, test = train_test_split(data, test_size=0.2, random_state=42)

metrics = evaluate_content_based_by_movie(test, content_data, n_recommendations=5, threshold=0.05)

# Print the results
print("\nContent-Based Recommendation System Metrics:")
print(metrics)


Evaluating:  13%|█▎        | 1812/14488 [1:30:01<10:03:36,  2.86s/it]