# Memory-based Collaborative Filtering

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

In [2]:
def recall5(answer_df, submission_df):
    """
    Calculate recall@5 for given dataframes.

    The first column of ``answer_df`` is used as the primary key and its
    second column as the item to predict; the columns with the same names are
    read from ``submission_df``.

    Parameters:
    - answer_df: DataFrame containing the ground truth
    - submission_df: DataFrame containing the predictions

    Returns:
    - recall: Recall@5 value, i.e. the mean of the per-key recalls over the
      keys of answer_df that also appear in submission_df. NaN when there is
      no such key.

    Raises:
    - ValueError: if some primary key in submission_df does not have exactly
      5 predictions, if a prediction is NULL, or if a primary key has
      duplicated predictions.
    """

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    predictions_by_key = submission_df.groupby(primary_col)[secondary_col]

    # Check if each primary_col entry has exactly 5 secondary_col predictions
    prediction_counts = predictions_by_key.size()
    if not (prediction_counts == 5).all():
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")

    # Check for NULL values in the predicted secondary_col
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # Check for duplicates in the predicted secondary_col for each primary_col.
    # NULLs were ruled out above, so a group holds duplicates exactly when it
    # has fewer distinct values than rows.
    if (predictions_by_key.nunique() != prediction_counts).any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")

    # Filter the submission dataframe based on the primary_col present in the answer dataframe
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # For each primary_col, collect the first 5 predicted secondary_col values
    top_5_preds = {
        key: set(preds.head(5))
        for key, preds in submission_df.groupby(primary_col)[secondary_col]
    }

    # Distinct ground-truth items per primary_col. Using a set keeps repeated
    # ground-truth rows from inflating the denominator below, which would make
    # a perfect recall unreachable for that key.
    relevant_by_key = {
        key: set(items)
        for key, items in answer_df.groupby(primary_col)[secondary_col]
    }

    # To keep the evaluation fair, the denominator (k) is capped at 5: a key
    # with more than 5 relevant items can still reach a recall of 1.
    individual_recalls = [
        len(relevant & top_5_preds[key]) / min(len(relevant), 5)
        for key, relevant in relevant_by_key.items()
        if key in top_5_preds
    ]

    if not individual_recalls:
        # Same value np.mean([]) would yield, without the RuntimeWarning.
        return np.nan

    recall = np.mean(individual_recalls)

    return recall

In [3]:
# Load the view log, the article metadata and the sample submission.
view_log_train = pd.read_csv('data/view_log.csv')
article_info = pd.read_csv('data/article_info.csv')
submission = pd.read_csv('data/sample_submission.csv')

# Build the user-article matrix: one row per userID, one column per articleID,
# each cell holding how many view-log rows exist for that pair (0 if none).
pair_counts = view_log_train.groupby(['userID', 'articleID']).size()
user_article_matrix = pair_counts.unstack(fill_value=0)

# Cosine similarity between articles (columns) and between users (rows).
item_similarity = cosine_similarity(user_article_matrix.T)
user_similarity = cosine_similarity(user_article_matrix)

# Recommendation scores: the item-based scores weight each user's views by
# article similarity, the user-based scores weight all users' views by user
# similarity.
item_predicted_scores = user_article_matrix.dot(item_similarity)
user_predicted_scores = user_similarity.dot(user_article_matrix)

# Blend the two score sets; alpha is the weight of the user-based scores.
alpha = 0.3
predicted_scores = (1 - alpha) * item_predicted_scores + alpha * user_predicted_scores

# The item-based product carries positional column labels, so restore the
# articleID labels on the blended result.
predicted_scores.columns = user_article_matrix.columns

# Helper that extracts the five most-viewed articles from one user's view log
def top5_article(df):
    """Return the view counts of the (at most) five most frequent articleIDs.

    Parameters:
    - df: DataFrame with an ``articleID`` column, one row per view

    Returns:
    - Series indexed by articleID holding the number of rows per article,
      sorted from most to least frequent and truncated to five entries
    """
    article_counts = df.value_counts(subset="articleID", ascending=False)
    return article_counts.head(5)

# Per-user view counts, truncated by top5_article to each user's five
# most-viewed articles.
df_freq = view_log_train.groupby("userID").apply(top5_article).reset_index()

# Pad every user up to five rows. A user with fewer than five rows here has
# viewed fewer than five distinct articles, so the missing slots are filled
# with the not-yet-viewed articles that have the highest predicted scores.
new_rows = []

for userID, group in df_freq.groupby('userID'):
    missing = 5 - len(group)
    if missing <= 0:
        # Already five rows: skip the per-user score ranking entirely.
        continue

    scores = predicted_scores.loc[userID]  # predicted score of every article for this user
    # Drop the articles the user already viewed while keeping the original
    # column order (a set difference would make the order hash-dependent).
    unseen = scores[~scores.index.isin(group['articleID'])]
    # A stable sort on top of a fixed input order breaks score ties the same
    # way on every run, so the generated submission is reproducible.
    ranked = unseen.sort_values(ascending=False, kind='stable')

    # If fewer than `missing` unseen articles exist, the user ends up with
    # fewer than five rows.
    new_rows.extend(
        {'userID': userID, 'articleID': article_id, 'count': None}
        for article_id in ranked.index[:missing]
    )

# Append the padding rows to the frequency-based rows.
df_freq = pd.concat([df_freq, pd.DataFrame(new_rows)], axis=0)

# Group the rows by user; the stable sort keeps each user's frequency-ranked
# rows ahead of the padding rows.
df_freq = df_freq.sort_values(by=['userID'], ignore_index=True, kind='stable')

submission = df_freq[["userID", "articleID"]].reset_index(drop=True)
submission.to_csv(f'CF_alpha{alpha}.csv', index=False)

submission

Unnamed: 0,userID,articleID
0,USER_0000,ARTICLE_2255
1,USER_0000,ARTICLE_0411
2,USER_0000,ARTICLE_2834
3,USER_0000,ARTICLE_1033
4,USER_0000,ARTICLE_2316
...,...,...
7070,USER_1420,ARTICLE_0030
7071,USER_1420,ARTICLE_0714
7072,USER_1420,ARTICLE_0614
7073,USER_1420,ARTICLE_1848
