In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

# Display configuration: do not cap the number of columns shown and allow
# up to 1000 characters per printed line before pandas wraps a frame.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [2]:
# Column labels assigned to the ratings file through `names=`.
ratings_cols = ['user_id', 'item_id', 'rating', 'timestamp']
# Read 'u.data' as tab-separated, latin-1 encoded text, labelling the columns with `ratings_cols`.
df_ratings = pd.read_csv('u.data', sep='\t', names=ratings_cols, encoding='latin-1')
display(df_ratings)

In [3]:
# Number of distinct user ids present in the ratings table.
df_ratings['user_id'].nunique()

In [4]:
# Columns: movie id | movie title | release date | video release date | IMDb URL |
#          unknown | Action | Adventure | Animation | Children's | Comedy | Crime |
#          Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery |
#          Romance | Sci-Fi | Thriller | War | Western
# The 19 genre flags get positional names genre_0 ... genre_18
# (presumably in the order listed above -- verify against the dataset README).
movies_cols = ['item_id', 'title', 're_date', 'vid_re_date', 'imdb_url'] + \
              [f'genre_{i}' for i in range(19)]

# Read 'u.item' as pipe-separated, latin-1 encoded text, labelling the columns with `movies_cols`.
df_movies = pd.read_csv('u.item', sep='|', names=movies_cols, encoding='latin-1')
display(df_movies)

In [5]:
# Number of distinct movie ids present in the movies table.
df_movies['item_id'].nunique()

In [6]:
# Column dtypes and non-null counts of the movies table.
df_movies.info()

In [7]:
# Pick the genre flag columns by name instead of by position. This cell appends
# 'genres_list' to df_movies, so a positional slice (columns[5:]) would also pick
# that column up when the cell is re-run and the integer cast below would fail.
genres = df_movies.columns[df_movies.columns.str.startswith('genre_')]
# Make sure the genre flags are stored as integers.
df_movies[genres] = df_movies[genres].astype(int)
# For every movie, collect the names of the genre columns whose flag equals 1.
df_movies['genres_list'] = df_movies[genres].apply(lambda x: [genre for genre, val in zip(genres, x) if val == 1], axis=1)

print("\nنمونه‌ای از فیلم‌ها با لیست ژانرها:")
print(df_movies[['item_id', 'title', 'genres_list']].head())

In [8]:
# Column labels of the movies table after 'genres_list' has been added.
df_movies.columns

In [9]:
# Genre flag columns only.
df_movies[genres]

In [10]:
# Attach the movie columns to every rating row through the shared 'item_id' key.
df_merged = df_ratings.merge(df_movies, on='item_id')

print("\nاطلاعات دیتافریم ترکیب شده (df_merged):")
print(df_merged.iloc[:1])
print("\nتعداد ردیف‌های دیتافریم ترکیب شده:", df_merged.shape[0])

In [11]:
# Users as rows, movie titles as columns, ratings as cell values.
# Cells with no rating are NaN.
user_movie_matrix = pd.pivot_table(df_merged, values='rating', index='user_id', columns='title')

print("نمونه‌ای از ماتریس تعامل کاربر-فیلم (User-Item Matrix):")
display(user_movie_matrix.iloc[:1])
print("\nابعاد ماتریس User-Item:", user_movie_matrix.shape)

In [12]:
# Content features of each movie: its genre flag columns.
movie_features = df_movies[genres]

print("نمونه‌ای از ویژگی‌های محتوایی فیلم‌ها (ژانرها):")
print(movie_features.head())

In [13]:
# Genre column names with the "_x" suffix appended to each.
temp = [item + "_x" for item in genres]

In [14]:
# Show the suffixed genre column names built in the previous cell.
print(temp)

In [15]:
def create_user_profile(user_id, ratings_df, movie_features_df, min_rating=4, genre_cols=None):
    """Build a genre-preference profile for one user.

    The profile is the mean of every genre flag over the movies the user rated
    at or above ``min_rating``.

    Parameters
    ----------
    user_id
        Value matched against ``ratings_df['user_id']``.
    ratings_df : pandas.DataFrame
        Needs the columns 'user_id', 'item_id' and 'rating'. Any other columns
        (for example the genre columns of a ratings/movies merge) are ignored.
    movie_features_df : pandas.DataFrame
        Needs an 'item_id' column plus the genre columns.
    min_rating : number, default 4
        Ratings greater than or equal to this value count as liked.
    genre_cols : sequence of str, optional
        Genre column names to average. Defaults to the notebook-level ``genres``.

    Returns
    -------
    pandas.Series
        Indexed by genre column. All zeros when the user has no rating that
        reaches ``min_rating``.
    """
    feature_cols = list(genres if genre_cols is None else genre_cols)

    # Keep only the item ids of this user's liked movies.
    liked_mask = (ratings_df['user_id'] == user_id) & (ratings_df['rating'] >= min_rating)
    liked_items = ratings_df.loc[liked_mask, ['item_id']]

    if liked_items.empty:
        # No highly rated movies: return a profile of zeros.
        return pd.Series(0, index=movie_features_df[feature_cols].columns)

    # Merge only 'item_id' + genre columns. If both frames carried the genre
    # columns, pandas would rename them with _x/_y suffixes and selecting the
    # genre columns afterwards would raise KeyError.
    liked_features = pd.merge(
        liked_items,
        movie_features_df[['item_id'] + feature_cols],
        on='item_id',
    )
    return liked_features[feature_cols].mean()

In [16]:
# Build profiles for the first two distinct user ids that appear in the ratings table.
sample_user_ids = df_ratings['user_id'].unique()[:2]
user_profiles = {
    user_id: create_user_profile(user_id, df_ratings, df_movies)
    for user_id in sample_user_ids
}

In [17]:
# Print each sampled user's genre profile.
print("\nنمونه‌ای از پروفایل‌های کاربران:")
for user_id, profile in user_profiles.items():
    print(f"\nپروفایل کاربر {user_id}:")
    print(profile)

In [18]:
# Genre flags indexed by item_id, then pairwise cosine similarity between movies.
movie_features_indexed = df_movies.set_index('item_id')[genres]
movie_similarity_matrix = cosine_similarity(movie_features_indexed)

# Label both axes of the similarity matrix with item ids.
movie_similarity_df = pd.DataFrame(movie_similarity_matrix, index=movie_features_indexed.index, columns=movie_features_indexed.index)

print("\nنمونه‌ای از ماتریس شباهت فیلم‌ها:")
print(movie_similarity_df.iloc[:5, :5])

In [19]:
# Genre column labels used throughout the notebook.
genres

In [20]:
def get_content_based_recommendations(user_id, num_recommendations=10):
    """Recommend unrated movies whose genres best match the user's genre profile.

    Reads the notebook-level df_ratings, df_movies, df_merged and genres.

    Parameters
    ----------
    user_id
        User to recommend for.
    num_recommendations : int, default 10
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by item_id with columns 'title' and 'content_score', sorted by
        score in descending order. Empty (columns only) when there is no
        candidate movie or the profile is entirely NaN.
    """
    profile = create_user_profile(user_id, df_ratings, df_movies)

    # Candidates are the movies this user has no rating for.
    seen_ids = df_merged.loc[df_merged['user_id'] == user_id, 'item_id'].tolist()
    candidates = df_movies[~df_movies['item_id'].isin(seen_ids)]
    candidate_features = candidates.set_index('item_id')[genres]

    if candidate_features.empty or profile.isnull().all():
        return pd.DataFrame(columns=['title', 'content_score'])

    # Dot product of each candidate's genre flags (M x N) with the profile (N):
    # one score per candidate movie.
    scores = pd.Series(
        np.dot(candidate_features, profile).flatten(),
        index=candidate_features.index,
    )
    best = scores.sort_values(ascending=False).head(num_recommendations)

    # Attach titles; the score column aligns on the item_id index.
    result = df_movies.loc[df_movies['item_id'].isin(best.index), ['item_id', 'title']]
    result = result.set_index('item_id')
    result['content_score'] = best

    return result.sort_values(by='content_score', ascending=False)

In [21]:
# np.random.randint(943) draws integers from 0..942, which need not match the
# ids present in the data (an id of 0 may not exist and the largest id can
# never be drawn). Sample from the ids that actually occur instead.
candidate_user_ids = df_ratings['user_id'].unique()
for i in range(5):
    user_id_to_test = np.random.choice(candidate_user_ids)
    content_based_recs = get_content_based_recommendations(user_id_to_test)
    print(f"\nپیشنهادهای محتوا-محور برای کاربر {user_id_to_test}:")
    print(content_based_recs)
    print("--------------")

In [22]:
# User-Item matrix normalization: subtract each user's mean rating from that
# user's ratings. Rows are users, so the function must be applied along axis=1;
# DataFrame.apply defaults to axis=0, which would subtract per-movie means.
user_movie_matrix_normalized = user_movie_matrix.apply(lambda x: x - x.mean(), axis=1)

# Replace missing ratings with 0 so they contribute nothing to the dot products
# used by cosine similarity.
user_movie_matrix_normalized_filled = user_movie_matrix_normalized.fillna(0)

# User-to-user cosine similarity on the mean-centred ratings.
user_similarity_matrix = cosine_similarity(user_movie_matrix_normalized_filled)

user_similarity_df = pd.DataFrame(user_similarity_matrix,
                                  index=user_movie_matrix.index,
                                  columns=user_movie_matrix.index)

print("\nنمونه‌ای از ماتریس شباهت کاربران:")
print(user_similarity_df.iloc[:5, :5])

In [23]:
def predict_collaborative_rating(user_id, movie_title, user_movie_matrix, user_similarity_df, k=20):
    """Predict a user's rating for a movie from the k most similar users.

    The prediction is the target user's mean rating plus the similarity-weighted
    average of the neighbours' deviations from their own mean ratings, clipped
    to the 1-5 range.

    Parameters
    ----------
    user_id
        Label of the target user in both frames.
    movie_title
        Column label in ``user_movie_matrix``.
    user_movie_matrix : pandas.DataFrame
        Users x titles rating matrix with NaN for missing ratings.
    user_similarity_df : pandas.DataFrame
        Square user x user similarity matrix.
    k : int, default 20
        Number of most similar users to consider.

    Returns
    -------
    number
        0 when ``movie_title`` is not a column of the matrix; the target user's
        mean rating when none of the k neighbours contributes any weight;
        otherwise the clipped prediction.
    """
    if movie_title not in user_movie_matrix.columns:
        return 0

    # The k most similar users, excluding the target user.
    neighbours = (
        user_similarity_df[user_id]
        .sort_values(ascending=False)
        .drop(user_id)
        .head(k)
    )

    target_mean = user_movie_matrix.loc[user_id].mean()

    weighted_deviation_sum = 0
    weight_sum = 0
    for neighbour_id, similarity in neighbours.items():
        neighbour_rating = user_movie_matrix.loc[neighbour_id, movie_title]
        if pd.isna(neighbour_rating):
            # This neighbour has not rated the movie.
            continue
        neighbour_mean = user_movie_matrix.loc[neighbour_id].mean()
        weighted_deviation_sum += similarity * (neighbour_rating - neighbour_mean)
        weight_sum += abs(similarity)

    if weight_sum == 0:
        # No usable neighbour rating: fall back to the target user's mean.
        return target_mean

    estimate = target_mean + (weighted_deviation_sum / weight_sum)

    # Keep the prediction inside the 1-5 rating scale.
    return max(1, min(5, estimate))

In [24]:
# Draw test users from the ids present in the rating matrix.
# np.random.randint(943) yields 0..942, and an id missing from
# user_similarity_df raises KeyError inside predict_collaborative_rating.
for i in range(5):
    user_id_test_cf = np.random.choice(user_movie_matrix.index)
    movie_title_test_cf = 'Toy Story (1995)'
    predicted_rating_cf = predict_collaborative_rating(user_id_test_cf, movie_title_test_cf, user_movie_matrix, user_similarity_df)
    print(f"\nرتبه پیش‌بینی شده مشارکتی برای کاربر {user_id_test_cf} و فیلم '{movie_title_test_cf}': {predicted_rating_cf:.2f}")

In [25]:
def get_collaborative_recommendations(user_id, num_recommendations=10):
    """Recommend unrated movies with the highest predicted collaborative rating.

    Reads the notebook-level df_merged, user_movie_matrix and user_similarity_df.

    Parameters
    ----------
    user_id
        User to recommend for.
    num_recommendations : int, default 10
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'title' and 'collaborative_score', sorted by score in
        descending order. Empty (columns only) when the user has rated every
        title in the matrix.
    """
    # Titles the user has already rated. A set keeps each membership test O(1)
    # instead of scanning a list once per candidate title.
    watched_movies = set(df_merged.loc[df_merged['user_id'] == user_id, 'title'])

    # Titles the user has not rated yet.
    unwatched_movies_titles = [
        movie for movie in user_movie_matrix.columns.tolist()
        if movie not in watched_movies
    ]

    if not unwatched_movies_titles:
        return pd.DataFrame(columns=['title', 'collaborative_score'])

    collaborative_scores = {
        movie_title: predict_collaborative_rating(user_id, movie_title, user_movie_matrix, user_similarity_df)
        for movie_title in unwatched_movies_titles
    }

    # Convert to a DataFrame and sort.
    collaborative_recs_df = pd.DataFrame(list(collaborative_scores.items()), columns=['title', 'collaborative_score'])

    return collaborative_recs_df.sort_values(by='collaborative_score', ascending=False).head(num_recommendations)

In [26]:
# Draw test users from the ids present in the rating matrix.
# np.random.randint(943) yields 0..942, and an id missing from
# user_similarity_df raises KeyError inside predict_collaborative_rating.
for i in range(5):
    user_id_to_test_cf_recs = np.random.choice(user_movie_matrix.index)
    collaborative_recs = get_collaborative_recommendations(user_id_to_test_cf_recs)
    print(f"\nپیشنهادهای مشارکتی برای کاربر {user_id_to_test_cf_recs}:")
    print(collaborative_recs)

In [27]:
def get_hybrid_recommendations(user_id, alpha=0.5, num_recommendations=10):
    """Rank a user's unrated movies by a blend of collaborative and content scores.

    final_score = alpha * collaborative_score + (1 - alpha) * content_score.
    The content score is the cosine similarity between the user's genre profile
    and each movie's genre flags, min-max rescaled to 1-5. The collaborative
    score comes from predict_collaborative_rating.

    Reads the notebook-level df_merged, df_ratings, df_movies, genres,
    user_movie_matrix and user_similarity_df.

    Parameters
    ----------
    user_id
        User to recommend for.
    alpha : float, default 0.5
        Weight of the collaborative score; the content score gets 1 - alpha.
    num_recommendations : int, default 10
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by item_id with columns 'title' and 'final_score', sorted by
        score in descending order. Empty (columns only) when the user has
        rated every movie.
    """
    # 1. Content-based scores for the movies the user has not rated.
    watched_movies_ids = df_merged[df_merged['user_id'] == user_id]['item_id'].tolist()
    unwatched_movies_df = df_movies[~df_movies['item_id'].isin(watched_movies_ids)]
    unwatched_movie_features = unwatched_movies_df.set_index('item_id')[genres]

    if unwatched_movie_features.empty:
        return pd.DataFrame(columns=['title', 'final_score'])

    # Build the profile from the plain ratings table: only user_id, item_id and
    # rating are needed, and passing the merged frame risks genre-column name
    # clashes when the profile builder merges with df_movies.
    user_profile = create_user_profile(user_id, df_ratings, df_movies)

    if user_profile.isnull().all():
        content_scores_series = pd.Series(0, index=unwatched_movie_features.index)
    else:
        content_scores = cosine_similarity(user_profile.values.reshape(1, -1), unwatched_movie_features)
        content_scores_series = pd.Series(content_scores.flatten(), index=unwatched_movie_features.index)

    # Rescale the content scores to the 1-5 range; a constant series maps to 3.
    min_score = content_scores_series.min()
    max_score = content_scores_series.max()
    if (max_score - min_score) != 0:
        content_scores_normalized = 1 + 4 * (content_scores_series - min_score) / (max_score - min_score)
    else:
        content_scores_normalized = pd.Series(3, index=content_scores_series.index)

    # 2. Collaborative scores.
    # Resolve item_id -> title once instead of filtering df_movies per candidate.
    # drop_duplicates keeps the first row per item_id, like the former .iloc[0].
    title_by_item_id = df_movies.drop_duplicates('item_id').set_index('item_id')['title']
    collaborative_scores = {
        item_id: predict_collaborative_rating(
            user_id, title_by_item_id.loc[item_id], user_movie_matrix, user_similarity_df
        )
        for item_id in unwatched_movie_features.index.tolist()
    }
    collaborative_scores_series = pd.Series(collaborative_scores)

    # Restrict both series to the same item ids (unrated movies only).
    common_indices = unwatched_movie_features.index.intersection(collaborative_scores_series.index)

    content_scores_final = content_scores_normalized[common_indices]
    collaborative_scores_final = collaborative_scores_series[common_indices]

    # 3. Blend the two scores.
    final_scores = alpha * collaborative_scores_final + (1 - alpha) * content_scores_final

    recommended_movies_info = df_movies[df_movies['item_id'].isin(final_scores.index)][['item_id', 'title']]
    recommended_movies_info = recommended_movies_info.set_index('item_id')
    recommended_movies_info['final_score'] = final_scores

    return recommended_movies_info.sort_values(by='final_score', ascending=False).head(num_recommendations)

In [28]:
# Hybrid recommendations for one user at three blend weights
# (alpha is the weight of the collaborative score).
user_id_to_test_hybrid = 1
num_recommendations = 10

hybrid_recs_01 = get_hybrid_recommendations(user_id_to_test_hybrid, alpha=0.1, num_recommendations=num_recommendations)
hybrid_recs_05 = get_hybrid_recommendations(user_id_to_test_hybrid, alpha=0.5, num_recommendations=num_recommendations)
# Call get_hybrid_recommendations here as well: no function named
# get_hybrid_recs is defined in this notebook.
hybrid_recs_09 = get_hybrid_recommendations(user_id_to_test_hybrid, alpha=0.9, num_recommendations=num_recommendations)

# One title column and one score column per alpha, side by side.
submission = pd.DataFrame({
    'movie_0_1': hybrid_recs_01['title'].values,
    'score_0_1': hybrid_recs_01['final_score'].values,
    'movie_0_5': hybrid_recs_05['title'].values,
    'score_0_5': hybrid_recs_05['final_score'].values,
    'movie_0_9': hybrid_recs_09['title'].values,
    'score_0_9': hybrid_recs_09['final_score'].values,
})

display(submission)

In [29]:
def get_hybrid_recommendations(user_id, alpha=0.5, num_recommendations=10):
    """Rank a user's unrated movies by a blend of collaborative and content scores.

    final_score = alpha * collaborative_score + (1 - alpha) * content_score.
    The content score is the cosine similarity between the user's genre profile
    and each movie's genre flags, min-max rescaled to 1-5. The collaborative
    score comes from predict_collaborative_rating.

    Reads the notebook-level df_merged, df_ratings, df_movies, genres,
    user_movie_matrix and user_similarity_df.

    Parameters
    ----------
    user_id
        User to recommend for.
    alpha : float, default 0.5
        Weight of the collaborative score; the content score gets 1 - alpha.
    num_recommendations : int, default 10
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by item_id with columns 'title' and 'final_score', sorted by
        score in descending order. Empty (columns only) when the user has
        rated every movie.
    """
    # 1. Content-based scores for the movies the user has not rated.
    watched_movies_ids = df_merged[df_merged['user_id'] == user_id]['item_id'].tolist()
    unwatched_movies_df = df_movies[~df_movies['item_id'].isin(watched_movies_ids)]
    unwatched_movie_features = unwatched_movies_df.set_index('item_id')[genres]

    if unwatched_movie_features.empty:
        return pd.DataFrame(columns=['title', 'final_score'])

    user_profile = create_user_profile(user_id, df_ratings, df_movies)

    if user_profile.isnull().all():
        content_scores_series = pd.Series(0, index=unwatched_movie_features.index)
    else:
        content_scores = cosine_similarity(user_profile.values.reshape(1, -1), unwatched_movie_features)
        content_scores_series = pd.Series(content_scores.flatten(), index=unwatched_movie_features.index)

    # Rescale the content scores to the 1-5 range; a constant series maps to 3.
    min_score = content_scores_series.min()
    max_score = content_scores_series.max()
    if (max_score - min_score) != 0:
        content_scores_normalized = 1 + 4 * (content_scores_series - min_score) / (max_score - min_score)
    else:
        content_scores_normalized = pd.Series(3, index=content_scores_series.index)

    # 2. Collaborative scores.
    # Resolve item_id -> title once instead of filtering df_movies per candidate.
    # drop_duplicates keeps the first row per item_id, like the former .iloc[0].
    title_by_item_id = df_movies.drop_duplicates('item_id').set_index('item_id')['title']
    collaborative_scores = {
        item_id: predict_collaborative_rating(
            user_id, title_by_item_id.loc[item_id], user_movie_matrix, user_similarity_df
        )
        for item_id in unwatched_movie_features.index.tolist()
    }
    collaborative_scores_series = pd.Series(collaborative_scores)

    # Restrict both series to the same item ids (unrated movies only).
    common_indices = unwatched_movie_features.index.intersection(collaborative_scores_series.index)

    content_scores_final = content_scores_normalized[common_indices]
    collaborative_scores_final = collaborative_scores_series[common_indices]

    # 3. Blend the two scores.
    final_scores = alpha * collaborative_scores_final + (1 - alpha) * content_scores_final

    recommended_movies_info = df_movies[df_movies['item_id'].isin(final_scores.index)][['item_id', 'title']]
    recommended_movies_info = recommended_movies_info.set_index('item_id')
    recommended_movies_info['final_score'] = final_scores

    return recommended_movies_info.sort_values(by='final_score', ascending=False).head(num_recommendations)

In [30]:
# Hybrid recommendations for one user at three blend weights
# (alpha is the weight of the collaborative score).
user_id_to_test_hybrid = 1
num_recommendations = 10

hybrid_recs_01 = get_hybrid_recommendations(user_id_to_test_hybrid, alpha=0.1, num_recommendations=num_recommendations)
hybrid_recs_05 = get_hybrid_recommendations(user_id_to_test_hybrid, alpha=0.5, num_recommendations=num_recommendations)
# Call get_hybrid_recommendations here as well: no function named
# get_hybrid_recs is defined in this notebook.
hybrid_recs_09 = get_hybrid_recommendations(user_id_to_test_hybrid, alpha=0.9, num_recommendations=num_recommendations)

# One title column and one score column per alpha, side by side.
submission = pd.DataFrame({
    'movie_0_1': hybrid_recs_01['title'].values,
    'score_0_1': hybrid_recs_01['final_score'].values,
    'movie_0_5': hybrid_recs_05['title'].values,
    'score_0_5': hybrid_recs_05['final_score'].values,
    'movie_0_9': hybrid_recs_09['title'].values,
    'score_0_9': hybrid_recs_09['final_score'].values,
})

display(submission)

In [31]:
# Hybrid recommendations for one user at three blend weights
# (alpha is the weight of the collaborative score).
user_id_to_test_hybrid = 1
num_recommendations = 10

hybrid_recs_01, hybrid_recs_05, hybrid_recs_09 = (
    get_hybrid_recommendations(
        user_id_to_test_hybrid,
        alpha=alpha_weight,
        num_recommendations=num_recommendations,
    )
    for alpha_weight in (0.1, 0.5, 0.9)
)

# One title column and one score column per alpha, in the order 0.1, 0.5, 0.9.
submission_columns = {}
for column_tag, recs_for_alpha in (
    ('0_1', hybrid_recs_01),
    ('0_5', hybrid_recs_05),
    ('0_9', hybrid_recs_09),
):
    submission_columns[f'movie_{column_tag}'] = recs_for_alpha['title'].values
    submission_columns[f'score_{column_tag}'] = recs_for_alpha['final_score'].values

submission = pd.DataFrame(submission_columns)

display(submission)

In [32]:
# Hold out 20% of the merged rating rows, with a fixed seed so the split repeats.
train_data, test_data = train_test_split(df_merged, test_size=0.2, random_state=42)

print("\nابعاد مجموعه آموزش:", train_data.shape)
print("ابعاد مجموعه آزمون:", test_data.shape)

# Users x titles rating matrix built from the training rows only.
train_user_movie_matrix = pd.pivot_table(train_data, values='rating', index='user_id', columns='title')
# Centre every user's row on that user's own mean rating, then fill missing ratings with 0.
train_user_movie_matrix_normalized = train_user_movie_matrix.apply(lambda row: row - row.mean(), axis=1)
train_user_movie_matrix_normalized_filled = train_user_movie_matrix_normalized.fillna(0)

# User-to-user cosine similarity on the training matrix, labelled with user ids.
train_user_similarity_matrix = cosine_similarity(train_user_movie_matrix_normalized_filled)
train_user_similarity_df = pd.DataFrame(
    train_user_similarity_matrix,
    index=train_user_movie_matrix.index,
    columns=train_user_movie_matrix.index,
)

print("\nابعاد ماتریس User-Item آموزش:", train_user_movie_matrix.shape)
print("ابعاد ماتریس شباهت کاربران آموزش:", train_user_similarity_df.shape)

In [33]:
# (rows, columns) of the movies table.
df_movies.shape

In [34]:
# Inspect the movies table.
df_movies

In [35]:
# Inspect the training split.
train_data

In [36]:
def evaluate_recommender(recommender_func, test_df, k=10, min_rating_threshold=4):
    """Average per-user precision of a recommender against held-out ratings.

    For each user in ``test_df``, the relevant items are the item ids that user
    rated at or above ``min_rating_threshold``. Precision is the share of the
    recommended item ids (the index of the frame returned by
    ``recommender_func``) that are relevant.

    Parameters
    ----------
    recommender_func : callable
        Called as ``recommender_func(user_id, num_recommendations=k)``; must
        return a DataFrame indexed by item id.
    test_df : pandas.DataFrame
        Needs the columns 'user_id', 'item_id' and 'rating'.
    k : int, default 10
        Number of recommendations requested per user.
    min_rating_threshold : number, default 4
        Minimum rating for a test item to count as relevant.

    Returns
    -------
    float
        Mean precision over the users that have at least one relevant item and
        at least one recommendation; 0.0 when there is no such user.
    """
    per_user_precision = []

    for uid in test_df['user_id'].unique():
        user_rows = test_df[test_df['user_id'] == uid]
        relevant = user_rows.loc[user_rows['rating'] >= min_rating_threshold, 'item_id'].tolist()
        if len(relevant) == 0:
            # Nothing to hit for this user.
            continue

        recommended = recommender_func(uid, num_recommendations=k).index.tolist()
        if len(recommended) == 0:
            continue

        hit_count = sum(1 for rec_id in recommended if rec_id in relevant)
        per_user_precision.append(hit_count / len(recommended))

    return np.mean(per_user_precision) if per_user_precision else 0.0

def hybrid_recommender_for_eval(user_id, num_recommendations):
    """Adapter for evaluate_recommender: hybrid recommendations with alpha fixed at 0.5."""
    return get_hybrid_recommendations(user_id, alpha=0.5, num_recommendations=num_recommendations)

# Precision@10 of the hybrid recommender at alpha=0.5.
# NOTE(review): get_hybrid_recommendations reads the notebook-level df_merged,
# user_movie_matrix and user_similarity_df (built from all ratings) and excludes
# every movie the user rated in df_merged. test_data is a subset of df_merged,
# so held-out items cannot appear among the recommendations, and the train_*
# matrices are not used here -- confirm whether this is intended.
mean_precision = evaluate_recommender(hybrid_recommender_for_eval, test_data, k=10)
print(f"\nمیانگین Precision@10 برای سیستم ترکیبی: {mean_precision:.4f}")

# Repeat the evaluation for several blend weights.
alphas_to_test = [0.1, 0.3, 0.5, 0.7, 0.9]
precision_results = []

for alpha_val in alphas_to_test:
    # The closure reads alpha_val when it is called, which happens inside this
    # same iteration, so each evaluation uses the current alpha.
    def recommender_alpha_eval(user_id, num_recommendations):
        return get_hybrid_recommendations(user_id, alpha=alpha_val, num_recommendations=num_recommendations)

    precision_val = evaluate_recommender(recommender_alpha_eval, test_data, k=10)
    precision_results.append(precision_val)
    print(f"Precision@10 برای آلفا = {alpha_val}: {precision_val:.4f}")

In [37]:
from tqdm import tqdm

def evaluate_recommender(recommender_func, test_df, k=10, min_rating_threshold=4):
    """Average per-user precision of a recommender, with a tqdm progress bar.

    For each user in ``test_df``, the relevant items are the item ids that user
    rated at or above ``min_rating_threshold``. Precision is the share of the
    recommended item ids (the index of the frame returned by
    ``recommender_func``) that are relevant.

    Parameters
    ----------
    recommender_func : callable
        Called as ``recommender_func(user_id, num_recommendations=k)``; must
        return a DataFrame indexed by item id.
    test_df : pandas.DataFrame
        Needs the columns 'user_id', 'item_id' and 'rating'.
    k : int, default 10
        Number of recommendations requested per user.
    min_rating_threshold : number, default 4
        Minimum rating for a test item to count as relevant.

    Returns
    -------
    float
        Mean precision over the users that have at least one relevant item and
        at least one recommendation; 0.0 when there is no such user.
    """
    per_user_precision = []

    for uid in tqdm(test_df['user_id'].unique(), desc="Evaluating users"):
        user_rows = test_df[test_df['user_id'] == uid]
        relevant = user_rows.loc[user_rows['rating'] >= min_rating_threshold, 'item_id'].tolist()
        if len(relevant) == 0:
            # Nothing to hit for this user.
            continue

        recommended = recommender_func(uid, num_recommendations=k).index.tolist()
        if len(recommended) == 0:
            continue

        hit_count = sum(1 for rec_id in recommended if rec_id in relevant)
        per_user_precision.append(hit_count / len(recommended))

    return np.mean(per_user_precision) if per_user_precision else 0.0

def hybrid_recommender_for_eval(user_id, num_recommendations):
    """Adapter for evaluate_recommender: hybrid recommendations with alpha fixed at 0.5."""
    return get_hybrid_recommendations(user_id, alpha=0.5, num_recommendations=num_recommendations)

# Precision@10 of the hybrid recommender at alpha=0.5.
# NOTE(review): get_hybrid_recommendations reads the notebook-level df_merged,
# user_movie_matrix and user_similarity_df (built from all ratings) and excludes
# every movie the user rated in df_merged. test_data is a subset of df_merged,
# so held-out items cannot appear among the recommendations, and the train_*
# matrices are not used here -- confirm whether this is intended.
mean_precision = evaluate_recommender(hybrid_recommender_for_eval, test_data, k=10)
print(f"\nمیانگین Precision@10 برای سیستم ترکیبی: {mean_precision:.4f}")

# Repeat the evaluation for several blend weights.
alphas_to_test = [0.1, 0.3, 0.5, 0.7, 0.9]
precision_results = []

for alpha_val in alphas_to_test:
    # The closure reads alpha_val when it is called, which happens inside this
    # same iteration, so each evaluation uses the current alpha.
    def recommender_alpha_eval(user_id, num_recommendations):
        return get_hybrid_recommendations(user_id, alpha=alpha_val, num_recommendations=num_recommendations)

    precision_val = evaluate_recommender(recommender_alpha_eval, test_data, k=10)
    precision_results.append(precision_val)
    print(f"Precision@10 برای آلفا = {alpha_val}: {precision_val:.4f}")

In [38]:
import zipfile
import joblib

def compress(file_names, output_path="result.zip"):
    """Write the given files into a DEFLATE-compressed zip archive.

    Parameters
    ----------
    file_names : iterable of str
        Paths relative to the current working directory. Each file is stored
        in the archive under the same relative name.
    output_path : str, default "result.zip"
        Archive to create; an existing file at this path is overwritten.

    Raises
    ------
    FileNotFoundError
        If one of the listed files does not exist.
    """
    print("File Paths:")
    print(file_names)
    # Setting the compression on the archive applies it to every member written.
    with zipfile.ZipFile(output_path, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
        for file_name in file_names:
            zf.write('./' + file_name, file_name)

# Save the recommendations table without its index column.
submission.to_csv('submission.csv', index=False)

# Bundle the CSV and the notebook file into result.zip; both must exist in the
# working directory when compress() runs.
file_names = [ 'submission.csv', 'FilmLover_Saver.ipynb']
compress(file_names)

In [39]:
import zipfile
import joblib

if not os.path.exists(os.path.join(os.getcwd(), 'FilmLover_Saver.ipynb')):
    %notebook -e FilmLover_Saver.ipynb

def compress(file_names, output_path="result.zip"):
    """Write the given files into a DEFLATE-compressed zip archive.

    Parameters
    ----------
    file_names : iterable of str
        Paths relative to the current working directory. Each file is stored
        in the archive under the same relative name.
    output_path : str, default "result.zip"
        Archive to create; an existing file at this path is overwritten.

    Raises
    ------
    FileNotFoundError
        If one of the listed files does not exist.
    """
    print("File Paths:")
    print(file_names)
    # Setting the compression on the archive applies it to every member written.
    with zipfile.ZipFile(output_path, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
        for file_name in file_names:
            zf.write('./' + file_name, file_name)

# Save the recommendations table without its index column.
submission.to_csv('submission.csv', index=False)

# No %notebook call here: the magic requires a filename argument, so a bare
# `%notebook` stops the cell with a usage error before the archive is written.

# Bundle the CSV and the notebook file into result.zip; both must exist in the
# working directory when compress() runs.
file_names = [ 'submission.csv', 'FilmLover_Saver.ipynb']
compress(file_names)