In [1]:
import pandas as pd
import numpy as np
import random 
from random import randint

In [2]:
# Load the training ratings, the movie catalogue and the user table from the
# ml-100k directory, then derive per-movie and per-gender mean ratings.
# Paths use forward slashes so they resolve on Windows, Linux and macOS alike;
# the earlier ".\\ml-100k\\..." spelling is only a valid path on Windows.
r_cols = ["user_id", "movie_id", "rating", "unix_timestamp"]
ratings = pd.read_csv("./ml-100k/u1.base", sep='\t', names=r_cols, encoding='latin-1')
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
# Only the first five columns of u.item are read (usecols=range(5)).
movies = pd.read_csv("./ml-100k/u.item", sep='|', names=m_cols, usecols=range(5), encoding='latin-1')
u_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('./ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')

# One row per rating, enriched with the movie's catalogue columns; the inner
# join drops ratings whose movie_id is missing from the catalogue.
movie_ratings = pd.merge(movies, ratings, on='movie_id', how='inner')

# Mean training rating per movie, then the same table ordered best-first.
movie_stats = movie_ratings.groupby('movie_id', as_index=False)['rating'].mean()
ratings_sorted = movie_stats.sort_values('rating', ascending=False)

# Attach user attributes so the ratings can be split by gender.
movielens = pd.merge(movie_ratings, users, on='user_id')

# Mean rating per movie with one column per gender value ('F' and 'M' are the
# columns sorted on below).
ratings_by_gender = movielens.pivot_table('rating',index=['movie_id'], columns='gender', aggfunc='mean')

# sort_values puts NaN last by default, so movies with no rating from the
# given gender end up at the bottom of each ranking.
female_top_ratings = ratings_by_gender.sort_values('F', ascending=False)
male_top_ratings = ratings_by_gender.sort_values('M', ascending=False)

In [4]:
# Popularity score per movie on the training ratings:
#   mean rating * number of ratings / (number of users * 5)
# The denominator is what the numerator would be if every user in `users`
# had given the movie a rating of 5.
ratings_per_movie = movie_ratings.groupby('movie_id', as_index=False)['rating']
movie_stats0 = ratings_per_movie.mean()
movie_stats1 = ratings_per_movie.count().rename(columns={'rating': 'num_of_ratings'})

movie_pop = movie_stats0.merge(movie_stats1, on='movie_id')
max_total_score = users.shape[0] * 5
movie_pop['popularity'] = (
    movie_pop['rating'].mul(movie_pop['num_of_ratings']).div(max_total_score)
)

# Most popular first, then bring the catalogue columns (title etc.) back in.
sorted_by_pop = movie_pop.sort_values(by='popularity', ascending=False)
popularity_with_names = sorted_by_pop.merge(movies, how='inner', on='movie_id')


In [5]:
# Load the held-out test ratings and compute the mean test rating per movie.
# The path uses forward slashes so it resolves on Windows, Linux and macOS;
# the earlier ".\\ml-100k\\..." spelling is only a valid path on Windows.
r_cols = ["user_id", "movie_id", "rating", "unix_timestamp"]
ratings_test = pd.read_csv("./ml-100k/u1.test", sep='\t', names=r_cols, encoding='latin-1')
movie_stats_test = ratings_test.groupby('movie_id', as_index=False)['rating'].mean()

# Both inputs carry a 'rating' column, so the merge suffixes them:
# rating_x is the test mean (left frame), rating_y the training mean (right
# frame). The inner join keeps only movies present in both tables.
train_and_test_results = pd.merge(movie_stats_test, popularity_with_names, on='movie_id', how='inner')

In [6]:
# Mean absolute error between the two per-movie rating columns of the merged
# frame (rating_x vs rating_y), averaged over all rows of that frame.
absolute_errors = (
    train_and_test_results['rating_x'] - train_and_test_results['rating_y']
).abs()
MAE = absolute_errors.sum() / len(train_and_test_results)
print("MAE = " + str(MAE))

MAE = 0.4740090531709019


In [7]:
# Build two recommendation lists of up to TOP_N movies for every user in the
# test split:
#   * recommend_by_rating - the best-ranked movies (by mean training rating)
#     that the user has not rated in the training split;
#   * recommend_by_random - a random baseline drawn from the catalogue movies
#     the user has not rated in the training split.
# NOTE: the random baseline is not seeded, so its rows differ between runs.
TOP_N = 20

movies_user_saw_train = (ratings.groupby('user_id')['movie_id'].apply(list)).to_dict()
list_recommended_by_rank = ratings_sorted['movie_id'].tolist()
# Draw random candidates from the ids actually present in the catalogue rather
# than assuming ids are the contiguous range 1..len(movies).
catalogue_movie_ids = movies['movie_id'].unique().tolist()

# Rows are collected in plain lists and turned into DataFrames once at the
# end; appending to a DataFrame row by row copies data on every insert.
rating_records = []
random_records = []
for user in ratings_test['user_id'].unique():
    # A set gives O(1) membership checks; .get() tolerates a test user with
    # no ratings in the training split (a plain lookup would raise KeyError).
    seen = set(movies_user_saw_train.get(user, ()))

    # Walk the ranking best-first and keep the first TOP_N unseen movies.
    num_of_mov_rec = 0
    for r_movie in list_recommended_by_rank:
        if r_movie not in seen:
            rating_records.append((user, r_movie))
            num_of_mov_rec += 1
            if num_of_mov_rec == TOP_N:
                break

    # Sample WITHOUT replacement so a user never gets the same movie twice,
    # and cap the sample size so the cell cannot loop forever when a user has
    # fewer than TOP_N unseen movies.
    unseen = [movie for movie in catalogue_movie_ids if movie not in seen]
    for rand_movie in random.sample(unseen, min(TOP_N, len(unseen))):
        random_records.append((user, rand_movie))

recommend_by_rating = pd.DataFrame(rating_records, columns=["user_id", "movie_id"])
recommend_by_random = pd.DataFrame(random_records, columns=["user_id", "movie_id"])
# Kept for backward compatibility: total number of rows in each table.
counter_by_rating = len(rating_records)
counter_by_random = len(random_records)

In [8]:
# Recall and precision of both recommenders over all users of the test split.
# A "hit" is a recommended movie that the user also rated in the test split.
movies_user_saw_test = (ratings_test.groupby('user_id')['movie_id'].apply(list)).to_dict()
movies_user_rec = (recommend_by_rating.groupby('user_id')['movie_id'].apply(list)).to_dict()
movies_user_rand = (recommend_by_random.groupby('user_id')['movie_id'].apply(list)).to_dict()

test_users = ratings_test['user_id'].unique()
# Distinct test movies per user, built once and reused for every count below.
test_movies_of = {uid: set(movies_user_saw_test[uid]) for uid in test_users}

sum_common_rec = sum(
    len(test_movies_of[uid].intersection(movies_user_rec[uid])) for uid in test_users
)
sum_common_rand = sum(
    len(test_movies_of[uid].intersection(movies_user_rand[uid])) for uid in test_users
)
sum_movies_saw = sum(len(test_movies_of[uid]) for uid in test_users)

# Recall: hits over all distinct (user, movie) pairs in the test split.
recall_rec = sum_common_rec / sum_movies_saw
recall_rand = sum_common_rand / sum_movies_saw

# Precision: hits over the number of recommendations, taken as 20 per user.
num_of_rec = len(test_users)
total_recommended = num_of_rec * 20
prec_rec = sum_common_rec / total_recommended
prec_rand = sum_common_rand / total_recommended

for label, value in (
    ("Recall (by recommendation): ", recall_rec),
    ("Recall (by random): ", recall_rand),
    ("Precision (by recommendation): ", prec_rec),
    ("Precision (by random): ", prec_rand),
):
    print(label + str(round(value, 3)))

Recall (by recommendation): 0.021
Recall (by random): 0.013
Precision (by recommendation): 0.046
Precision (by random): 0.028


In [9]:
# Recommendations for the female users of the test split, ranked by the mean
# rating that female users gave each movie in the training data, plus a
# random baseline drawn from the same list of movies.
# NOTE: the random baseline is not seeded, so its rows differ between runs.
full_data_test = pd.merge(ratings_test, users, on='user_id')
female_rows = full_data_test['gender'] == 'F'
female_data = full_data_test[female_rows]
# to_records() flattens the movie_id index into an ordinary column.
female_ratings = pd.DataFrame(female_top_ratings.to_records())
list_recommended_by_rank_female = female_ratings['movie_id'].tolist()

# Rows are collected in plain lists and turned into DataFrames once at the
# end; appending to a DataFrame row by row copies data on every insert.
female_rating_records = []
female_random_records = []
for user in female_data['user_id'].unique():
    # A set gives O(1) membership checks; .get() tolerates a test user with
    # no ratings in the training split (a plain lookup would raise KeyError).
    seen = set(movies_user_saw_train.get(user, ()))

    # Walk the ranking best-first and keep the first 20 unseen movies.
    num_of_mov_rec_female = 0
    for r_movie in list_recommended_by_rank_female:
        if r_movie not in seen:
            female_rating_records.append((user, r_movie))
            num_of_mov_rec_female += 1
            if num_of_mov_rec_female == 20:
                break

    # Sample WITHOUT replacement so a user never gets the same movie twice,
    # and cap the sample size so the cell cannot loop forever when a user has
    # fewer than 20 unseen movies.
    unseen = [movie for movie in list_recommended_by_rank_female if movie not in seen]
    for rand_movie in random.sample(unseen, min(20, len(unseen))):
        female_random_records.append((user, rand_movie))

recommend_by_rating_female = pd.DataFrame(female_rating_records, columns=["user_id","movie_id"])
recommend_by_random_female = pd.DataFrame(female_random_records, columns=["user_id","movie_id"])
# Kept for backward compatibility: total number of rows in each table.
counter_by_rating_female = len(female_rating_records)
counter_by_random_female = len(female_random_records)

In [10]:
# Recall and precision of both recommenders, restricted to the female users of
# the test split. A "hit" is a recommended movie the user rated in the test
# split.
movies_female_rec = (recommend_by_rating_female.groupby('user_id')['movie_id'].apply(list)).to_dict()
movies_female_rand = (recommend_by_random_female.groupby('user_id')['movie_id'].apply(list)).to_dict()

female_users = female_data['user_id'].unique()
# Distinct test movies per user, built once and reused for every count below.
female_test_movies = {uid: set(movies_user_saw_test[uid]) for uid in female_users}

sum_common_rec_female = sum(
    len(female_test_movies[uid].intersection(movies_female_rec[uid])) for uid in female_users
)
sum_common_rand_female = sum(
    len(female_test_movies[uid].intersection(movies_female_rand[uid])) for uid in female_users
)
sum_movies_saw_female = sum(len(female_test_movies[uid]) for uid in female_users)

# Recall: hits over all distinct (user, movie) pairs these users have in test.
recall_rec_female = sum_common_rec_female / sum_movies_saw_female
recall_rand_female = sum_common_rand_female / sum_movies_saw_female

# Precision: hits over the number of recommendations, taken as 20 per user.
num_of_rec_female = len(female_users)
total_recommended_female = num_of_rec_female * 20
prec_rec_female = sum_common_rec_female / total_recommended_female
prec_rand_female = sum_common_rand_female / total_recommended_female

for label, value in (
    ("Recall (by recommendation) for females: ", recall_rec_female),
    ("Recall (by random) for females: ", recall_rand_female),
    ("Precision (by recommendation) for females: ", prec_rec_female),
    ("Precision (by random) for females: ", prec_rand_female),
):
    print(label + str(round(value, 3)))

Recall (by recommendation) for females: 0.01
Recall (by random) for females: 0.014
Precision (by recommendation) for females: 0.019
Precision (by random) for females: 0.026


In [11]:
# Recommendations for the male users of the test split, ranked by the mean
# rating that male users gave each movie in the training data, plus a random
# baseline drawn from the same list of movies.
# NOTE: the random baseline is not seeded, so its rows differ between runs.
male_rows = full_data_test['gender'] == 'M'
male_data = full_data_test[male_rows]
# to_records() flattens the movie_id index into an ordinary column.
male_ratings = pd.DataFrame(male_top_ratings.to_records())
list_recommended_by_rank_male = male_ratings['movie_id'].tolist()

# Rows are collected in plain lists and turned into DataFrames once at the
# end; appending to a DataFrame row by row copies data on every insert.
male_rating_records = []
male_random_records = []
for user in male_data['user_id'].unique():
    # A set gives O(1) membership checks; .get() tolerates a test user with
    # no ratings in the training split (a plain lookup would raise KeyError).
    seen = set(movies_user_saw_train.get(user, ()))

    # Walk the ranking best-first and keep the first 20 unseen movies.
    num_of_mov_rec_male = 0
    for r_movie in list_recommended_by_rank_male:
        if r_movie not in seen:
            male_rating_records.append((user, r_movie))
            num_of_mov_rec_male += 1
            if num_of_mov_rec_male == 20:
                break

    # Sample WITHOUT replacement so a user never gets the same movie twice,
    # and cap the sample size so the cell cannot loop forever when a user has
    # fewer than 20 unseen movies.
    unseen = [movie for movie in list_recommended_by_rank_male if movie not in seen]
    for rand_movie in random.sample(unseen, min(20, len(unseen))):
        male_random_records.append((user, rand_movie))

recommend_by_rating_male = pd.DataFrame(male_rating_records, columns=["user_id", "movie_id"])
recommend_by_random_male = pd.DataFrame(male_random_records, columns=["user_id", "movie_id"])
# Kept for backward compatibility: total number of rows in each table.
counter_by_rating_male = len(male_rating_records)
counter_by_random_male = len(male_random_records)

In [12]:
# Recall and precision of both recommenders, restricted to the male users of
# the test split. A "hit" is a recommended movie the user rated in the test
# split.
movies_male_rec = (recommend_by_rating_male.groupby('user_id')['movie_id'].apply(list)).to_dict()
movies_male_rand = (recommend_by_random_male.groupby('user_id')['movie_id'].apply(list)).to_dict()

male_users = male_data['user_id'].unique()
# Distinct test movies per user, built once and reused for every count below.
male_test_movies = {uid: set(movies_user_saw_test[uid]) for uid in male_users}

sum_common_rec_male = sum(
    len(male_test_movies[uid].intersection(movies_male_rec[uid])) for uid in male_users
)
sum_common_rand_male = sum(
    len(male_test_movies[uid].intersection(movies_male_rand[uid])) for uid in male_users
)
sum_movies_saw_male = sum(len(male_test_movies[uid]) for uid in male_users)

# Recall: hits over all distinct (user, movie) pairs these users have in test.
recall_rec_male = sum_common_rec_male / sum_movies_saw_male
recall_rand_male = sum_common_rand_male / sum_movies_saw_male

# Precision: hits over the number of recommendations, taken as 20 per user.
num_of_rec_male = len(male_users)
total_recommended_male = num_of_rec_male * 20
prec_rec_male = sum_common_rec_male / total_recommended_male
prec_rand_male = sum_common_rand_male / total_recommended_male

for label, value in (
    ("Recall (by recommendation) for males: ", recall_rec_male),
    ("Recall (by random) for males: ", recall_rand_male),
    ("Precision (by recommendation) for males: ", prec_rec_male),
    ("Precision (by random) for males: ", prec_rand_male),
):
    print(label + str(round(value, 3)))

Recall (by recommendation) for males: 0.003
Recall (by random) for males: 0.013
Precision (by recommendation) for males: 0.006
Precision (by random) for males: 0.03
