### Description
This is a movie recommendation competition: we need to recommend movies to users based on their movie ratings and the information available about them.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the read-only Kaggle
# input directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/rec-sys-challenge-task-course-2023/train.csv
/kaggle/input/rec-sys-challenge-task-course-2023/kaggle_baseline.csv


### Load the data and split it into training and test datasets

In [2]:
# Load the competition's training ratings into a DataFrame and preview the first rows.
# NOTE(review): in the preview shown below, `release_date` appears to hold genre strings
# and the `sex` / `age` values look swapped relative to their headers — confirm against
# the raw CSV. The model only consumes user_id, movie_id and rating, so it is unaffected.
df = pd.read_csv('/kaggle/input/rec-sys-challenge-task-course-2023/train.csv')
df.head()

Unnamed: 0,user_id,title,movie_id,rating,release_date,sex,age
0,2592,Top Gun (1986),1101,4,Action|Romance,50,M
1,4318,12 Angry Men (1957),1203,4,Drama,25,M
2,2756,Robocop 2 (1990),2986,2,Action|Crime|Sci-Fi,18,M
3,1706,Modern Times (1936),3462,5,Comedy,25,M
4,4813,Milk Money (1994),276,3,Comedy|Romance,35,F


In [3]:
# Show the column labels of the training frame.
print(df.columns)

Index(['user_id', 'title', 'movie_id', 'rating', 'release_date', 'sex', 'age'], dtype='object')


In [4]:
# Report the observed bounds of the rating column; the printed values (1 and 5)
# are the bounds used for the Surprise Reader's rating_scale.
rating_column = df['rating']
print('Maximal rating is ', rating_column.max())
print('Minimal rating is ', rating_column.min())

Maximal rating is  5
Minimal rating is  1


As the KNNWithMeans algorithm from the Surprise library only considers user-item interactions, we use only the columns that reflect these interactions (i.e., `user_id`, `movie_id`, and `rating`).

In [5]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

# Fixed seed so the train / hold-out split, and therefore the reported RMSE,
# is identical on every "Restart & Run All".
RANDOM_STATE = 42

# The ratings in train.csv range from 1 to 5 (printed above); Surprise needs the scale up front.
reader = Reader(rating_scale=(1, 5))
# load_from_df expects exactly three columns, in the order: user, item, rating.
data = Dataset.load_from_df(df[['user_id', 'movie_id', 'rating']], reader)
# Hold out 25% of the ratings for evaluation.
trainset, testset = train_test_split(data, test_size=0.25, random_state=RANDOM_STATE)

### Fit the model on the training dataset

In [6]:
from surprise import KNNWithMeans
from surprise.model_selection import train_test_split
from surprise import accuracy

# Item-based KNNWithMeans: similarities are computed between movies
# (user_based=False) using the cosine measure.
sim_options = dict(name='cosine', user_based=False)
algo = KNNWithMeans(sim_options=sim_options)
algo.fit(trainset)

# Score the held-out ratings. accuracy.rmse prints the rounded RMSE itself and
# returns the value, which is then printed again at full precision.
predictions = algo.test(testset)
rmse = accuracy.rmse(predictions)
print(rmse)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9027
0.9026608320438605


### Generate predictions for the test dataset and create a submission file

In [7]:
# Baseline submission file; the code below reads its `user_id` column and splits its
# `prediction` column on whitespace to obtain the candidate movies for each user.
test = pd.read_csv('/kaggle/input/rec-sys-challenge-task-course-2023/kaggle_baseline.csv')

# Function to get top N recommendations for each user
def get_top_n_recommendations(algo, testset, n=25):
    """Rank each user's candidate items by estimated rating and keep the best ``n``.

    Args:
        algo: Object exposing ``test(testset)`` that returns an iterable of
            5-tuples ``(uid, iid, true_r, est, details)``.
        testset: Passed through unchanged to ``algo.test``.
        n: Maximum number of item ids kept per user (applied as the slice ``[:n]``).

    Returns:
        dict mapping each uid to a list of iids ordered by descending estimate.
        Items with equal estimates keep the order in which they were predicted,
        and users appear in the order they were first seen.
    """
    scored_by_user = {}
    for prediction in algo.test(testset):
        uid, iid, _true_r, est, _details = prediction
        scored_by_user.setdefault(uid, []).append((iid, est))

    return {
        uid: [
            iid
            for iid, _ in sorted(scored, key=lambda pair: pair[1], reverse=True)[:n]
        ]
        for uid, scored in scored_by_user.items()
    }

# Placeholder for the unknown true rating: Surprise test tuples are
# (user, item, rating), and only the estimated rating is used for ranking.
PLACEHOLDER_RATING = 4

# Build one (user, movie, rating) tuple per candidate movie listed in the baseline file.
# str.split() yields the movie ids as strings, while the model was trained on the numeric
# movie_id column of train.csv. Surprise looks raw ids up by exact value, so string ids
# would all be treated as unknown items and receive the same default estimate, which makes
# the ranking meaningless. Cast them to int so they match the training ids.
# NOTE(review): assumes every token in `prediction` is an integer movie id — confirm.
# NOTE: this rebinds `testset`, replacing the hold-out split created earlier; re-run the
# split cell before re-running the evaluation cell.
testset = [
    (user_id, int(movie_id), PLACEHOLDER_RATING)
    for user_id, candidates in zip(test['user_id'], test['prediction'])
    for movie_id in candidates.split()
]
top_n_recommendations = get_top_n_recommendations(algo, testset, n=25)

# Write the submission file: a header row, then one line per user in the form
# "<user_id>,<movie ids separated by single spaces>".
with open('submission.csv', 'w', encoding='UTF8') as f:
    f.write('user_id,prediction\n')
    f.writelines(
        f"{user_id},{' '.join(str(movie_id) for movie_id in recommendations)}\n"
        for user_id, recommendations in top_n_recommendations.items()
    )