In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics.pairwise import cosine_similarity

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test sets into DataFrames.
train_df = pd.read_csv(
    '/kaggle/input/edsa-movie-recommender-challenge-2022/train.csv'
)
test_df = pd.read_csv(
    '/kaggle/input/edsa-movie-recommender-challenge-2022/test.csv'
)

In [None]:
# Drop the timestamp column; the cells below only use userId, movieId and rating.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second execution no longer raises KeyError because the
# column has already been removed.
train_df = train_df.drop(columns=['timestamp'], errors='ignore')

In [None]:
# Draw a reproducible random subset of 40,000 training ratings
# (fixed random_state so every run selects the same rows).
sample_train_df = train_df.sample(
    n=40000,
    random_state=37,
)

In [None]:
# Build the user x movie rating matrix: one row per userId, one column per
# movieId. Cells with no rating in the sample are filled with 0.0.
sample_pivot = sample_train_df.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
    fill_value=0.0,
)

In [None]:
# Pairwise cosine similarity between the rows (users) of the rating matrix.
# The result is a plain array; row/column labels are attached in a later step.
sample_similarity = cosine_similarity(
    sample_pivot
)

In [None]:
# Wrap the similarity values in a DataFrame labelled by userId on both axes,
# so similarities can be looked up by user label.
sample_similarity = pd.DataFrame(
    data=sample_similarity,
    index=sample_pivot.index,
    columns=sample_pivot.index,
)

# Make a prediction

In [None]:
def predicted_rating(user, movie, N=20):
    """Predict ``user``'s rating of ``movie`` from the ratings of similar users.

    Reads three notebook-level objects: ``sample_train_df`` (the sampled
    ratings), ``sample_pivot`` (user x movie rating matrix, 0.0 where the
    sample holds no rating) and ``sample_similarity`` (user x user similarity
    DataFrame labelled by userId on both axes).

    Parameters
    ----------
    user : label
        userId to predict for.
    movie : label
        movieId to predict.
    N : int, default 20
        Number of most-similar *other* users to consult.

    Returns
    -------
    float
        The prediction rounded to the nearest 0.5. Fallbacks:
        * user and movie both unseen in the sample -> global mean rating;
        * only the user unseen -> the movie's mean rating;
        * only the movie unseen -> the user's mean rating;
        * none of the N neighbours rated the movie (or their similarities
          sum to zero) -> the user's mean rating.
    """

    def to_half_step(value):
        # Round to the nearest multiple of 0.5.
        return round(value * 2) / 2

    user_known = user in sample_similarity.columns
    movie_known = movie in sample_pivot.columns

    if not user_known and not movie_known:
        return to_half_step(sample_train_df['rating'].mean())

    if not user_known:
        movie_rows = sample_train_df['movieId'] == movie
        return to_half_step(sample_train_df.loc[movie_rows, 'rating'].mean())

    user_rows = sample_train_df['userId'] == user
    user_mean = sample_train_df.loc[user_rows, 'rating'].mean()

    if not movie_known:
        return to_half_step(user_mean)

    # Top-N most similar users, excluding the target user. The original code
    # called drop() without keeping its result, so the user (similarity 1.0
    # to themselves) always stayed in their own neighbourhood.
    neighbours = (
        sample_similarity.loc[:, user]
        .drop(labels=user, errors='ignore')
        .sort_values(ascending=False)
        .head(N)
    )

    # 0.0 in the pivot is the fill value for "no rating in the sample", not a
    # score, so those neighbours must not take part in the weighted average.
    # NOTE(review): assumes genuine ratings are strictly positive - confirm.
    neighbour_ratings = sample_pivot.loc[neighbours.index, movie]
    has_rated = neighbour_ratings > 0
    weights = neighbours[has_rated]
    total_weight = weights.sum()

    # No usable neighbour: dividing by zero would give NaN, which round()
    # rejects with ValueError, so fall back to the user's own average.
    if not total_weight > 0:
        return to_half_step(user_mean)

    weighted_sum = (weights * neighbour_ratings[has_rated]).sum()
    return to_half_step(weighted_sum / total_weight)

In [None]:
# Predict a rating for every (userId, movieId) pair in the test set.
# Walking the two columns in lockstep avoids one label lookup per row, so the
# loop no longer depends on test_df having a default 0..n-1 index.
rating_list = [
    predicted_rating(user_id, movie_id, N=20)
    for user_id, movie_id in zip(test_df['userId'], test_df['movieId'])
]

In [None]:
# Start the submission from a copy of the test set with the predictions
# attached as a new 'rating' column; test_df itself is left untouched.
submission_df = test_df.assign(rating=rating_list)

In [None]:
# Build the submission key "<userId>_<movieId>" for every row.
user_part = submission_df["userId"].astype(str)
movie_part = submission_df["movieId"].astype(str)
submission_df["Id"] = user_part.str.cat(movie_part, sep='_')

In [None]:
# The separate id columns are now encoded in 'Id'; remove them.
submission_df = submission_df.drop(columns=['userId', 'movieId'])

In [None]:
# Save the submission to the working directory, without the row index column.
submission_df.to_csv(
    'UL_predict_submission_8.csv',
    index=False,
)