In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162676 sha256=4b40ca14d5e7559be1077784d54e7255f1fe8b387710e49bfb23cc0c71a0861a
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from surprise.model_selection import cross_validate

In [4]:
df = pd.read_csv('Dataset.csv')
df

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742
...,...,...,...,...
99998,880,476,3,880175444
99999,716,204,5,879795543
100000,276,1090,1,874795795
100001,13,225,2,882399156


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100003 entries, 0 to 100002
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100003 non-null  int64
 1   item_id    100003 non-null  int64
 2   rating     100003 non-null  int64
 3   timestamp  100003 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [6]:
df.describe()

Unnamed: 0,user_id,item_id,rating,timestamp
count,100003.0,100003.0,100003.0,100003.0
mean,462.470876,425.520914,3.529864,883528800.0
std,266.622454,330.797791,1.125704,5343791.0
min,0.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [7]:
df.isnull().sum() #no null value

user_id      0
item_id      0
rating       0
timestamp    0
dtype: int64

In [9]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)

In [10]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)


In [11]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f11fa2b6530>

In [12]:
predictions = model.test(testset)

In [13]:
accuracy = rmse(predictions)
print(f"RMSE: {accuracy:.2f}")


RMSE: 0.9351
RMSE: 0.94


In [18]:
user_id_to_recommend = 3

In [19]:
movies_not_watched = df[df['user_id'] == user_id_to_recommend]['item_id'].unique()
movies_to_recommend = df[~df['item_id'].isin(movies_not_watched)]['item_id'].unique()


In [20]:
predicted_ratings = []
for movie_id in movies_to_recommend:
    predicted_rating = model.predict(user_id_to_recommend, movie_id).est
    predicted_ratings.append({'user_id': user_id_to_recommend, 'item_id': movie_id, 'predicted_rating': predicted_rating})

In [21]:
top_n = 5
recommended_movies = pd.DataFrame(predicted_ratings).nlargest(top_n, 'predicted_rating')
print(f"\nTop {top_n} Movie Recommendations for User {user_id_to_recommend}:\n")
print(recommended_movies[['item_id', 'predicted_rating']])


Top 5 Movie Recommendations for User 3:

     item_id  predicted_rating
536      963          4.357992
48       100          4.278762
158      483          4.258158
635      641          4.242174
195       56          4.143223
