In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy import sparse
from sklearn.metrics import mean_squared_error

from sklearn.metrics.pairwise import cosine_similarity

from surprise import BaselineOnly, Dataset, KNNBaseline, Reader, SlopeOne, SVD, SVDpp
from surprise.model_selection import GridSearchCV, KFold, cross_validate


In [None]:
# Read the two CSV tables used throughout the notebook from the local
# "ml-20m" folder. The reads are independent of each other.
ratings = pd.read_csv(filepath_or_buffer="ml-20m/ratings.csv")
movies = pd.read_csv(filepath_or_buffer="ml-20m/movies.csv")


In [None]:
# Overview of the movies table: column summary first, then the first rows.
print("### Movies ###")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
movies.info()
# Bare last expression so the notebook renders the preview as a table.
movies.head()


In [None]:
# Overview of the ratings table: column summary first, then the first rows.
print("### Ratings ###")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
ratings.info()
# Bare last expression so the notebook renders the preview as a table.
ratings.head()


In [None]:
# Basic statistics

# Descriptive statistics of the ratings table.
print(ratings.describe())

# Histogram of ratings
# Each figure gets its own explicit Axes so the two plots in this cell never
# depend on pyplot's implicit "current axes" state.
fig, ax = plt.subplots()
ratings["rating"].hist(bins=30, ax=ax)
ax.set_title("Distribution of Movie Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
plt.show()

# Number of ratings per movie
# Select the "rating" column before aggregating: counting every column of the
# grouped frame and then keeping one gives the same Series for more work.
ratings_per_movie = ratings.groupby("movieId")["rating"].count()
fig, ax = plt.subplots()
ratings_per_movie.hist(bins=50, ax=ax)
ax.set_title("Number of Ratings per Movie")
ax.set_xlabel("Number of Ratings")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Fixed seed shared by the fold splitter and the SVD initialisation, so the
# reported RMSE/MAE are the same every time the notebook is re-run.
RANDOM_STATE = 42

# Load the data into Surprise format.
# rating_scale tells Surprise the lowest and highest possible rating; the
# frame passed to load_from_df holds the user, item and rating columns in
# that order.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

# Use SVD for matrix factorization
svd = SVD(random_state=RANDOM_STATE)

# Cross-validation to evaluate the algorithm: 5 shuffled folds, reporting
# RMSE and MAE per fold. Left as the cell's last expression so the returned
# results are displayed as well as the verbose table.
cross_validate(
    svd,
    data,
    measures=["RMSE", "MAE"],
    cv=KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    verbose=True,
)


In [None]:
# Build a trainset from `data` via build_full_trainset() and fit the `svd`
# model (the same instance that was cross-validated) on it.
trainset = data.build_full_trainset()
# Last expression of the cell, so whatever fit() returns is displayed.
svd.fit(trainset)
