In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy

movies_columns = ["movieId", "title", "genres"]
ratings_columns = ["userId", "movieId", "rating", "timestamp"]


movies = pd.read_csv("/Users/ivyadiele/Downloads/ml-1m/movies.dat", sep="::", names=movies_columns, engine="python", encoding="ISO-8859-1")
ratings = pd.read_csv("/Users/ivyadiele/Downloads/ml-1m/ratings.dat", sep="::", names=ratings_columns, engine="python")

print(movies.head())
print(ratings.head())

In [34]:
movie_data = pd.merge(ratings, movies, on= "movieId")

movie_data.drop(columns=["timestamp"], inplace=True)

In [14]:
movie_data.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,My Fair Lady (1964),Musical|Romance
3,1,3408,4,Erin Brockovich (2000),Drama
4,1,2355,5,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [20]:
tfidf = TfidfVectorizer(stop_words = "english")
tfidf_matrix = tfidf.fit_transform(movies["genres"].fillna(""))

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [22]:
def recommend_movies(title, num=5):
    idx = movies.loc[movies['title'] == title].index[0]
    scores = list(enumerate(cosine_sim[idx]))
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:num+1]
    
    recommended_movies = [movies.iloc[i[0]].title for i in sorted_scores]
    return recommended_movies

In [24]:
print(recommend_movies("Toy Story (1995)"))

['Aladdin and the King of Thieves (1996)', 'American Tail, An (1986)', 'American Tail: Fievel Goes West, An (1991)', 'Rugrats Movie, The (1998)', "Bug's Life, A (1998)"]


In [28]:
print(recommend_movies("My Fair Lady (1964)"))

['American in Paris, An (1951)', 'My Fair Lady (1964)', 'Dirty Dancing (1987)', 'West Side Story (1961)', 'Gay Divorcee, The (1934)']


In [38]:
reader = Reader(rating_scale=(0.5, 5.0))

data = Dataset.load_from_df(movie_data[['userId', 'movieId', 'rating']], reader)

In [40]:
trainset, testset = train_test_split(data, test_size=0.2)

In [42]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x169770140>

In [50]:
predictions = model.test(testset)

In [52]:
accuracy.rmse(predictions)

RMSE: 0.8730


0.8730312251764153

In [None]:
movie_data.info()

In [None]:
mtotal_movies = movies["movieId"].nunique()
unique_users = ratings["userId"].nunique()
total_ratings = ratings.shape[0]

print(f"Total Movies: {total_movies}")
print(f"Unique Users: {unique_users}")
print(f"Total Ratings: {total_ratings}")