In [51]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
import os
import surprise
from surprise import Dataset, Reader
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [25]:
# Load the built-in MovieLens-1M dataset through Surprise; prompt=False skips
# the interactive confirmation before downloading.
movielens = Dataset.load_builtin(
    name='ml-1m',
    prompt=False,
)

In [33]:
# Paths to the three raw MovieLens-1M files, resolved relative to Surprise's
# dataset directory (note the nested "ml-1m/ml-1m" folder in each path).
movies_file = f"{surprise.get_dataset_dir()}/ml-1m/ml-1m/movies.dat"
users_file = f"{surprise.get_dataset_dir()}/ml-1m/ml-1m/users.dat"
ratings_file = f"{surprise.get_dataset_dir()}/ml-1m/ml-1m/ratings.dat"

In [108]:
# Sanity check: print True/False for each raw file, in
# movies / users / ratings order.
for dat_path in (movies_file, users_file, ratings_file):
    print(os.path.exists(dat_path))

True
True
True


In [55]:
# The .dat files are read without a header row (column names are supplied
# here) and use "::" between fields; a multi-character separator requires
# pandas' python engine.
# The encoding is given explicitly: MovieLens-1M titles contain Latin-1
# accented characters, and relying on the UTF-8 default raises
# UnicodeDecodeError on recent pandas versions.
movies_df = pd.read_csv(
    movies_file,
    sep="::",
    names=["MovieID", "Title", "Genres"],
    engine='python',
    encoding="latin-1",
)
users_df = pd.read_csv(
    users_file,
    sep="::",
    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],
    engine='python',
    encoding="latin-1",
)
ratings_df = pd.read_csv(
    ratings_file,
    sep="::",
    names=["UserID", "MovieID", "Rating", "Timestamp"],
    engine='python',
    encoding="latin-1",
)

In [56]:
# Turn the pipe-delimited genre list into a space-delimited one so that each
# genre can later be picked up as a separate word by the vectorizer.
movies_df['Genres'] = movies_df['Genres'].map(
    lambda pipe_separated: pipe_separated.replace("|", " ")
)

In [72]:
# Preview the first rows to confirm the genre column is now space-separated.
movies_df.head(n=5)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation Children's Comedy
1,2,Jumanji (1995),Adventure Children's Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama
4,5,Father of the Bride Part II (1995),Comedy


In [60]:
# Number of movies with a missing genre string.
movies_df['Genres'].isna().sum()

0

In [65]:
# Represent every movie as a TF-IDF vector over the words of its genre string,
# keeping at most 20 vocabulary terms.
# NOTE(review): with the default word tokenisation, punctuation is treated as
# a separator and single-character tokens are dropped, so "Children's" is
# presumably indexed as "children" and any hyphenated genre name would be
# split into two terms - confirm this is intended.
tfidf = TfidfVectorizer(
    analyzer='word',
    max_features=20,
)
x = tfidf.fit_transform(movies_df["Genres"])

In [77]:
# Dimensions of the TF-IDF matrix: (number of movies, number of genre terms).
x.shape

(3883, 20)

In [66]:
from sklearn.metrics.pairwise import linear_kernel

# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel already equals the cosine similarity.
cosine_sim = linear_kernel(x, Y=x)

# Show only the top-left corner; the full matrix has one row and one column
# per movie.
cosine_sim[:4, :4]

array([[1.        , 0.30552517, 0.19737232, 0.26019351],
       [0.30552517, 1.        , 0.        , 0.        ],
       [0.19737232, 0.        , 1.        , 0.43309256],
       [0.26019351, 0.        , 0.43309256, 1.        ]])

In [81]:
# Lookup table from movie title to its index label in movies_df.
# NOTE(review): the label is later used as a row position in cosine_sim, which
# assumes movies_df still has its default 0..N-1 index - confirm.
names_to_indices = pd.Series(
    data=movies_df.index,
    index=movies_df['Title'],
)

In [106]:
def recommend_by_name(name, n=10):
    """Return the ``n`` movies whose genre profile is closest to ``name``.

    Similarity scores are read from the row of the notebook-level
    ``cosine_sim`` matrix that ``names_to_indices`` maps ``name`` to. The
    matching rows of ``movies_df`` are returned, most similar first; movies
    with equal scores keep their original (ascending index) order.

    Args:
        name: Movie title exactly as it appears in ``movies_df['Title']``.
        n: Maximum number of recommendations. Values below 0 are treated as 0.

    Returns:
        pandas.DataFrame: up to ``n`` rows of ``movies_df``. The queried
        movie itself is never part of the result.

    Raises:
        KeyError: if ``name`` is not present in ``names_to_indices``.
    """
    match = names_to_indices[name]
    # A title that occurs more than once makes the lookup return a Series
    # instead of a scalar; use the first occurrence so that the similarity
    # row below stays one-dimensional.
    if isinstance(match, pd.Series):
        match = match.iloc[0]
    idx = int(match)

    scores = cosine_sim[idx]
    # Exclude the query movie by index instead of assuming it sorts first:
    # movies with identical genres tie with it at the top score, so slicing
    # off "the first element" could drop a real match and keep the query.
    candidates = [i for i in range(len(scores)) if i != idx]
    # list.sort is stable (also with reverse=True), so ties stay in index order.
    candidates.sort(key=lambda i: scores[i], reverse=True)
    return movies_df.iloc[candidates[:max(n, 0)]]

# Demo: genre-based neighbours of Toy Story.
recommended_films = recommend_by_name("Toy Story (1995)")

print("Because you watched Toy Story (1995), you might also like:")
# Only the title column is printed, so iterate over it directly.
for film_title in recommended_films['Title']:
    print(f"\t{film_title}")

Because you watched Toy Story (1995), you might also like:
	Aladdin and the King of Thieves (1996)
	American Tail, An (1986)
	American Tail: Fievel Goes West, An (1991)
	Rugrats Movie, The (1998)
	Bug's Life, A (1998)
	Toy Story 2 (1999)
	Saludos Amigos (1943)
	Chicken Run (2000)
	Adventures of Rocky and Bullwinkle, The (2000)
	Balto (1995)


In [107]:
# Demo: genre-based neighbours of Jumanji.
recommended_films = recommend_by_name("Jumanji (1995)")

print("Because you watched Jumanji (1995), you might also like:")
# Only the title column is printed, so iterate over it directly.
for film_title in recommended_films['Title']:
    print(f"\t{film_title}")

Because you watched Jumanji (1995), you might also like:
	Kids of the Round Table (1995)
	Indian in the Cupboard, The (1995)
	NeverEnding Story III, The (1994)
	Escape to Witch Mountain (1975)
	Labyrinth (1986)
	Goonies, The (1985)
	Darby O'Gill and the Little People (1959)
	NeverEnding Story, The (1984)
	NeverEnding Story II: The Next Chapter, The (1990)
	Santa Claus: The Movie (1985)
