In [51]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
import os
import surprise
from surprise import Dataset
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [25]:
movielens = Dataset.load_builtin('ml-1m', prompt=False)

In [33]:
movies_file = f"{surprise.get_dataset_dir()}/ml-1m/ml-1m/movies.dat"
users_file = f"{surprise.get_dataset_dir()}/ml-1m/ml-1m/users.dat"
ratings_file = f"{surprise.get_dataset_dir()}/ml-1m/ml-1m/ratings.dat"

In [108]:
print(os.path.exists(movies_file))
print(os.path.exists(users_file))
print(os.path.exists(ratings_file))

True
True
True


In [55]:
movies_df = pd.read_csv(movies_file, sep="::", names=["MovieID", "Title", "Genres"], engine='python')
users_df = pd.read_csv(users_file, sep="::", names=["UserID", "Gender", "Age", "Occupation", "Zip-code"], engine='python')
ratings_df = pd.read_csv(ratings_file, sep="::", names=["UserID", "MovieID", "Rating", "Timestamp"], engine='python')

Genres-based recommendation

In [56]:
movies_df['Genres'] = movies_df['Genres'].apply(lambda genres: " ".join(genres.split("|")))

In [72]:
movies_df.head(5)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation Children's Comedy
1,2,Jumanji (1995),Adventure Children's Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama
4,5,Father of the Bride Part II (1995),Comedy


In [60]:
sum(movies_df['Genres'].isna())

0

In [114]:
tfidf = TfidfVectorizer(max_features=20,analyzer='word')
tfidf_matrix = tfidf.fit_transform(movies_df["Genres"])
print(tfidf_matrix.shape)

(3883, 20)


In [115]:
cv = CountVectorizer(max_features=20, analyzer='word')
cv_matrix = cv.fit_transform(movies_df["Genres"]).toarray()
print(cv_matrix.shape)

(3883, 20)


In [116]:
cosine_sim_tf_idf = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim_tf_idf[:4, :4])

cosine_sim_cv = linear_kernel(cv_matrix, cv_matrix)
print(cosine_sim_cv[:4, :4])

names_to_indices = pd.Series(movies_df.index, index = movies_df['Title'])

[[1.         0.30552517 0.19737232 0.26019351]
 [0.30552517 1.         0.         0.        ]
 [0.19737232 0.         1.         0.43309256]
 [0.26019351 0.         0.43309256 1.        ]]
[[3. 1. 1. 1.]
 [1. 3. 0. 0.]
 [1. 0. 2. 1.]
 [1. 0. 1. 2.]]


In [120]:
def recommend_by_name(name, sim_matrix, n=10):
    idx = names_to_indices[name]
    sims = list(enumerate(sim_matrix[idx]))
    sims = sorted(sims,key= lambda x:x[1], reverse=True)
    top_sims = sims[1:n+1]
    top_indicies = [i[0] for i in top_sims]
    return movies_df.iloc[top_indicies]

recommended_films = recommend_by_name("Toy Story (1995)", cosine_sim_cv)

print("Because you watched Toy Story (1995), you might also like:")
for idx,row in recommended_films.iterrows():
    print(f"\t{row['Title']}")

print()
recommended_films = recommend_by_name("Toy Story (1995)", cosine_sim_tf_idf)

print("Because you watched Toy Story (1995), you might also like:")
for idx,row in recommended_films.iterrows():
    print(f"\t{row['Title']}")


Because you watched Toy Story (1995), you might also like:
	Goofy Movie, A (1995)
	Aladdin (1992)
	Space Jam (1996)
	Aladdin and the King of Thieves (1996)
	Hercules (1997)
	Jungle Book, The (1967)
	Lady and the Tramp (1955)
	Little Mermaid, The (1989)
	Steamboat Willie (1940)
	American Tail, An (1986)

Because you watched Toy Story (1995), you might also like:
	Aladdin and the King of Thieves (1996)
	American Tail, An (1986)
	American Tail: Fievel Goes West, An (1991)
	Rugrats Movie, The (1998)
	Bug's Life, A (1998)
	Toy Story 2 (1999)
	Saludos Amigos (1943)
	Chicken Run (2000)
	Adventures of Rocky and Bullwinkle, The (2000)
	Balto (1995)


Genres+Title-based recommendation


In [123]:
movies_df['GT'] = movies_df["Title"]+" "+movies_df["Genres"]

In [134]:
tfidf = TfidfVectorizer(max_features=400,analyzer='word')
tfidf_matrix = tfidf.fit_transform(movies_df["GT"])
print(tfidf_matrix.shape)

(3883, 400)


In [135]:
cv = CountVectorizer(max_features=30, analyzer='word')
cv_matrix = cv.fit_transform(movies_df["GT"]).toarray()
print(cv_matrix.shape)

(3883, 30)


In [136]:
cosine_sim_tf_idf = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim_tf_idf[:4, :4])

cosine_sim_cv = linear_kernel(cv_matrix, cv_matrix)
print(cosine_sim_cv[:4, :4])

names_to_indices = pd.Series(movies_df.index, index = movies_df['Title'])

[[1.         0.34439139 0.16729851 0.27076221]
 [0.34439139 1.         0.13804774 0.22342166]
 [0.16729851 0.13804774 1.         0.23454174]
 [0.27076221 0.22342166 0.23454174 1.        ]]
[[4. 2. 2. 2.]
 [2. 3. 1. 1.]
 [2. 1. 3. 2.]
 [2. 1. 2. 3.]]


In [137]:
def recommend_by_name(name, sim_matrix, n=10):
    idx = names_to_indices[name]
    sims = list(enumerate(sim_matrix[idx]))
    sims = sorted(sims,key= lambda x:x[1], reverse=True)
    top_sims = sims[1:n+1]
    top_indicies = [i[0] for i in top_sims]
    return movies_df.iloc[top_indicies]

recommended_films = recommend_by_name("Toy Story (1995)", cosine_sim_cv)

print("Because you watched Toy Story (1995), you might also like:")
for idx,row in recommended_films.iterrows():
    print(f"\t{row['Title']}")

print()
recommended_films = recommend_by_name("Toy Story (1995)", cosine_sim_tf_idf)

print("Because you watched Toy Story (1995), you might also like:")
for idx,row in recommended_films.iterrows():
    print(f"\t{row['Title']}")

Because you watched Toy Story (1995), you might also like:
	Goofy Movie, A (1995)
	Balto (1995)
	Babe (1995)
	Pocahontas (1995)
	Big Green, The (1995)
	Gumby: The Movie (1995)
	Kid in King Arthur's Court, A (1995)
	Aladdin (1992)
	Space Jam (1996)
	Close Shave, A (1995)

Because you watched Toy Story (1995), you might also like:
	Toy Story 2 (1999)
	Balto (1995)
	Lilian's Story (1995)
	Close Shave, A (1995)
	Pyromaniac's Love Story, A (1995)
	Pocahontas (1995)
	Babe (1995)
	Goofy Movie, A (1995)
	We're Back! A Dinosaur's Story (1993)
	Mulan (1998)
