In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
from IPython.display import display 

# Import Data

In [2]:
# Read only the columns that the rest of the notebook uses
df_movies = pd.read_csv(
    "assets/movies.csv",
    usecols=["movieId", "title"],
)
df_ratings = pd.read_csv(
    "assets/ratings.csv",
    usecols=["userId", "movieId", "rating"],
)

# Row count followed by a short preview of each frame
print(f"Movies dataframe: ({len(df_movies):,} rows)")
display(df_movies.head())

print(f"Ratings dataframe: ({len(df_ratings):,} rows)")
display(df_ratings.head())

Movies dataframe: (58,098 rows)


Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


Ratings dataframe: (27,753,444 rows)


Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5


# Create Sparse Matrix

In [3]:
# in part credit to Kevy / ChatGPT

# Taking the categories from df_movies makes each movie's code equal to its row
# position in df_movies, so matrix row i describes the movie in df_movies row i.
movies_cat = pd.Categorical(df_ratings['movieId'], categories=df_movies['movieId'])
users_cat = pd.Categorical(df_ratings['userId'])

# create sparse matrix (rows = movies, columns = users)
# The shape is passed explicitly: without it the number of rows is inferred from
# the largest movie code that actually occurs in the ratings, so movies at the end
# of df_movies that have no ratings would be missing from the matrix.
mat_movies_users = csr_matrix(
    (df_ratings['rating'], (movies_cat.codes, users_cat.codes)),
    shape=(len(movies_cat.categories), len(users_cat.categories)),
)

mat_movies_users

<58098x283228 sparse matrix of type '<class 'numpy.float64'>'
	with 27753444 stored elements in Compressed Sparse Row format>

# Set up Model

In [4]:
# Cosine distance between rating vectors as the metric; the brute-force
# algorithm compares the query against every row of the matrix.
model_KNN = NearestNeighbors(algorithm="brute", metric="cosine")
model_KNN.fit(mat_movies_users)  # fit on the sparse movie x user matrix

# Test function for index mismatch search

In [5]:
def test(indices):
    """
    Check that rows of mat_movies_users line up with rows of df_movies.

    For every index in `indices`, the ratings of the movie found at that row of
    df_movies are summed from df_ratings and compared with the sum of the same
    row of mat_movies_users. Every mismatch is printed, followed by a summary
    line with the total number of mismatches.

    Parameters
    ----------
    indices : iterable of int
        Row indices to check. May be empty, in which case zero errors are reported.

    Returns
    -------
    None
        Results are printed only.
    """
    # The counter is initialised once, before the loop, so mismatches accumulate
    # over all indices and the summary also works for an empty iterable.
    errors = 0

    for idx in indices:
        # `movie_id` rather than `id`, to avoid shadowing the builtin
        movie_id = df_movies.loc[idx, "movieId"]
        sum_dataframe = df_ratings.loc[df_ratings["movieId"] == movie_id, "rating"].sum()
        sum_matrix = mat_movies_users[idx].sum()

        if sum_dataframe != sum_matrix:
            print(f"Comparing index {idx}, result: {sum_dataframe == sum_matrix} ({sum_dataframe, sum_matrix})")
            errors += 1

    if errors == 0:
        print("All indices tested with no mismatches")
    print(f"{errors} errors found")

Loop through indices to see if there are any mismatches

In [6]:
# Row positions 9700 through 9741 (the upper bound of range is exclusive)
indices = list(range(9700, 9742))
test(indices)

All indices tested with no mismatches
0 errors found


# Recommendation Function

In [7]:

def recommender(movie_name, model = model_KNN, data = mat_movies_users, n_recommendations = 9):
    """
    Print the movies whose rating vectors are nearest to the best title match.

    `movie_name` is fuzzy-matched against df_movies["title"]; the matched row of
    `data` is then used to query `model` for its nearest neighbours, and their
    titles are printed from closest to farthest.

    Parameters
    ----------
    movie_name : str
        Free-text query; it does not have to be an exact title.
    model : fitted neighbours model exposing `kneighbors`
        Defaults to the notebook's model_KNN.
    data : matrix with one row per row of df_movies
        Defaults to mat_movies_users.
    n_recommendations : int
        Number of titles to print.

    Returns
    -------
    None
        Results are printed only.
    """
    # Element [2] of the match is the df_movies index of the best-scoring title
    idx = process.extractOne(movie_name, df_movies["title"])[2]

    # Ask for one extra neighbour because the query movie is normally among its
    # own nearest neighbours; only the indices are needed, not the distances.
    neighbours = model.kneighbors(
        data[idx],
        n_neighbors = n_recommendations + 1,
        return_distance = False,
    ).flatten()

    # Remove the query by index instead of blindly dropping the first result:
    # when distances tie, the query is not guaranteed to be in first position
    # and could otherwise be recommended to itself.
    indices = neighbours[neighbours != idx][:n_recommendations]

    # print results:
    print(f"Movie Selected: \"{df_movies.loc[idx]['title']}\"\n") # selected movie title
    for rank, i in enumerate(indices, start = 1): # closest neighbour first
        print(f"{rank}. {df_movies.loc[i]['title']}")

# Enter a movie to get recommendations

### A few example queries are tried below:

In [8]:
# partial, lower-case query: the full title is resolved through fuzzy matching
recommender("pinocchio")

Movie Selected: "Pinocchio (1940)"

1. Snow White and the Seven Dwarfs (1937)
2. Dumbo (1941)
3. Cinderella (1950)
4. Bambi (1942)
5. Alice in Wonderland (1951)
6. Fantasia (1940)
7. Peter Pan (1953)
8. Mary Poppins (1964)
9. Sleeping Beauty (1959)


In [9]:
# franchise name shared by several titles: only the single best match is used
recommender("star wars")

Movie Selected: "Star Wars: Episode IV - A New Hope (1977)"

1. Star Wars: Episode V - The Empire Strikes Back (1980)
2. Star Wars: Episode VI - Return of the Jedi (1983)
3. Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
4. Matrix, The (1999)
5. Indiana Jones and the Last Crusade (1989)
6. Back to the Future (1985)
7. Terminator, The (1984)
8. Alien (1979)
9. Toy Story (1995)


In [10]:
# sequel query that includes the instalment number
recommender("iron man 3")

Movie Selected: "Iron Man 3 (2013)"

1. Iron Man 2 (2010)
2. Captain America: The Winter Soldier (2014)
3. Captain America: The First Avenger (2011)
4. Avengers: Age of Ultron (2015)
5. Avengers, The (2012)
6. Thor: The Dark World (2013)
7. Thor (2011)
8. X-Men: Days of Future Past (2014)
9. X-Men: First Class (2011)


In [11]:
# another numbered sequel, this time an animated film
recommender("shrek 2")

Movie Selected: "Shrek 2 (2004)"

1. Shrek (2001)
2. Finding Nemo (2003)
3. Incredibles, The (2004)
4. Ice Age (2002)
5. Monsters, Inc. (2001)
6. Spider-Man 2 (2004)
7. Pirates of the Caribbean: The Curse of the Black Pearl (2003)
8. Spider-Man (2002)
9. Harry Potter and the Prisoner of Azkaban (2004)


In [12]:
# single keyword that appears in several titles
recommender("amityville")

Movie Selected: "Amityville 1992: It's About Time (1992)"

1. Amityville: A New Generation (1993)
2. Amityville: Dollhouse (1996)
3. Amityville Curse, The (1990)
4. Amityville 3-D (1983)
5. Amityville II: The Possession (1982)
6. Children of the Corn II: The Final Sacrifice (1993)
7. Children of the Corn III (1994)
8. Body Parts (1991)
9. Gate II: Trespassers, The (1990)


In [14]:
# a name that prefixes several titles
recommender("bill burr")

Movie Selected: "Bill Burr: I'm Sorry You Feel That Way (2014)"

1. Bill Burr: You People Are All the Same (2012)
2. Bill Burr: Why Do I Do This? (2008)
3. Bill Burr: Let It Go (2010)
4. Bill Burr: Walk Your Way Out (2017)
5. Jim Jefferies: BARE (2014)
6. Louis C.K.: Oh My God (2013)
7. Jim Jefferies: Freedumb (2016)
8. Louis C.K.: One Night Stand (2005)
9. Louis C.K.: Chewed Up (2008)


In [15]:
# single-word query matching the first film of a series
recommender("sharknado")

Movie Selected: "Sharknado (2013)"

1. Sharknado 2: The Second One (2014)
2. Sharknado 3: Oh Hell No! (2015)
3. Sharknado 4: The 4th Awakens (2016)
4. Lavalantula (2015)
5. Zombeavers (2014)
6. 2-Headed Shark Attack (2012)
7. Piranhaconda (2012)
8. Evil Dead (2013)
9. Piranha (Piranha 3D) (2010)
