# Movie Recommender System Analysis

This notebook explores the MovieLens dataset and builds components for a recommendation system.


## Setup: Import Libraries


In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

## Part 1: Load and Explore Ratings Data

The `u.data` file contains one user rating per row, with the columns `user_id`, `item_id` (the movie ID), `rating`, and `timestamp`.


In [3]:
# Load the tab-separated ratings file. Column names are supplied here, so the
# first line of the file is read as data rather than as a header.
ratings_df = pd.read_csv(
    "data/u.data",
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating", "timestamp"],
    engine="python",
)

print(f"Dataset shape: {ratings_df.shape}")
ratings_df.head()

Dataset shape: (100000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Display dataset info: per-column dtype and non-null count, plus memory usage
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   item_id    100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [5]:
# Basic statistics (count, mean, std, min, quartiles, max) for every numeric column
ratings_df.describe()

Unnamed: 0,user_id,item_id,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


## Part 2: Load Genre Information

The `u.genre` file contains the list of all movie genres.


In [6]:
# Load the pipe-separated genre lookup table and discard any incomplete rows
# in the same expression, so `genres_df` is only ever bound to the cleaned frame.
genres_df = pd.read_csv(
    "data/u.genre",
    sep="|",
    names=["genre", "genre_id"],
    encoding="latin-1",
).dropna()

# Ordered list of genre names; later cells use it as the genre column names.
genre_names = genres_df["genre"].to_list()

print(f"Total genres: {len(genre_names)}")
print(f"Genres: {genre_names}")

Total genres: 19
Genres: ['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [7]:
# Preview the first rows of the genre lookup table (genre name and its genre_id)
genres_df.head()

Unnamed: 0,genre,genre_id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4


## Part 3: Load Movies with Genre Vectors

The `u.item` file contains movie information including binary multi-hot vectors for genres.


In [10]:
# Column layout used for u.item: five metadata fields followed by one column
# per genre, in the order given by `genre_names`.
column_names = [
    "movie_id",
    "movie_title",
    "release_date",
    "video_release_date",
    "imdb_url",
    *genre_names,
]

movies_df = pd.read_csv(
    "data/u.item",
    sep="|",
    names=column_names,
    encoding="latin-1",
)

print(f"Total movies: {len(movies_df)}")
movies_df.head()

Total movies: 1682


Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Part 4: Extract Genre Vectors (Multi-Hot Encoding)

Each movie has a binary vector where 1 indicates the movie belongs to that genre.


In [20]:
# Keep only the identifying columns plus the genre flag columns
genre_vectors = movies_df.loc[:, ["movie_id", "movie_title", *genre_names]]
genre_vectors.head(10)

Unnamed: 0,movie_id,movie_title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,Twelve Monkeys (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,8,Babe (1995),0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,Dead Man Walking (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,Richard III (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


## Part 5: Genre Statistics


In [12]:
# Summing each genre flag column counts the movies carrying that genre;
# sort so the most common genres come first.
genre_counts = movies_df[genre_names].sum(axis="index").sort_values(ascending=False)
print("Number of movies per genre:", genre_counts, sep="\n")

Number of movies per genre:
Drama          725
Comedy         505
Action         251
Thriller       251
Romance        247
Adventure      135
Children's     122
Crime          109
Sci-Fi         101
Horror          92
War             71
Mystery         61
Musical         56
Documentary     50
Animation       42
Western         27
Film-Noir       24
Fantasy         22
unknown          2
dtype: int64


In [14]:
# Summing the genre flags across each row gives that movie's genre count;
# the mean of those counts is the average number of genres per movie.
avg_genres_per_movie = movies_df[genre_names].sum(axis="columns").mean()
print(f"Average number of genres per movie: {avg_genres_per_movie:.2f}")

Average number of genres per movie: 1.72


## Part 6: Save Genre Vectors


In [None]:
# Save genre vectors to CSV in the current working directory.
# index=False leaves the DataFrame's row index out of the file.
genre_vectors.to_csv("movie_genre_vectors.csv", index=False)
print("Genre vectors saved to 'movie_genre_vectors.csv'")

## Part 7: Explore Movies by Genre

Example: find Sci-Fi movies (the cell below shows the first 10 matches).


In [15]:
# First 10 movies whose Sci-Fi flag is set, shown with their full genre vectors
scifi_movies = movies_df.loc[
    movies_df["Sci-Fi"] == 1,
    ["movie_id", "movie_title", *genre_names],
].head(10)
scifi_movies

Unnamed: 0,movie_id,movie_title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
6,7,Twelve Monkeys (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
37,38,"Net, The (1995)",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
38,39,Strange Days (1995),0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
49,50,Star Wars (1977),0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
61,62,Stargate (1994),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
81,82,Jurassic Park (1993),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
83,84,Robert A. Heinlein's The Puppet Masters (1994),0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
88,89,Blade Runner (1982),0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
95,96,Terminator 2: Judgment Day (1991),0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
100,101,Heavy Metal (1981),0,1,1,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


## Part 8: Your Experiments Below

Use the cells below to test new code as you learn new topics.


In [23]:
# Your code here

# Pick one movie at random to use as the query. The seed is fixed so that
# Restart & Run All selects the same movie every time; change the value to
# explore a different query.
query = genre_vectors.sample(n=1, random_state=42).iloc[0]
print(f"Randomly selected movie: {query}")


def compute_genre_similarity(query_vector, genre_vectors):
    """
    Rank every movie in `genre_vectors` by genre cosine similarity to a query.

    Parameters
    ----------
    query_vector : pandas.Series
        One movie row; must contain every column listed in the notebook-level
        `genre_names`.
    genre_vectors : pandas.DataFrame
        Candidate movies with "movie_id", "movie_title" and the `genre_names`
        columns.

    Returns
    -------
    list of (movie_id, movie_title, similarity) tuples, sorted by similarity
    in descending order. Ties keep their original row order. The query itself
    is NOT removed, so callers must filter it out if it is among the candidates.
    """
    # Cast to float up front: rows taken from a frame that also holds the title
    # column are object dtype, which makes the numeric work slow and fragile.
    query_genres = query_vector[genre_names].to_numpy(dtype=float)
    genre_matrix = genre_vectors[genre_names].to_numpy(dtype=float)

    # One matrix-vector product replaces the per-row loop. The small epsilon
    # keeps an all-zero genre vector from causing a division by zero.
    denominators = (
        np.linalg.norm(query_genres) * np.linalg.norm(genre_matrix, axis=1) + 1e-10
    )
    scores = (genre_matrix @ query_genres) / denominators

    ranked = zip(
        genre_vectors["movie_id"].to_numpy(),
        genre_vectors["movie_title"].to_numpy(),
        scores,
    )
    return sorted(ranked, key=lambda entry: entry[2], reverse=True)


similar_movies = compute_genre_similarity(query, genre_vectors)

# Remove the query by movie_id rather than assuming it is ranked first.
# Movies with an identical genre vector tie with the query, so the query is
# not guaranteed to be at position 0 and slicing [1:6] could list the query
# itself while dropping a genuine match.
top_similar = [
    entry for entry in similar_movies if entry[0] != query["movie_id"]
][:5]

print("Top 5 similar movies based on genre:")
for movie_id, movie_title, sim in top_similar:
    print(
        f"Movie ID: {movie_id}, Title: {movie_title}, Similarity: {sim:.4f}, Genres: {movies_df[movies_df['movie_id'] == movie_id][genre_names].values}"
    )

Randomly selected genre: movie_id                       1091
movie_title    Pete's Dragon (1977)
unknown                           0
Action                            0
Adventure                         1
Animation                         1
Children's                        1
Comedy                            0
Crime                             0
Documentary                       0
Drama                             0
Fantasy                           0
Film-Noir                         0
Horror                            0
Musical                           1
Mystery                           0
Romance                           0
Sci-Fi                            0
Thriller                          0
War                               0
Western                           0
Name: 1090, dtype: object
Top 5 similar movies based on genre:
Movie ID: 993, Title: Hercules (1997), Similarity: 0.8944, Genres: [[0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0]]
Movie ID: 71, Title: Lion King, The (1994), Sim

In [27]:
# `cosine_similarity` is imported in the setup cell at the top of the notebook,
# so every dependency is declared in one place.

# Genre flag matrix: one row per movie, one column per genre.
genre_matrix = genre_vectors[genre_names].values
print(genre_matrix)

# Movie-by-movie cosine similarity; entry [i, j] compares the genre rows of
# movies i and j (by row position).
cosine_sim = cosine_similarity(genre_matrix)
print(cosine_sim)


[[0 0 0 ... 0 0 0]
 [0 1 1 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[1.         0.         0.         ... 0.         0.57735027 0.        ]
 [0.         1.         0.57735027 ... 0.         0.         0.        ]
 [0.         0.57735027 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.70710678]
 [0.57735027 0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.70710678 0.         1.        ]]


In [32]:
# Toy check of cosine_similarity on hand-made vectors.
# x holds two row vectors; y is a single vector wrapped in a list so that it
# is passed as a 2-D (1 x 3) input.
x = [[1, 1, 1], [0, 1, 0]]
y = [0, 1, 0]
# One similarity per row of x: [1, 1, 1] vs y is 1/sqrt(3), [0, 1, 0] vs y is 1.
sim = cosine_similarity(x, [y])
print(sim)


[[0.57735027]
 [1.        ]]


In [30]:
# Re-display the first rows of the genre-vector table built in Part 4
genre_vectors.head()

Unnamed: 0,movie_id,movie_title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [35]:
# Raw values of the first row: movie_id, movie_title, then the genre flags.
# The array is object dtype because the row mixes integers with the title string.
genre_vectors.iloc[0].values

array([np.int64(1), 'Toy Story (1995)', np.int64(0), np.int64(0),
       np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(0),
       np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0),
       np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0),
       np.int64(0), np.int64(0)], dtype=object)

In [44]:
# Row position of the query movie. This is deliberately a position, not the
# index label from `query.name`: np.argsort below returns positions, and the
# two only coincide while genre_vectors has its default 0..n-1 index.
query_idx = 0
query = genre_vectors.iloc[query_idx]

# Similarity of the query row to every movie. The query is sliced as a
# one-row frame so that cosine_similarity receives numeric 2-D input.
sim = cosine_similarity(
    genre_vectors[genre_names].iloc[[query_idx]].values,
    genre_vectors[genre_names].values,
)
sim_scores = sim.ravel()
print("*" * 10)
print(sim, "sim")
print(sim_scores, "sim_scores")
print("*" * 10)

# sort row positions by similarity (descending) and exclude the query itself
sorted_idx = np.argsort(sim_scores)[::-1]
sorted_idx = sorted_idx[sorted_idx != query_idx]

top_n = 10
top_idx = sorted_idx[:top_n]

for rank, idx in enumerate(top_idx, start=1):
    row = genre_vectors.iloc[idx]
    print(
        f"{rank}. {row['movie_title']} (movie_id: {row['movie_id']}) — similarity: {sim_scores[idx]:.4f}"
    )

**********
[[1.         0.         0.         ... 0.         0.57735027 0.        ]] sim
[1.         0.         0.         ... 0.         0.57735027 0.        ] sim_scores
**********
1. Aladdin and the King of Thieves (1996) (movie_id: 422) — similarity: 1.0000
2. Aladdin (1992) (movie_id: 95) — similarity: 0.8660
3. Goofy Movie, A (1995) (movie_id: 1219) — similarity: 0.8660
4. Jungle2Jungle (1997) (movie_id: 243) — similarity: 0.8165
5. Air Bud (1997) (movie_id: 261) — similarity: 0.8165
6. George of the Jungle (1997) (movie_id: 259) — similarity: 0.8165
7. Santa Clause, The (1994) (movie_id: 63) — similarity: 0.8165
8. Little Big League (1994) (movie_id: 1032) — similarity: 0.8165
9. Heavyweights (1994) (movie_id: 377) — similarity: 0.8165
10. Flintstones, The (1994) (movie_id: 383) — similarity: 0.8165


In [42]:
## movie-by-movie cosine similarity matrix, kept in `cosine_sim` for later cells
cosine_sim = cosine_similarity(genre_vectors[genre_names].to_numpy())

## the value displayed by this cell is the raw genre matrix that was fed in,
## not the similarity matrix
genre_vectors[genre_names].to_numpy()


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(1682, 19))

In [45]:
def recommend_movie(movie_id: int, k: int = 10) -> pd.DataFrame:
    """
    Return the top-k movies most similar to `movie_id`.

    Similarities are read from the precomputed notebook-level `cosine_sim`.
    Row/column i of that matrix is matched to the i-th row of `genre_vectors`
    by position, so the function works whatever index `genre_vectors` carries.

    Parameters
    ----------
    movie_id : int
        Value to look up in genre_vectors["movie_id"].
    k : int, default 10
        Maximum number of recommendations; clamped to [0, n_movies - 1].

    Returns
    -------
    pandas.DataFrame
        Columns movie_id, movie_title, similarity, ordered by descending
        similarity, with a fresh 0..k-1 index.

    Raises
    ------
    ValueError
        If `movie_id` is not present in `genre_vectors`.
    """
    # Locate the movie by row position: cosine_sim is a plain ndarray and can
    # only be addressed positionally, never by DataFrame index label.
    positions = np.flatnonzero(genre_vectors["movie_id"].to_numpy() == movie_id)
    if positions.size == 0:
        raise ValueError(f"movie_id {movie_id} not found")
    idx = int(positions[0])

    sim_row = cosine_sim[idx].copy()  # copy so the shared matrix is not modified
    sim_row[idx] = -1.0  # exclude the movie itself

    # Clamp k: a negative value would otherwise slice from the wrong end.
    k = max(0, min(k, len(sim_row) - 1))
    top_idx = np.argsort(sim_row)[::-1][:k]

    # .iloc, not .loc: top_idx holds row positions.
    res = genre_vectors.iloc[top_idx][["movie_id", "movie_title"]].copy()
    res["similarity"] = sim_row[top_idx]
    return res.reset_index(drop=True)


# Example usage: the 5 movies whose genre vectors are closest to movie_id 1
recommend_movie(1, 5)


Unnamed: 0,movie_id,movie_title,similarity
0,422,Aladdin and the King of Thieves (1996),1.0
1,1219,"Goofy Movie, A (1995)",0.866025
2,95,Aladdin (1992),0.866025
3,261,Air Bud (1997),0.816497
4,404,Pinocchio (1940),0.816497


In [None]:
def content_filter(movie_id: int, k: int = 10, recompute_genres: bool = True):
    """
    Refresh the notebook-level similarity state, then recommend for `movie_id`.

    Side effects: rebinds the globals `cosine_sim` (always) and `genre_vectors`
    (only when `recompute_genres` is true), because `recommend_movie()` reads
    both from module scope.

    Parameters
    ----------
    movie_id : int
        Movie to find neighbours for; forwarded to `recommend_movie()`.
    k : int, default 10
        Number of recommendations; forwarded to `recommend_movie()`.
    recompute_genres : bool, default True
        When true, rebuild `genre_vectors` from `movies_df`; when false, reuse
        whatever `genre_vectors` currently holds.

    Returns
    -------
    Whatever `recommend_movie(movie_id, k)` returns.
    """
    global genre_vectors, cosine_sim

    if recompute_genres:
        # Fresh copy, so later edits to genre_vectors never touch movies_df.
        wanted_columns = ["movie_id", "movie_title", *genre_names]
        genre_vectors = movies_df[wanted_columns].copy()

    # The similarity matrix is always rebuilt, so it matches genre_vectors.
    cosine_sim = cosine_similarity(genre_vectors[genre_names].values)

    return recommend_movie(movie_id, k)


# Example usage:
# content_filter(1, 5)

# User-Based Collaborative Filtering

## Part 1: Create User-Item Matrix


In [46]:
# Reminder of the ratings layout before pivoting: user_id, item_id, rating, timestamp
ratings_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [47]:
# Reshape the ratings into a user x item grid. Cells with no rating become
# NaN in the pivot and are then filled with 0.
user_item_matrix = pd.pivot(
    ratings_df,
    index="user_id",
    columns="item_id",
    values="rating",
).fillna(0)
user_item_matrix.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Part 2: Compute User Similarity Matrix


In [65]:
## Compare two users: the rows at positions 0 and 6 of the user-item matrix.
## NOTE(review): each vector spans every item, with 0 standing in for
## "not rated" (from the fillna(0) used to build the matrix). The similarity
## is therefore NOT restricted to movies both users rated. Confirm whether an
## overlap-only comparison was intended here.
user1 = user_item_matrix.iloc[0].to_numpy()
user2 = user_item_matrix.iloc[6].to_numpy()

## cosine_similarity takes 2-D input, so each user is reshaped to a single
## row; [0] unwraps the resulting 1 x 1 matrix to a length-1 array
cosine_similarity(user1.reshape(1, -1), user2.reshape(1, -1))[0]


array([0.4403668])

In [64]:
## user-user similarity matrix: cosine similarity between every pair of rows
## of the user-item matrix; entry [i, j] compares the users at positions i and j
user_matrix = user_item_matrix.to_numpy()
user_cosine_sim = cosine_similarity(user_matrix)
user_cosine_sim

array([[1.        , 0.16693098, 0.04745954, ..., 0.14861694, 0.17950788,
        0.39817474],
       [0.16693098, 1.        , 0.11059132, ..., 0.16148478, 0.17226781,
        0.10579788],
       [0.04745954, 0.11059132, 1.        , ..., 0.10124256, 0.13341615,
        0.02655587],
       ...,
       [0.14861694, 0.16148478, 0.10124256, ..., 1.        , 0.1016418 ,
        0.09511958],
       [0.17950788, 0.17226781, 0.13341615, ..., 0.1016418 , 1.        ,
        0.18246466],
       [0.39817474, 0.10579788, 0.02655587, ..., 0.09511958, 0.18246466,
        1.        ]], shape=(943, 943))

## Part 3: Find top k neighbors for a target user


In [66]:
# Target user = the row at position 0 of the user-user similarity matrix.

# Work on a copy so the shared matrix keeps its 1.0 diagonal.
sim_row = user_cosine_sim[0].copy()
sim_row[0] = -1.0  # lowest possible cosine value, so the target ranks last
sorted_idx = np.flip(np.argsort(sim_row))  # positions, most similar first
top_k = 5
top_idx = sorted_idx[:top_k]
# Translate each row position back to its user_id through the matrix index.
for rank, idx in zip(range(1, len(top_idx) + 1), top_idx):
    print(
        f"{rank}. User ID: {user_item_matrix.index[idx]} — similarity: {sim_row[idx]:.4f}"
    )

1. User ID: 916 — similarity: 0.5691
2. User ID: 864 — similarity: 0.5475
3. User ID: 268 — similarity: 0.5421
4. User ID: 92 — similarity: 0.5405
5. User ID: 435 — similarity: 0.5387
