In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from numpy import hstack
from sklearn.metrics.pairwise import cosine_similarity
import random
import joblib
import numpy as np

In [2]:
# Load the pre-engineered movie table from the working directory. Later cells
# read 'movieId' and the four numeric columns ('popularity', 'runtime',
# 'vote_average', 'vote_count') from it and treat every other column as a
# genre feature.
df = pd.read_csv("movies_feature_engineered.csv")
# Preview the first rows as the cell's displayed output.
df.head()

Unnamed: 0,movieId,popularity,runtime,vote_average,vote_count,Action,Adventure,Animation,Children,Comedy,...,Music,Musical,Mystery,Romance,Sci-Fi,Science Fiction,TV Movie,Thriller,War,Western
0,1,21.4021,81.0,7.969,18889.0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,3.0047,104.0,7.237,10783.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1.4596,101.0,6.5,398.0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
3,4,1.612,127.0,6.3,173.0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,5,2.0869,106.0,6.237,754.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Split the columns into the continuous attributes (rescaled below) and
# everything else except the identifier, which is passed through unscaled.
numeric_features = ['popularity', 'runtime', 'vote_average', 'vote_count']
non_genre_columns = numeric_features + ['movieId']
# NOTE(review): difference() keeps *every* remaining column (sorted by name),
# so any stray non-genre column in the CSV would silently become a feature.
# Confirm the file carries only genre flags besides the columns listed above.
genre_features = df.columns.difference(non_genre_columns)

# Fit a min-max scaler on the numeric columns, then map them into [0, 1].
scaler = MinMaxScaler()
scaler.fit(df[numeric_features])
scaled_numerics = scaler.transform(df[numeric_features])

# Feature matrix: scaled numeric columns first, genre columns after them.
genre_matrix = df[genre_features].values
final_features = hstack((scaled_numerics, genre_matrix))
final_features

array([[0.28069888, 0.13597246, 0.89128733, ..., 0.        , 0.        ,
        0.        ],
       [0.03936073, 0.17555938, 0.80941729, ..., 0.        , 0.        ,
        0.        ],
       [0.01909202, 0.17039587, 0.72698803, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.00469889, 0.16179002, 0.76121239, ..., 0.        , 0.        ,
        0.        ],
       [0.07968957, 0.15146299, 0.90851135, ..., 0.        , 0.        ,
        0.        ],
       [0.00215923, 0.14285714, 0.74566603, ..., 0.        , 0.        ,
        0.        ]])

In [4]:
# Compute pairwise cosine similarity between every pair of rows of
# final_features. Row i of the result holds the scores of movie i against all
# movies (including itself); recommend_similar_movies indexes it by row.
# NOTE(review): the result is a full n x n array, so memory grows
# quadratically with the number of movies — confirm this fits for the dataset.
similarity_matrix = cosine_similarity(final_features)

In [5]:
# Row position <-> movieId lookups. Positions follow the row order of df,
# which is also the row order of final_features and similarity_matrix.
movie_ids = df['movieId'].values
index_to_movie_id = dict(enumerate(movie_ids))
# Invert the positional map; if a movieId repeated, the last position wins.
movie_id_to_index = {
    movie_id: position for position, movie_id in index_to_movie_id.items()
}

In [8]:
def recommend_similar_movies(movie_id, top_n=10, randomness_factor=2, seed=None):
    """
    Recommend up to top_n movies similar to movie_id, with a bit of randomness.

    The candidates are the ``top_n * randomness_factor`` most similar movies
    (the query movie itself excluded); ``top_n`` of them are then drawn at
    random without replacement.

    Parameters
    ----------
    movie_id : key of ``movie_id_to_index``
        Movie to find neighbours for.
    top_n : int, default 10
        Maximum number of recommendations to return.
    randomness_factor : int or float, default 2
        Multiplier applied to ``top_n`` to size the candidate pool. Values
        below 1 are treated as 1 (the pool is never smaller than ``top_n``),
        and fractional pool sizes are truncated.
    seed : optional
        When given, sampling uses a private ``random.Random(seed)`` so the
        result is reproducible; when None the global ``random`` state is used.

    Returns
    -------
    (similar_movie_ids, similar_scores) : tuple of two lists
        Movie ids and their similarity to ``movie_id`` rounded to 4 decimals,
        in the (random) order they were drawn. Fewer than ``top_n`` items are
        returned when the dataset does not contain that many other movies.

    Raises
    ------
    ValueError
        If ``movie_id`` is unknown or ``top_n`` is negative.
    """
    if movie_id not in movie_id_to_index:
        raise ValueError("Movie ID not found in the dataset.")
    if top_n < 0:
        raise ValueError("top_n must be non-negative.")

    idx = movie_id_to_index[movie_id]
    sim_scores = similarity_matrix[idx]

    # Never let the pool drop below top_n, and keep it an int so it can be
    # used as a slice bound even when randomness_factor is fractional.
    pool_size = max(top_n, int(top_n * randomness_factor))

    # Only the first pool_size + 1 ranked entries can matter: at most one of
    # them is the query movie itself, which is filtered out below.
    ranked = sim_scores.argsort()[::-1][:pool_size + 1]
    top_pool_indices = [i for i in ranked if i != idx][:pool_size]

    # random.sample raises if asked for more items than the pool holds, so
    # clamp to what is actually available (small datasets).
    sample_size = min(top_n, len(top_pool_indices))
    rng = random if seed is None else random.Random(seed)
    selected_indices = rng.sample(top_pool_indices, sample_size)

    # Extract movie IDs and the matching similarity scores as plain floats.
    similar_movie_ids = [index_to_movie_id[i] for i in selected_indices]
    similar_scores = [round(float(sim_scores[i]), 4) for i in selected_indices]

    return similar_movie_ids, similar_scores


In [9]:
# Example: ten recommendations for movieId 1. The selection is drawn with
# random.sample and no seed is set in this notebook, so re-running the cell
# can return a different set of movies in a different order.
recommend_similar_movies(1,10)

([87876, 135887, 2355, 97913, 69644, 45517, 86298, 170957, 157296, 62999],
 [0.9872,
  0.9887,
  0.9916,
  0.9942,
  0.9892,
  0.9963,
  0.9857,
  0.9857,
  0.9923,
  0.9861])

In [10]:
# Persist the arrays in NumPy's .npy format, written to the working directory.
np.save(file="similarity_matrix.npy", arr=similarity_matrix)
np.save(file="final_features.npy", arr=final_features)

# Persist both lookup dictionaries with joblib. The final dump stays a bare
# expression so its return value remains the cell's displayed output.
joblib.dump(value=index_to_movie_id, filename="index_to_movie_id.pkl")
joblib.dump(value=movie_id_to_index, filename="movie_id_to_index.pkl")

['movie_id_to_index.pkl']