In [161]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from numpy import hstack
from sklearn.metrics.pairwise import cosine_similarity
import random
import joblib
import numpy as np

In [162]:
# Load the feature-engineered movie table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="movies_feature_engineered.csv")
df.head(n=5)

Unnamed: 0,movieId,popularity,runtime,vote_average,vote_count,Action,Adventure,Animation,Children,Comedy,...,MX,RU,SE,NZ,CH,IN,AT,NL,BE,BR
0,1,21.4021,81.0,7.969,18889.0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,3.0047,104.0,7.237,10783.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1.4596,101.0,6.5,398.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,4,1.612,127.0,6.3,173.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,2.0869,106.0,6.237,754.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [207]:
# Identify feature columns
numeric_features = ['popularity', 'runtime', 'vote_average', 'vote_count']

# Columns that label a row instead of describing the movie: the movie id and
# any unnamed index column read back from the CSV (it shows up as
# "Unnamed: 0" in df.head()). Without this exclusion that running row counter
# would be stacked into the feature matrix as if it were a movie attribute.
non_feature_columns = ['movieId'] + [
    col for col in df.columns if str(col).startswith('Unnamed')
]

# Every remaining column is used as-is as an indicator feature: the genre
# flags plus the two-letter columns (MX, RU, ... - presumably country codes).
genre_features = df.columns.difference(numeric_features + non_feature_columns)

# Standardize the numeric features with StandardScaler (zero mean, unit
# variance). The result is NOT confined to [0, 1]; negative values and values
# above 1 are expected.
scaler = StandardScaler()
scaled_numerics = scaler.fit_transform(df[numeric_features])

# Combine scaled numerics + indicator features column-wise; rows keep the
# order of df.
final_features = hstack([
    scaled_numerics,
    df[genre_features].values
])
final_features

array([[ 4.87546963, -0.96165432,  1.70808733, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.0198783 , -0.01333186,  0.84822861, ...,  1.        ,
         0.        ,  0.        ],
       [-0.38791705, -0.13702609, -0.01750346, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.677499  , -0.34318315,  0.34194568, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.83127254, -0.59057162,  1.88898657, ...,  0.        ,
         0.        ,  0.        ],
       [-0.72859548, -0.79672868,  0.1786665 , ...,  0.        ,
         0.        ,  0.        ]])

In [208]:
# Sanity check: (number of rows, number of feature columns) of the stacked matrix.
final_features.shape

(9742, 76)

In [209]:
# Cosine similarity of every feature row against every other feature row;
# entry [i, j] compares row i with row j of final_features.
similarity_matrix = cosine_similarity(X=final_features)

In [196]:
# Lookup tables between movieId values and their row positions in df.
movie_ids = df['movieId'].values
# position -> movieId
index_to_movie_id = dict(enumerate(movie_ids))
# movieId -> position (the inverse of the table above)
movie_id_to_index = {
    movie_id: position for position, movie_id in index_to_movie_id.items()
}

In [197]:
def recommend_similar_movies(movie_id, top_n=10, randomness_factor=2):
    """
    Recommend up to top_n movies similar to movie_id, with a bit of randomness.

    The candidate pool holds the movies with the highest similarity to
    movie_id (the movie itself excluded). top_n of them are then drawn at
    random from that pool, so the result comes back in random order, not
    sorted by score.

    Parameters
    ----------
    movie_id : key of movie_id_to_index
        The movie to find neighbours for.
    top_n : int
        Number of recommendations wanted. Fewer are returned when the dataset
        does not contain that many other movies.
    randomness_factor : int or float
        The pool contains top_n * randomness_factor candidates (truncated to
        an integer). Values below 1 are treated as 1, i.e. the pool is exactly
        the top_n best matches and only their order is random.

    Returns
    -------
    (similar_movie_ids, similar_scores) : tuple of two lists
        Movie ids and their similarity to movie_id rounded to 4 decimals,
        aligned element by element.

    Raises
    ------
    ValueError
        If movie_id is unknown or top_n is negative.
    """
    if movie_id not in movie_id_to_index:
        raise ValueError("Movie ID not found in the dataset.")
    if top_n < 0:
        raise ValueError("top_n must be non-negative.")

    idx = movie_id_to_index[movie_id]
    sim_scores = similarity_matrix[idx]

    # The pool must be an integer size and never smaller than top_n, otherwise
    # the slice below fails (float bound) or the draw cannot be satisfied.
    pool_size = max(top_n, int(top_n * randomness_factor))

    # Indices by descending similarity, with the query movie itself masked out.
    ranked_indices = sim_scores.argsort()[::-1]
    top_pool_indices = list(ranked_indices[ranked_indices != idx][:pool_size])

    # Clamp the draw to what is available: random.sample raises ValueError when
    # asked for more items than the pool contains.
    sample_size = min(top_n, len(top_pool_indices))
    selected_indices = random.sample(top_pool_indices, sample_size)

    # Extract movie IDs and actual similarity scores
    similar_movie_ids = [index_to_movie_id[i] for i in selected_indices]
    similar_scores = [round(float(sim_scores[i]), 4) for i in selected_indices]

    return similar_movie_ids, similar_scores


In [217]:
# Example: the 10 closest matches for movieId 3114; a factor of 1 makes the
# pool exactly the top 10, so only the order of the result is random.
recommend_similar_movies(movie_id=3114, top_n=10, randomness_factor=1)

([152081, 177765, 4886, 5218, 6377, 1, 50872, 4306, 115617, 68954],
 [0.964,
  0.9442,
  0.9899,
  0.9448,
  0.9503,
  0.9844,
  0.9743,
  0.9513,
  0.9529,
  0.9443])

In [218]:
# Persist the feature matrix and both lookup tables to the working directory.
np.save(file="final_features.npy", arr=final_features)
joblib.dump(value=index_to_movie_id, filename="index_to_movie_id.pkl")
joblib.dump(value=movie_id_to_index, filename="movie_id_to_index.pkl")

['movie_id_to_index.pkl']