In [21]:
# Imports
import math
import os
import pickle
import re

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack

from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Read dataset (extract only needed features)
df = pd.read_csv("resources/imdb_filtered.csv", usecols=[
    "title","vote_average","overview","genres","keywords","popularity"
])

# Remove rows without a title. dropna() returns a new frame, so the result
# must be assigned back for the rows to actually be removed.
# The index is reset so that index labels equal row positions: the feature
# matrix built from this frame is addressed by row position, while title
# lookups return index labels.
df = df.dropna(subset=["title"]).reset_index(drop=True)

# Analyze format of dataset
print(df.head())
# info() writes its report itself and returns None, so it is not wrapped in print()
df.info()

             title  vote_average  \
0        Inception         8.364   
1     Interstellar         8.417   
2  The Dark Knight         8.512   
3           Avatar         7.573   
4     The Avengers         7.710   

                                            overview  popularity  \
0  Cobb, a skilled thief who commits corporate es...      83.952   
1  The adventures of a group of explorers who mak...     140.241   
2  Batman raises the stakes in his war on crime. ...     130.643   
3  In the 22nd century, a paraplegic Marine is di...      79.932   
4  When an unexpected enemy emerges and threatens...      98.082   

                                        genres  \
0           Action, Science Fiction, Adventure   
1            Adventure, Drama, Science Fiction   
2               Drama, Action, Crime, Thriller   
3  Action, Adventure, Fantasy, Science Fiction   
4           Science Fiction, Action, Adventure   

                                            keywords  
0  rescue, mission

In [23]:
# Work on an independent copy of the DataFrame before assigning columns.
# NOTE(review): the original author found this copy necessary without recording
# why — presumably it avoids pandas' SettingWithCopyWarning; confirm before removing.
df = df.copy()


## ---- GENRES ----

def _split_genres(value):
    """Return the list of genre names stored in one ``genres`` cell.

    Parameters
    ----------
    value : str, list, tuple or other
        A comma-separated string such as ``"Action, Drama"``, an already
        parsed list/tuple of names, or a missing value (NaN / None).

    Returns
    -------
    list of str
        Genre names with surrounding whitespace removed. Blank entries are
        dropped, so a missing or empty cell yields ``[]``.
    """
    if isinstance(value, (list, tuple)):
        # Already parsed (e.g. this cell was run twice): keep the existing names
        return [name for name in value if name]
    if isinstance(value, str):
        # Split on the comma alone and strip each part, so both "A, B" and "A,B" work
        return [name.strip() for name in value.split(',') if name.strip()]
    # NaN / unexpected types carry no genre information
    return []


# Format genres column: one list of genre names per movie.
# An empty cell becomes [] rather than [''], so no "empty string" genre column
# is created by the encoder below.
df['genres'] = df['genres'].apply(_split_genres)

# Apply One-Hot Encoding: one binary column per distinct genre
mlb_genres = MultiLabelBinarizer()
genre_features = mlb_genres.fit_transform(df['genres'])

# Convert to a sparse matrix
genre_df_sparse = csr_matrix(genre_features)


In [24]:
## ---- KEYWORDS ----

def _flatten_keywords(raw):
    """Turn one comma-separated keywords cell into space-separated text."""
    if not isinstance(raw, str):
        return ''
    # Every comma becomes a single space; outer whitespace is trimmed
    return raw.replace(',', ' ').strip()


# Missing keywords become '' and every cell is flattened to plain text
df['keywords'] = df['keywords'].fillna('').map(_flatten_keywords)

# TF-IDF over the keyword text, vocabulary capped at 5000 terms
keyword_vectorizer = TfidfVectorizer(max_features=5000)
keyword_features = keyword_vectorizer.fit_transform(df['keywords'])

# Keep the keyword block as a CSR matrix
keyword_df_sparse = csr_matrix(keyword_features)

In [25]:
## ---- VOTE AVERAGE (STANDARDIZED) ----

# Extract vote_average as a 2D array (a single column)
vote_df = df['vote_average'].values.reshape(-1, 1)

# Scale the numeric feature.
# StandardScaler disregards NaN while fitting and keeps it as NaN on transform,
# so the fitted statistics come from the ratings that are actually present.
scaler = StandardScaler()
vote_scaled = scaler.fit_transform(vote_df)

# Replace missing ratings with 0.0, which is the mean of the standardized column,
# so a movie without a rating is treated as average instead of putting NaN into
# the feature matrix. Has no effect when the column contains no NaN.
vote_scaled = np.where(np.isnan(vote_scaled), 0.0, vote_scaled)

# Convert to sparse matrix
vote_df_sparse = csr_matrix(vote_scaled)

In [None]:
## ---- WEIGHTINGS + FEATURE STACK ----

# Multiplier applied to each feature group before stacking; a larger value
# gives that group more influence on the combined feature vector
weights = {
    "genres": 1.0,
    "keywords": 2.0,
    "vote_average": 0.3,
}

# Combine all feature groups side by side into one sparse matrix
# (columns in order: genres, vote_average, keywords).
# format="csr" pins the result type instead of relying on hstack's default,
# because single rows of this matrix are sliced out later for neighbour queries.
df_features = hstack([
    genre_df_sparse.multiply(weights["genres"]),
    vote_df_sparse.multiply(weights["vote_average"]),
    keyword_df_sparse.multiply(weights["keywords"])
], format="csr")

In [27]:
# Fit a cosine-distance nearest-neighbour model on the stacked feature rows
# (5 neighbours per query unless a call asks for a different number).
# fit() returns the estimator itself, so construction and fitting are chained.
movie_knn = NearestNeighbors(metric='cosine', n_neighbors=5).fit(df_features)
movie_knn

In [28]:
## ---- EXPORTING MODEL TO DISK ----

# Destination of the pickled bundle
model_path = "models/movie_recommender.pkl"

# open() does not create missing folders, so make sure the target directory
# exists (no-op when it is already there)
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Bundle the fitted model together with the feature matrix, the movie table and
# the fitted preprocessors, and write them as a single pickle file
with open(model_path, "wb") as f:
    pickle.dump({
        "model": movie_knn,
        "features": df_features,
        "df": df,
        "scaler": scaler,
        "genre_encoder": mlb_genres,
        "tfidf_keywords": keyword_vectorizer
    }, f)

print("Movie model built successfully as \"movie_recommender.pkl\"")

Movie model built successfully as "movie_recommender.pkl"


In [None]:
### DEBUGGING TESTING
# Smoke test: print the titles of the movies closest to one known movie.
movie_name = "Star Wars: The Force Awakens"
n_recommendations = 20

# Row positions (not index labels) of every exact title match. df_features and
# df.iloc are both addressed by position, so a label would point at the wrong
# row whenever the DataFrame index is not 0..n-1.
title_matches = np.flatnonzero((df['title'] == movie_name).to_numpy())

# Check if movie exists in database
if title_matches.size == 0:
    print(f"Movie: {movie_name} not found in dataset")

else:

    # Use the first row when several movies share the same title
    movie_index = int(title_matches[0])

    # Ask for one extra neighbour because the query row is itself part of the
    # fitted data, but never for more neighbours than there are rows
    n_query = min(n_recommendations + 1, df_features.shape[0])

    # Use KNN model to find similar movies
    distances, indices = movie_knn.kneighbors(df_features[movie_index], n_neighbors=n_query)

    # Remove the query movie by position instead of assuming it is returned
    # first: rows with identical features tie at the same distance and may be
    # listed ahead of it
    neighbour_positions = [pos for pos in indices[0] if pos != movie_index][:n_recommendations]

    # Output recommended movies
    recommended_movies = df.iloc[neighbour_positions]['title'].tolist()
    print(recommended_movies)
