In [None]:
import pandas as pd
import numpy as np  
import json
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer

In [None]:
import os
# Print the kernel's current working directory; the relative data paths used
# further down in this notebook are resolved against it.
print(os.getcwd())

In [None]:
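# Uncomment to switch into the project folder when the working directory
# printed above is not the one that contains the `data` directory:
# os.chdir('tmdb-recommender')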

In [None]:
# Load the two TMDB CSV exports from the local `data` directory.
# Forward-slash relative paths are used because backslash-separated paths only
# resolve on Windows; this also matches the 'data/...' paths used when the
# pickles are written at the end of the notebook.
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

In [None]:
# Display the (rows, columns) shape of both freshly loaded frames.
movies.shape, credits.shape

In [None]:
# Attach the credits columns to the movie metadata by joining on the title.
# NOTE(review): if titles are not unique in either frame this join multiplies
# the matching rows - confirm against the data.
movies = movies.merge(credits, how='inner', on='title')

In [None]:
# Narrow the merged frame to the identifier, the title and the metadata
# columns, then print a per-column summary.
selected_columns = ['movie_id', 'title', 'genres', 'keywords', 'overview', 'cast', 'crew']
movies = movies[selected_columns]
movies.info()

In [None]:
# Drop every row that has a missing value. The frame is rebound rather than
# mutated in place: `movies` is a column selection of the merged frame, and
# in-place edits on such a selection can raise pandas' SettingWithCopyWarning.
# The index is renumbered so that row labels equal row positions; the count
# vectors and similarity matrix built later are positional.
# NOTE(review): confirm that whatever loads the pickled frame expects a
# contiguous 0..n-1 index.
movies = movies.dropna().reset_index(drop=True)
# Sanity check: every per-column count of missing values should now be zero.
movies.isna().sum()

In [None]:
# Look at the raw `genres` value of the first row before parsing it.
movies['genres'].iloc[0]

In [None]:
def extract_names(genre_str):
    """Return the 'name' value of every entry in a JSON-encoded list of dicts.

    Parameters
    ----------
    genre_str : str
        JSON text that decodes to a list of objects, each carrying a 'name'
        key. The notebook applies this to both the genres and the keywords
        columns.

    Returns
    -------
    list
        The 'name' values in their original order.

    Raises
    ------
    json.JSONDecodeError
        If `genre_str` is not valid JSON.
    KeyError
        If an entry has no 'name' key.
    """
    names = []
    for entry in json.loads(genre_str):
        names.append(entry['name'])
    return names


In [None]:
# Replace each JSON string in `genres` with the list of names it contains,
# then show one random row as a spot check.
parsed_genres = movies['genres'].apply(extract_names)
movies['genres'] = parsed_genres
movies['genres'].sample(n=1)

In [None]:
# Replace each JSON string in `keywords` with the list of names it contains.
parsed_keywords = movies['keywords'].apply(extract_names)
movies['keywords'] = parsed_keywords

In [None]:
def extract_top3_cast(cast_str):
    """Return the performer names of the first three cast entries.

    Parameters
    ----------
    cast_str : str
        JSON text that decodes to a list of cast entries, each a dict with a
        'name' key (the performer).

    Returns
    -------
    list
        Up to three performer names, in the order they appear in the list;
        fewer when the list is shorter, and an empty list when it is empty.

    Raises
    ------
    json.JSONDecodeError
        If `cast_str` is not valid JSON.
    KeyError
        If one of the first three entries has no 'name' key.
    """
    cast_list = json.loads(cast_str)
    # Read 'name' (the performer), not 'character' (the role played): the cast
    # of a film is its actors, and this matches how the other extractors in
    # this notebook read the 'name' field.
    return [member['name'] for member in cast_list[:3]]

In [None]:
# Replace each JSON string in `cast` with the list produced by
# extract_top3_cast, then show one random row as a spot check.
top_cast = movies['cast'].apply(extract_top3_cast)
movies['cast'] = top_cast
movies['cast'].sample(n=1)

In [None]:
def director(dir_name):
    """Return the names of all crew members whose job is 'Director'.

    Parameters
    ----------
    dir_name : str
        Despite the parameter name, this is JSON text that decodes to a list
        of crew entries (dicts). Entries without a 'job' key are skipped.

    Returns
    -------
    list
        The 'name' of every entry whose 'job' equals 'Director', in the order
        they appear; empty when there is none.

    Raises
    ------
    json.JSONDecodeError
        If `dir_name` is not valid JSON.
    KeyError
        If a director entry has no 'name' key.
    """
    directors = []
    for member in json.loads(dir_name):
        if member.get('job') != 'Director':
            continue
        directors.append(member['name'])
    return directors

# Replace each JSON string in `crew` with the list of director names, then
# show one random row as a spot check.
director_names = movies['crew'].apply(director)
movies['crew'] = director_names
movies['crew'].sample(n=1)

In [None]:
# Remove the spaces inside every entry of the list-valued columns so that a
# multi-word entry becomes a single token.
for column in ('genres', 'cast', 'crew', 'keywords'):
    movies[column] = movies[column].apply(
        lambda entries: [entry.replace(" ", "") for entry in entries]
    )

In [None]:
# Concatenate the four per-movie lists into one combined `tags` list.
# NOTE(review): `overview` is kept in the frame but is not part of the tags -
# confirm that this is intended.
movies['tags'] = (
    movies['genres']
    + movies['cast']
    + movies['crew']
    + movies['keywords']
)

In [None]:
# Keep only the columns the recommender needs. An explicit copy is taken
# because later cells assign to new_df['tags']; writing into a bare column
# selection of `movies` raises pandas' SettingWithCopyWarning and leaves it
# ambiguous which frame is modified.
new_df = movies[['movie_id', 'title', 'tags']].copy()
new_df.head(1)

In [None]:
# Show the raw credits record of the first row titled 'Avatar'.
is_avatar = credits['title'] == 'Avatar'
credits[is_avatar].values[0]

In [None]:
# Collapse each list of tags into one space-separated string.
new_df['tags'] = movies['tags'].apply(" ".join)

In [None]:
# Lower-case every tag string.
new_df['tags'] = new_df['tags'].apply(str.lower)

## Vectorization

In [None]:
# Single shared NLTK Porter stemmer instance; the `stem` helper defined in
# this notebook calls its `.stem()` method on every word.
ps = PorterStemmer()

In [None]:
def stem(text):
    """Stem every whitespace-separated word of `text` with the shared stemmer.

    Parameters
    ----------
    text : str
        Words separated by arbitrary whitespace.

    Returns
    -------
    str
        The results of `ps.stem` for each word, joined by single spaces; an
        empty string when `text` contains no words.
    """
    stemmed_words = []
    for word in text.split():
        stemmed_words.append(ps.stem(word))
    return " ".join(stemmed_words)

In [None]:
# Run the stemmer over the whole tag corpus and display the result.
stemmed_tags = new_df['tags'].apply(stem)
new_df['tags'] = stemmed_tags
new_df['tags']

In [None]:
# Bag-of-words vectorizer: English stop words are removed and the vocabulary
# is capped at 5000 features.
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)

In [None]:
# Fit the vectorizer on the tag strings and convert the result to a dense
# array; the final expression displays the number of rows.
tag_counts = cv.fit_transform(new_df['tags'])
vectors = tag_counts.toarray()
len(vectors)

In [None]:
# Inspect the learned vocabulary. Only a leading slice is displayed: with
# max_features=5000 a print-per-word loop would write thousands of lines into
# the cell output and bury the rest of the notebook.
cv.get_feature_names_out()[:50]

In [None]:
# Pairwise cosine similarity between all rows of `vectors`; entry [i, j]
# compares row i with row j.
similarity = cosine_similarity(vectors)

In [None]:
# Persist the similarity matrix and the tag frame. Each file is opened in a
# `with` block so that it is flushed and closed even if pickling raises; a
# bare `open(...)` passed straight to pickle.dump is never closed explicitly.
with open('data/similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)
with open('data/movies.pkl', 'wb') as movies_file:
    pickle.dump(new_df, movies_file)