In [1]:
import pandas as pd
import numpy as np  
import json
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer

In [2]:
import os
print(os.getcwd())

d:\21_machine_learning_projects\tmdb-recommender


In [3]:
# os.chdir('tmdb-recommender')

In [4]:
movies = pd.read_csv(r'.\data\tmdb_5000_movies.csv')
credits = pd.read_csv(r'.\data\tmdb_5000_credits.csv')  

In [5]:
movies.shape, credits.shape

((4803, 20), (4803, 4))

In [6]:
movies = movies.merge(credits, on='title')

In [7]:
# 'crew' in movies.columns
movies = movies[['movie_id', 'title', 'genres', 'keywords', 'overview', 'cast', 'crew']]
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   genres    4809 non-null   object
 3   keywords  4809 non-null   object
 4   overview  4806 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 263.1+ KB


In [8]:
movies.dropna(inplace=True)
movies.isna().sum()

movie_id    0
title       0
genres      0
keywords    0
overview    0
cast        0
crew        0
dtype: int64

In [9]:
movies.genres.iloc[0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [10]:
def extract_names(genre_str):
  genres = json.loads(genre_str)
  return [genre['name'] for genre in genres]


In [11]:
movies['genres'] = movies['genres'].apply(extract_names)
movies['genres'].sample()

207    [Action, Adventure, Science Fiction]
Name: genres, dtype: object

In [12]:
movies['keywords'] = movies['keywords'].apply(extract_names)

In [13]:
def extract_top3_cast(cast_str):
    cast_list = json.loads(cast_str)
    top3_cast = [cast['character'] for cast in cast_list[:3]]
    return top3_cast

In [14]:
movies['cast'] = movies['cast'].apply(extract_top3_cast)
movies['cast'].sample()

1627    [Lt. Parker Barnes, SID 6.7, Madison Carter]
Name: cast, dtype: object

In [15]:
def director(dir_name):
    crew_list = json.loads(dir_name)  # Convert string to list of dictionaries
    directors = [member['name'] for member in crew_list if member.get('job') == 'Director']  # Extract names of directors
    return directors

movies['crew'] = movies['crew'].apply(director)
movies['crew'].sample()

670    [Sydney Pollack]
Name: crew, dtype: object

In [16]:
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])

In [17]:
movies['tags'] = movies['genres'] + movies['cast'] + movies['crew'] + movies['keywords']

In [18]:
new_df = movies[['movie_id', 'title', 'tags']]
new_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction, J..."


In [19]:
credits[credits['title'] == 'Avatar'].values[0]

array([19995, 'Avatar',
       '[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48

In [20]:
new_df['tags'] = movies['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = movies['tags'].apply(lambda x: " ".join(x))


In [21]:
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


vectorization

In [22]:
ps = PorterStemmer()

In [23]:
def stem(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [24]:
# now apply this function to our corpus of text
new_df['tags'] = new_df['tags'].apply(stem)
new_df['tags']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


0       action adventur fantasi sciencefict jakesulli ...
1       adventur fantasi action captainjacksparrow wil...
2       action adventur crime jamesbond blofeld madele...
3       action crime drama thriller brucewayne/batman ...
4       action adventur sciencefict johncart dejahthor...
                              ...                        
4804    action crime thriller elmariachi bigotón mauri...
4805          comedi romanc buzzi linda marsha edwardburn
4806    comedi drama romanc tvmovi olivero’tool shanem...
4807                         sam amanda donald danielhsia
4808    documentari herself himself himself brianherzl...
Name: tags, Length: 4806, dtype: object

In [25]:
cv = CountVectorizer(max_features=5000, stop_words='english')

In [26]:
vectors = cv.fit_transform(new_df['tags']).toarray()
len(vectors)

4806

In [27]:
for word in cv.get_feature_names_out():
    print(word)

11
16thcenturi
17thcenturi
18thcenturi
1910
1930
1940
1950
1960
1970
1980
1990
1995
19thcenturi
1stlt
21stcenturi
3d
aaron
aaronseltz
abbey
abbi
aborigin
abrahamlincoln
abram
absurd
abus
abusivehusband
accid
account
action
actionhero
activist
actor
actress
adam
adammckay
adamshankman
adapt
addict
adolesc
adolfhitl
adopt
adoptedchild
adoptivefath
adrianlyn
adult
adultanim
adulteri
advanc
adventur
adversari
advertisingexecut
advertisingexpert
advic
affair
affect
afghanistan
africa
africanamerican
aftercreditssting
afterlif
age
agediffer
agent
aggressionbyanim
aid
airforc
airplan
airplanecrash
airport
al
alanpark
alaska
albert
alberthugh
alcatraz
alcohol
alcoholabus
alejandroamenábar
alejandrogonzáleziñárritu
alex
alexanderpayn
alexandreaja
alexcross
alexkendrick
alexproya
alfonsocuarón
alfredhitchcock
ali
alic
alien
alienabduct
alienattack
aliencontact
alieninfect
alieninvas
alienlife
alienparasit
alienphenomenon
alienplanet
alienrac
alik
alison
allanquatermain
allegori
allenhugh
alli
al

In [28]:
similarity = cosine_similarity(vectors)

In [29]:
pickle.dump(similarity, open('data/similarity.pkl', 'wb'))
pickle.dump(new_df, open('data/movies.pkl', 'wb'))

In [30]:
os.getcwd()

'd:\\21_machine_learning_projects\\tmdb-recommender'

In [31]:
# print("API Key:", os.getenv('OPENROUTER_API_KEY'))

In [None]:
import os 

: 