In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
from multiprocessing import Pool, cpu_count

In [5]:
rated_movies = pd.read_csv('filmes.csv')
movies = pd.read_csv('TMDB_all_movies.csv')
info_movies = pd.read_csv('TMDB_movie_dataset_v11.csv')

In [6]:
movies = pd.merge(movies, info_movies, on = 'id', how = 'inner')

In [11]:
movies.head()

Unnamed: 0,id,title_x,director,runtime_x,release_date_x,adult,genres_y,keywords,overview_y,poster_path_y
0,2,Ariel,Aki Kaurismäki,73.0,1988-10-21,False,"Drama, Comedy, Romance","prison, underdog, helsinki, finland, factory w...",After the coal mine he works at closes and his...,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg
1,3,Shadows in Paradise,Aki Kaurismäki,74.0,1986-10-17,False,"Drama, Comedy, Romance","helsinki, finland, salesclerk, garbage","Nikander, a rubbish collector and would-be ent...",/nj01hspawPof0mJmlgfjuLyJuRN.jpg
2,5,Four Rooms,"Quentin Tarantino, Robert Rodriguez, Allison A...",98.0,1995-12-09,False,Comedy,"hotel, new year's eve, witch, bet, sperm, hote...",It's Ted the Bellhop's first night on the job....,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg
3,6,Judgment Night,Stephen Hopkins,109.0,1993-10-15,False,"Action, Crime, Thriller","drug dealer, chicago, illinois, escape, one ni...","While racing to a boxing match, Frank, Mike, J...",/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg
4,8,Life in Loops (A Megacities RMX),Timo Novotny,80.0,2006-01-01,False,Documentary,megacities,Timo Novotny labels his new project an experim...,/x7Sz339F2oC8mBf0DHCQpKizXaL.jpg


In [12]:
movies = movies[['id','title_x','director','runtime_x','release_date_x' ,'adult','genres_y','keywords','overview_y','poster_path_y' ]]

In [13]:
movies.head()

Unnamed: 0,id,title_x,director,runtime_x,release_date_x,adult,genres_y,keywords,overview_y,poster_path_y
0,2,Ariel,Aki Kaurismäki,73.0,1988-10-21,False,"Drama, Comedy, Romance","prison, underdog, helsinki, finland, factory w...",After the coal mine he works at closes and his...,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg
1,3,Shadows in Paradise,Aki Kaurismäki,74.0,1986-10-17,False,"Drama, Comedy, Romance","helsinki, finland, salesclerk, garbage","Nikander, a rubbish collector and would-be ent...",/nj01hspawPof0mJmlgfjuLyJuRN.jpg
2,5,Four Rooms,"Quentin Tarantino, Robert Rodriguez, Allison A...",98.0,1995-12-09,False,Comedy,"hotel, new year's eve, witch, bet, sperm, hote...",It's Ted the Bellhop's first night on the job....,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg
3,6,Judgment Night,Stephen Hopkins,109.0,1993-10-15,False,"Action, Crime, Thriller","drug dealer, chicago, illinois, escape, one ni...","While racing to a boxing match, Frank, Mike, J...",/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg
4,8,Life in Loops (A Megacities RMX),Timo Novotny,80.0,2006-01-01,False,Documentary,megacities,Timo Novotny labels his new project an experim...,/x7Sz339F2oC8mBf0DHCQpKizXaL.jpg


In [14]:
movies.rename(columns = {'title_x': 'title', 'runtime_x': 'runtime', 'release_date_x': 'year' , 'overview_y': 'overview', 'genres_y': 'genres', 'poster_path_y' : 'poster_path' }, inplace = True)

In [15]:
movies['year'] = pd.to_datetime(movies['year'], errors='coerce').dt.year


In [17]:
def format_year(year):
    try:
        if pd.isnull(year):
            return ""
        return str(int(float(year)))
    except Exception:
        return str(year)
    
movies['year'] = movies['year'].apply(format_year)    

In [18]:
movies = movies.drop_duplicates(subset=['title', 'year']).reset_index(drop=True)

In [19]:
movies['title_year'] = movies['title'] + ' (' + movies['year'].astype(str) + ')'

In [20]:
movies.head()

Unnamed: 0,id,title,director,runtime,year,adult,genres,keywords,overview,poster_path,title_year
0,2,Ariel,Aki Kaurismäki,73.0,1988,False,"Drama, Comedy, Romance","prison, underdog, helsinki, finland, factory w...",After the coal mine he works at closes and his...,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,Ariel (1988)
1,3,Shadows in Paradise,Aki Kaurismäki,74.0,1986,False,"Drama, Comedy, Romance","helsinki, finland, salesclerk, garbage","Nikander, a rubbish collector and would-be ent...",/nj01hspawPof0mJmlgfjuLyJuRN.jpg,Shadows in Paradise (1986)
2,5,Four Rooms,"Quentin Tarantino, Robert Rodriguez, Allison A...",98.0,1995,False,Comedy,"hotel, new year's eve, witch, bet, sperm, hote...",It's Ted the Bellhop's first night on the job....,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,Four Rooms (1995)
3,6,Judgment Night,Stephen Hopkins,109.0,1993,False,"Action, Crime, Thriller","drug dealer, chicago, illinois, escape, one ni...","While racing to a boxing match, Frank, Mike, J...",/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg,Judgment Night (1993)
4,8,Life in Loops (A Megacities RMX),Timo Novotny,80.0,2006,False,Documentary,megacities,Timo Novotny labels his new project an experim...,/x7Sz339F2oC8mBf0DHCQpKizXaL.jpg,Life in Loops (A Megacities RMX) (2006)


In [21]:
rated_movies = rated_movies[['user_id', 'film_title', 'rating', 'film_year']]

In [22]:
rated_movies['film_year'] = rated_movies['film_year'].apply(format_year)    

In [23]:
rated_movies['title_year'] = rated_movies['film_title'] + ' (' + rated_movies['film_year'].astype(str) + ')'

In [24]:
rated_movies.head()

Unnamed: 0,user_id,film_title,rating,film_year,title_year
0,user_001,Scream 7,1.0,2026,Scream 7 (2026)
1,user_001,Materialists,,2025,Materialists (2025)
2,user_001,How to Train Your Dragon,6.0,2025,How to Train Your Dragon (2025)
3,user_001,Predator: Killer of Killers,8.0,2025,Predator: Killer of Killers (2025)
4,user_001,Echo Valley,6.0,2025,Echo Valley (2025)


In [25]:
movies = movies.fillna('')

movies['features'] = (
    movies['title'].astype(str) + ' ' +
    movies['director'].astype(str) + ' ' +
    movies['genres'].astype(str) + ' ' +
    movies['keywords'].astype(str) + ' ' +
    movies['overview'].astype(str) 
)

In [26]:
import re

def clean_text(text):
    if pd.isnull(text):
        return ""
    # Remove pontuação e coloca em minúsculas
    text = text.lower()
    text = re.sub(r'[.,]', '', text)
    return text

movies['features'] = movies['features'].apply(clean_text)

In [27]:
movie_name = "Titanic"
resultado = movies[movies['title'] == movie_name]

In [29]:
resultado

Unnamed: 0,id,title,director,runtime,year,adult,genres,keywords,overview,poster_path,title_year,features
471,597,Titanic,James Cameron,194.0,1997.0,False,"Drama, Romance","epic, ship, drowning, panic, shipwreck, evacua...",101-year-old Rose DeWitt Bukater tells the sto...,/9xjZS2rlVxm8SFx8kPC3aIGCOYQ.jpg,Titanic (1997),titanic james cameron drama romance epic ship ...
6052,11021,Titanic,"Werner Klingler, Herbert Selpin",88.0,1943.0,False,"Action, Drama, History","sea, captain, passenger, cruise, iceberg, tita...",This little-known German film retells the true...,/Al7oIXQ4dZAofBTZWm6OiXS3MEa.jpg,Titanic (1943),titanic werner klingler herbert selpin action ...
10006,16535,Titanic,Jean Negulesco,98.0,1953.0,False,"Drama, Romance",titanic,"Unhappily married, Julia Sturges decides to go...",/rEPzO9I6LCk6Mxg1X4BsBk6oA3V.jpg,Titanic (1953),titanic jean negulesco drama romance titanic u...
231685,357517,Titanic,Lutz Büscher,0.0,1984.0,False,Drama,,,/yi73me6Jl3zDelS9pQK5jtMRhsc.jpg,Titanic (1984),titanic lutz büscher drama
309328,455679,Titanic,,0.0,,False,,,,,Titanic (),titanic
536374,760524,Titanic,,0.0,2018.0,False,,,"""Titanic"" is a Punjabi feature film. It is fam...",,Titanic (2018),"titanic ""titanic"" is a punjabi feature film..."
784795,1124589,Titanic,Kim Harrington,55.0,2023.0,False,Documentary,,This documentary explores the incredible histo...,/l7rAG4P16SNFdPmWfTC8ZQydOon.jpg,Titanic (2023),titanic kim harrington documentary this docum...


In [30]:
movies.head()

Unnamed: 0,id,title,director,runtime,year,adult,genres,keywords,overview,poster_path,title_year,features
0,2,Ariel,Aki Kaurismäki,73.0,1988,False,"Drama, Comedy, Romance","prison, underdog, helsinki, finland, factory w...",After the coal mine he works at closes and his...,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,Ariel (1988),ariel aki kaurismäki drama comedy romance pris...
1,3,Shadows in Paradise,Aki Kaurismäki,74.0,1986,False,"Drama, Comedy, Romance","helsinki, finland, salesclerk, garbage","Nikander, a rubbish collector and would-be ent...",/nj01hspawPof0mJmlgfjuLyJuRN.jpg,Shadows in Paradise (1986),shadows in paradise aki kaurismäki drama comed...
2,5,Four Rooms,"Quentin Tarantino, Robert Rodriguez, Allison A...",98.0,1995,False,Comedy,"hotel, new year's eve, witch, bet, sperm, hote...",It's Ted the Bellhop's first night on the job....,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,Four Rooms (1995),four rooms quentin tarantino robert rodriguez ...
3,6,Judgment Night,Stephen Hopkins,109.0,1993,False,"Action, Crime, Thriller","drug dealer, chicago, illinois, escape, one ni...","While racing to a boxing match, Frank, Mike, J...",/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg,Judgment Night (1993),judgment night stephen hopkins action crime th...
4,8,Life in Loops (A Megacities RMX),Timo Novotny,80.0,2006,False,Documentary,megacities,Timo Novotny labels his new project an experim...,/x7Sz339F2oC8mBf0DHCQpKizXaL.jpg,Life in Loops (A Megacities RMX) (2006),life in loops (a megacities rmx) timo novotny ...


In [None]:
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(movies['features'])

knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(X)

In [None]:
def compute_distances(indices, X, knn, title_year, movies, n_recommendations):
    distances, indices = knn.kneighbors(X[indices], n_neighbors=n_recommendations+1)
    return [movies.iloc[i]['title_year'] for i in indices.flatten() if movies.iloc[i]['title_year'] != title_year]

In [27]:
def get_recommendations_by_content(title_year, n_recommendations=40):
    if title_year not in movies['title_year'].values:
        return []
    idx = movies[movies['title_year'] == title_year].index[0]
    distances, indices = knn.kneighbors(X[idx], n_neighbors=n_recommendations+1)
    recommended_title_years = [movies.iloc[i]['title_year'] for i in indices.flatten() if movies.iloc[i]['title_year'] != title_year]
    return recommended_title_years[:n_recommendations]

In [None]:
def get_recommendations_by_content_parallel(title_year, n_recommendations=80):
    if title_year not in movies['title_year'].values:
        return []

    idx = movies[movies['title_year'] == title_year].index[0]

    num_cores = cpu_count()
    indices_split = np.array_split(range(X.shape[0]), num_cores)

    with Pool(num_cores) as pool:
        results = pool.starmap(
            compute_distances,
            [(indices, X, knn, title_year, movies, n_recommendations) for indices in indices_split]
        )

    recommended_title_years = [item for sublist in results for item in sublist]
    return recommended_title_years[:n_recommendations]

In [28]:
def get_recommendations_by_ratings(title_year, n_recommendations=18):
    content_recs = get_recommendations_by_content(title_year, n_recommendations=140)
    if not content_recs:
        return []
    pivot = rated_movies.pivot_table(index='title_year', columns='user_id', values='rating').fillna(0)
    filtered_titles = [title_year] + [rec for rec in content_recs if rec in pivot.index]
    filtered_pivot = pivot.loc[filtered_titles]
    knn_ratings = NearestNeighbors(metric='cosine', algorithm='brute')
    knn_ratings.fit(filtered_pivot.values)
    idx = filtered_pivot.index.get_loc(title_year)
    distances, indices = knn_ratings.kneighbors([filtered_pivot.iloc[idx].values], n_neighbors=min(n_recommendations+1, len(filtered_titles)))
    recommended_title_years = [filtered_pivot.index[i] for i in indices.flatten() if filtered_pivot.index[i] != title_year]
    return recommended_title_years[:n_recommendations]

In [None]:
def get_recommendations_by_ratings_parallel(title_year, n_recommendations=18):
    content_recs = get_recommendations_by_content_parallel(title_year, n_recommendations=140)
    if not content_recs:
        return []

    pivot = rated_movies.pivot_table(index='title_year', columns='user_id', values='rating').fillna(0)
    filtered_titles = [title_year] + [rec for rec in content_recs if rec in pivot.index]
    if len(filtered_titles) <= 1:
        return []

    filtered_pivot = pivot.loc[filtered_titles]

    num_cores = cpu_count()
    indices_split = np.array_split(range(filtered_pivot.shape[0]), num_cores)

    with Pool(num_cores) as pool:
        results = pool.starmap(
            compute_distances,
            [(indices, X, knn, title_year, movies, n_recommendations) for indices in indices_split]
        )

    recommended_title_years = [item for sublist in results for item in sublist]
    return recommended_title_years[:n_recommendations]

In [35]:
content_recs = {}

# Itera sobre todos os filmes no DataFrame `movies`
for title_year in movies['title_year']:
    try:
        # Obtém as recomendações por notas de usuários
        recommendations = get_recommendations_by_content_GPU(title_year, n_recommendations=60, batch_size=5000)
        content_recs [title_year] = recommendations
    except Exception as e:
        print(f"Erro ao calcular recomendações para {title_year}: {e}")
# Salva o dicionário em um arquivo pickle
with open('content_recs.pkl', 'wb') as f:
    pickle.dump(content_recs, f)

Using device: cuda


KeyboardInterrupt: 

In [None]:
rated_movies_with_ratings = rated_movies[rated_movies['rating'] > 0]
movies_with_ratings = movies[movies['title_year'].isin(rated_movies_with_ratings['title_year'])]

In [None]:
# Salvar recomendações por notas de usuários considerando conteúdo primeiro
rating_recs = {}

# Itera sobre todos os filmes no DataFrame `movies`
total_movies = len(movies_with_ratings['title_year'])
for title_year in movies_with_ratings['title_year']:
    try:
        # Obtém as recomendações por notas de usuários
        recommendations = get_recommendations_by_ratings_GPU(title_year, n_recommendations=18)
        rating_recs[title_year] = recommendations
    except Exception as e:
        print(f"Erro ao calcular recomendações para {title_year}: {e}")
    print(f"Processado {idx} de {total_movies} filmes")

# Salva o dicionário em um arquivo pickle
with open('rating_recs.pkl', 'wb') as f:
    pickle.dump(rating_recs, f)

Erro ao calcular recomendações para Ariel (1988): Unable to allocate 5.71 TiB for an array with shape (1078500, 728098) and data type int64
Erro ao calcular recomendações para Shadows in Paradise (1986): Unable to allocate 5.71 TiB for an array with shape (1078500, 728098) and data type int64
Erro ao calcular recomendações para Four Rooms (1995): Unable to allocate 5.71 TiB for an array with shape (1078500, 728098) and data type int64
Erro ao calcular recomendações para Judgment Night (1993): Unable to allocate 5.71 TiB for an array with shape (1078500, 728098) and data type int64
Erro ao calcular recomendações para Life in Loops (A Megacities RMX) (2006): Unable to allocate 5.71 TiB for an array with shape (1078500, 728098) and data type int64
Erro ao calcular recomendações para Sunday in August (2004): Unable to allocate 5.71 TiB for an array with shape (1078500, 728098) and data type int64
Erro ao calcular recomendações para Star Wars (1977): Unable to allocate 5.71 TiB for an array

KeyboardInterrupt: 

In [20]:
print(get_recommendations_by_content('Indiana Jones and the Last Crusade'))

['God Disposes', 'Eternal', 'The Old Cowboy', 'My Father Iqbal', 'Father and Son', 'Hitler: Beast of Berlin', 'Morning Star', 'Ghouls', 'The Cord of Life', 'The Han River', 'Hope in the Holy Land: Delving Beneath the Surface of the Israeli-Palestinian Conflict', 'Blood on the Asphalt', 'The Day of the Crows', 'Heart Beats of Long Ago', 'Consolation', 'Wind Back', 'The Grassland Whisper', 'The Devil in Sofia', 'No Mill No Meal', 'Won in the Fifth', "Majub's Journey", 'Der rote Reiter', 'The Misadventure of a French Gentleman Without Pants at the Zandvoort Beach', 'Glory of Legend', 'Premutos: The Fallen Angel', '家族ケチャップ', 'Fokak Meny', 'An Apple from Paradise', 'The Line Will Break', 'Tempest', 'The Stepmother', 'The Fisher-Maid', 'Summer in the Golden Valley', 'Traveler', 'The New Man', 'Youth on the palm of the imp', 'The Intruder', 'The Eremites', 'Rebels Of The Cities', 'Blood Loss', 'Alarm', 'The Old Man and the Bird', 'The Color of the Sun', 'Lost on the Branch', 'Jim is Fond of G

In [29]:
print(get_recommendations_by_ratings('Titanic (1997)', n_recommendations=6))

['A Night to Remember (1958)', 'Titanic: The Legend Goes On... (2000)', 'United (2011)', 'And the Ship Sails On (1983)', 'The Fabulous Baron Munchausen (1962)', 'Titanic: The Musical (2023)']


In [31]:
print(get_recommendations_by_ratings_GPU('Titanic (1997)', n_recommendations=6))

Using device: cuda
Using device: cuda


TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:


# Salva o DataFrame movies em um arquivo pickle
with open('movies_info.pkl', 'wb') as f:
    pickle.dump(movies, f)
# Salva o DataFrame rated_movies em um arquivo pickle
with open('rated_movies.pkl', 'wb') as f:
    pickle.dump(rated_movies, f)    

In [30]:
import time

# Seleciona um subconjunto de filmes para medir o tempo médio
subset_movies = movies['title_year'][:10]  # Exemplo com 10 filmes
start_time = time.time()

for title_year in subset_movies:
    try:
        recommendations = get_recommendations_by_ratings(title_year, n_recommendations=18)
    except Exception as e:
        print(f"Erro ao calcular recomendações para {title_year}: {e}")

end_time = time.time()

# Calcula o tempo médio por filme
average_time_per_movie = (end_time - start_time) / len(subset_movies)
print(f"Tempo médio por filme: {average_time_per_movie:.2f} segundos")

# Previsão para todos os filmes
total_movies = len(movies['title_year'])
predicted_total_time = average_time_per_movie * total_movies
print(f"Tempo total estimado para {total_movies} filmes: {predicted_total_time:.2f} segundos")

Erro ao calcular recomendações para Life in Loops (A Megacities RMX) (2006): "['Life in Loops (A Megacities RMX) (2006)'] not in index"
Erro ao calcular recomendações para Sunday in August (2004): "['Sunday in August (2004)'] not in index"
Tempo médio por filme: 2.00 segundos
Tempo total estimado para 1078500 filmes: 2158134.14 segundos
